Total coverage: 181800 (15%)of 1239390
95 95 1645 1645 26 26 26 26 22 26 26 26 26 146 374 390 17 374 1605 1604 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/fs.h> #include <linux/path.h> #include <linux/slab.h> #include <linux/fs_struct.h> #include "internal.h" /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. * It can block. */ void set_fs_root(struct fs_struct *fs, const struct path *path) { struct path old_root; path_get(path); write_seqlock(&fs->seq); old_root = fs->root; fs->root = *path; write_sequnlock(&fs->seq); if (old_root.dentry) path_put(&old_root); } /* * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. * It can block. */ void set_fs_pwd(struct fs_struct *fs, const struct path *path) { struct path old_pwd; path_get(path); write_seqlock(&fs->seq); old_pwd = fs->pwd; fs->pwd = *path; write_sequnlock(&fs->seq); if (old_pwd.dentry) path_put(&old_pwd); } static inline int replace_path(struct path *p, const struct path *old, const struct path *new) { if (likely(p->dentry != old->dentry || p->mnt != old->mnt)) return 0; *p = *new; return 1; } void chroot_fs_refs(const struct path *old_root, const struct path *new_root) { struct task_struct *g, *p; struct fs_struct *fs; int count = 0; read_lock(&tasklist_lock); for_each_process_thread(g, p) { task_lock(p); fs = p->fs; if (fs) { int hits = 0; write_seqlock(&fs->seq); hits += replace_path(&fs->root, old_root, new_root); hits += replace_path(&fs->pwd, old_root, new_root); while (hits--) { count++; path_get(new_root); } write_sequnlock(&fs->seq); } task_unlock(p); } read_unlock(&tasklist_lock); while (count--) path_put(old_root); } void free_fs_struct(struct fs_struct *fs) { path_put(&fs->root); path_put(&fs->pwd); kmem_cache_free(fs_cachep, fs); } void exit_fs(struct task_struct *tsk) { struct fs_struct *fs = tsk->fs; if (fs) { int kill; task_lock(tsk); read_seqlock_excl(&fs->seq); tsk->fs = NULL; kill = !--fs->users; read_sequnlock_excl(&fs->seq); task_unlock(tsk); if (kill) free_fs_struct(fs); } } struct fs_struct *copy_fs_struct(struct fs_struct *old) { struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); /* We don't need to lock fs - think why ;-) */ if (fs) { fs->users = 1; fs->in_exec = 0; seqlock_init(&fs->seq); fs->umask = old->umask; read_seqlock_excl(&old->seq); fs->root = old->root; path_get(&fs->root); fs->pwd = old->pwd; path_get(&fs->pwd); read_sequnlock_excl(&old->seq); } return fs; } int unshare_fs_struct(void) { struct fs_struct *fs = current->fs; struct fs_struct *new_fs = copy_fs_struct(fs); int kill; if (!new_fs) return -ENOMEM; task_lock(current); read_seqlock_excl(&fs->seq); kill = !--fs->users; current->fs = new_fs; read_sequnlock_excl(&fs->seq); task_unlock(current); if (kill) free_fs_struct(fs); return 0; } EXPORT_SYMBOL_GPL(unshare_fs_struct); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, .seq = __SEQLOCK_UNLOCKED(init_fs.seq), .umask = 0022, };
196 10 455 80 39 258 74 243 91 389 215 20 425 89 60 43 447 458 70 497 70 497 497 497 17 216 212 213 168 168 406 176 496 497 343 497 216 3 215 216 214 27 204 167 88 49 26 7 21 12 11 4 11 120 3 120 107 4 24 114 94 20 225 2 225 83 12 224 83 5 81 2 168 4 168 133 27 132 497 361 142 216 213 10 45 178 168 168 49 168 182 3 182 164 18 296 15 15 15 15 15 15 15 15 15 13 15 141 362 124 1 275 275 272 225 271 223 53 271 2 2 450 449 63 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 /* * net/tipc/name_table.c: TIPC name table code * * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2008, 2010-2014, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <net/sock.h> #include <linux/list_sort.h> #include <linux/rbtree_augmented.h> #include "core.h" #include "netlink.h" #include "name_table.h" #include "name_distr.h" #include "subscr.h" #include "bcast.h" #include "addr.h" #include "node.h" #include "group.h" /** * struct service_range - container for all bindings of a service range * @lower: service range lower bound * @upper: service range upper bound * @tree_node: member of service range RB tree * @max: largest 'upper' in this node subtree * @local_publ: list of identical publications made from this node * Used by closest_first lookup and multicast lookup algorithm * @all_publ: all publications identical to this one, whatever node and scope * Used by round-robin lookup algorithm */ struct service_range { u32 lower; u32 upper; struct rb_node tree_node; u32 max; struct list_head local_publ; struct list_head all_publ; }; /** * struct tipc_service - container for all published instances of a service type * @type: 32 bit 'type' value for service * @publ_cnt: increasing counter for publications in this service * @ranges: rb tree containing all service ranges for this service * @service_list: links to adjacent name ranges in hash chain * @subscriptions: list of subscriptions for this service type * @lock: spinlock controlling access to pertaining service ranges/publications * @rcu: RCU callback head used for deferred freeing */ struct tipc_service { u32 type; u32 publ_cnt; struct rb_root ranges; struct hlist_node service_list; struct list_head subscriptions; spinlock_t lock; /* Covers service range list */ struct rcu_head rcu; }; #define service_range_upper(sr) ((sr)->upper) RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks, struct service_range, tree_node, u32, max, service_range_upper) #define service_range_entry(rbtree_node) \ (container_of(rbtree_node, struct service_range, tree_node)) #define service_range_overlap(sr, start, end) \ ((sr)->lower <= (end) && (sr)->upper >= (start)) /** * service_range_foreach_match - iterate over tipc service rbtree for each * range match * @sr: the service range pointer as a loop cursor * @sc: the pointer to tipc service which holds the service range rbtree * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching */ #define service_range_foreach_match(sr, sc, start, end) \ for (sr = service_range_match_first((sc)->ranges.rb_node, \ start, \ end); \ sr; \ sr = service_range_match_next(&(sr)->tree_node, \ start, \ end)) /** * service_range_match_first - find first service range matching a range * @n: the root node of service range rbtree for searching * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching * * Return: the leftmost service range node in the rbtree that overlaps the * specific range if any. Otherwise, returns NULL. */ static struct service_range *service_range_match_first(struct rb_node *n, u32 start, u32 end) { struct service_range *sr; struct rb_node *l, *r; /* Non overlaps in tree at all? */ if (!n || service_range_entry(n)->max < start) return NULL; while (n) { l = n->rb_left; if (l && service_range_entry(l)->max >= start) { /* A leftmost overlap range node must be one in the left * subtree. If not, it has lower > end, then nodes on * the right side cannot satisfy the condition either. */ n = l; continue; } /* No one in the left subtree can match, return if this node is * an overlap i.e. leftmost. */ sr = service_range_entry(n); if (service_range_overlap(sr, start, end)) return sr; /* Ok, try to lookup on the right side */ r = n->rb_right; if (sr->lower <= end && r && service_range_entry(r)->max >= start) { n = r; continue; } break; } return NULL; } /** * service_range_match_next - find next service range matching a range * @n: a node in service range rbtree from which the searching starts * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching * * Return: the next service range node to the given node in the rbtree that * overlaps the specific range if any. Otherwise, returns NULL. */ static struct service_range *service_range_match_next(struct rb_node *n, u32 start, u32 end) { struct service_range *sr; struct rb_node *p, *r; while (n) { r = n->rb_right; if (r && service_range_entry(r)->max >= start) /* A next overlap range node must be one in the right * subtree. If not, it has lower > end, then any next * successor (- an ancestor) of this node cannot * satisfy the condition either. */ return service_range_match_first(r, start, end); /* No one in the right subtree can match, go up to find an * ancestor of this node which is parent of a left-hand child. */ while ((p = rb_parent(n)) && n == p->rb_right) n = p; if (!p) break; /* Return if this ancestor is an overlap */ sr = service_range_entry(p); if (service_range_overlap(sr, start, end)) return sr; /* Ok, try to lookup more from this ancestor */ if (sr->lower <= end) { n = p; continue; } break; } return NULL; } static int hash(int x) { return x & (TIPC_NAMETBL_SIZE - 1); } /** * tipc_publ_create - create a publication structure * @ua: the service range the user is binding to * @sk: the address of the socket that is bound * @key: publication key */ static struct publication *tipc_publ_create(struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct publication *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return NULL; p->sr = ua->sr; p->sk = *sk; p->scope = ua->scope; p->key = key; INIT_LIST_HEAD(&p->binding_sock); INIT_LIST_HEAD(&p->binding_node); INIT_LIST_HEAD(&p->local_publ); INIT_LIST_HEAD(&p->all_publ); INIT_LIST_HEAD(&p->list); return p; } /** * tipc_service_create - create a service structure for the specified 'type' * @net: network namespace * @ua: address representing the service to be bound * * Allocates a single range structure and sets it to all 0's. */ static struct tipc_service *tipc_service_create(struct net *net, struct tipc_uaddr *ua) { struct name_table *nt = tipc_name_table(net); struct tipc_service *service; struct hlist_head *hd; service = kzalloc(sizeof(*service), GFP_ATOMIC); if (!service) { pr_warn("Service creation failed, no memory\n"); return NULL; } spin_lock_init(&service->lock); service->type = ua->sr.type; service->ranges = RB_ROOT; INIT_HLIST_NODE(&service->service_list); INIT_LIST_HEAD(&service->subscriptions); hd = &nt->services[hash(ua->sr.type)]; hlist_add_head_rcu(&service->service_list, hd); return service; } /* tipc_service_find_range - find service range matching publication parameters */ static struct service_range *tipc_service_find_range(struct tipc_service *sc, struct tipc_uaddr *ua) { struct service_range *sr; service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { /* Look for exact match */ if (sr->lower == ua->sr.lower && sr->upper == ua->sr.upper) return sr; } return NULL; } static struct service_range *tipc_service_create_range(struct tipc_service *sc, struct publication *p) { struct rb_node **n, *parent = NULL; struct service_range *sr; u32 lower = p->sr.lower; u32 upper = p->sr.upper; n = &sc->ranges.rb_node; while (*n) { parent = *n; sr = service_range_entry(parent); if (lower == sr->lower && upper == sr->upper) return sr; if (sr->max < upper) sr->max = upper; if (lower <= sr->lower) n = &parent->rb_left; else n = &parent->rb_right; } sr = kzalloc(sizeof(*sr), GFP_ATOMIC); if (!sr) return NULL; sr->lower = lower; sr->upper = upper; sr->max = upper; INIT_LIST_HEAD(&sr->local_publ); INIT_LIST_HEAD(&sr->all_publ); rb_link_node(&sr->tree_node, parent, n); rb_insert_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); return sr; } static bool tipc_service_insert_publ(struct net *net, struct tipc_service *sc, struct publication *p) { struct tipc_subscription *sub, *tmp; struct service_range *sr; struct publication *_p; u32 node = p->sk.node; bool first = false; bool res = false; u32 key = p->key; spin_lock_bh(&sc->lock); sr = tipc_service_create_range(sc, p); if (!sr) goto exit; first = list_empty(&sr->all_publ); /* Return if the publication already exists */ list_for_each_entry(_p, &sr->all_publ, all_publ) { if (_p->key == key && (!_p->sk.node || _p->sk.node == node)) { pr_debug("Failed to bind duplicate %u,%u,%u/%u:%u/%u\n", p->sr.type, p->sr.lower, p->sr.upper, node, p->sk.ref, key); goto exit; } } if (in_own_node(net, p->sk.node)) list_add(&p->local_publ, &sr->local_publ); list_add(&p->all_publ, &sr->all_publ); p->id = sc->publ_cnt++; /* Any subscriptions waiting for notification? */ list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, first); } res = true; exit: if (!res) pr_warn("Failed to bind to %u,%u,%u\n", p->sr.type, p->sr.lower, p->sr.upper); spin_unlock_bh(&sc->lock); return res; } /** * tipc_service_remove_publ - remove a publication from a service * @r: service_range to remove publication from * @sk: address publishing socket * @key: target publication key */ static struct publication *tipc_service_remove_publ(struct service_range *r, struct tipc_socket_addr *sk, u32 key) { struct publication *p; u32 node = sk->node; list_for_each_entry(p, &r->all_publ, all_publ) { if (p->key != key || (node && node != p->sk.node)) continue; list_del(&p->all_publ); list_del(&p->local_publ); return p; } return NULL; } /* * Code reused: time_after32() for the same purpose */ #define publication_after(pa, pb) time_after32((pa)->id, (pb)->id) static int tipc_publ_sort(void *priv, const struct list_head *a, const struct list_head *b) { struct publication *pa, *pb; pa = container_of(a, struct publication, list); pb = container_of(b, struct publication, list); return publication_after(pa, pb); } /** * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service * range overlapping with the requested range * @service: the tipc_service to attach the @sub to * @sub: the subscription to attach */ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { struct publication *p, *first, *tmp; struct list_head publ_list; struct service_range *sr; u32 filter, lower, upper; filter = sub->s.filter; lower = sub->s.seq.lower; upper = sub->s.seq.upper; tipc_sub_get(sub); list_add(&sub->service_list, &service->subscriptions); if (filter & TIPC_SUB_NO_STATUS) return; INIT_LIST_HEAD(&publ_list); service_range_foreach_match(sr, service, lower, upper) { first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { if (filter & TIPC_SUB_PORTS) list_add_tail(&p->list, &publ_list); else if (!first || publication_after(first, p)) /* Pick this range's *first* publication */ first = p; } if (first) list_add_tail(&first->list, &publ_list); } /* Sort the publications before reporting */ list_sort(NULL, &publ_list, tipc_publ_sort); list_for_each_entry_safe(p, tmp, &publ_list, list) { tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, true); list_del_init(&p->list); } } static struct tipc_service *tipc_service_find(struct net *net, struct tipc_uaddr *ua) { struct name_table *nt = tipc_name_table(net); struct hlist_head *service_head; struct tipc_service *service; service_head = &nt->services[hash(ua->sr.type)]; hlist_for_each_entry_rcu(service, service_head, service_list) { if (service->type == ua->sr.type) return service; } return NULL; }; struct publication *tipc_nametbl_insert_publ(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct tipc_service *sc; struct publication *p; p = tipc_publ_create(ua, sk, key); if (!p) return NULL; sc = tipc_service_find(net, ua); if (!sc) sc = tipc_service_create(net, ua); if (sc && tipc_service_insert_publ(net, sc, p)) return p; kfree(p); return NULL; } struct publication *tipc_nametbl_remove_publ(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct tipc_subscription *sub, *tmp; struct publication *p = NULL; struct service_range *sr; struct tipc_service *sc; bool last; sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); sr = tipc_service_find_range(sc, ua); if (!sr) goto unlock; p = tipc_service_remove_publ(sr, sk, key); if (!p) goto unlock; /* Notify any waiting subscriptions */ last = list_empty(&sr->all_publ); list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { tipc_sub_report_overlap(sub, p, TIPC_WITHDRAWN, last); } /* Remove service range item if this was its last publication */ if (list_empty(&sr->all_publ)) { rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } /* Delete service item if no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } unlock: spin_unlock_bh(&sc->lock); exit: if (!p) { pr_err("Failed to remove unknown binding: %u,%u,%u/%u:%u/%u\n", ua->sr.type, ua->sr.lower, ua->sr.upper, sk->node, sk->ref, key); } return p; } /** * tipc_nametbl_lookup_anycast - perform service instance to socket translation * @net: network namespace * @ua: service address to look up * @sk: address to socket we want to find * * On entry, a non-zero 'sk->node' indicates the node where we want lookup to be * performed, which may not be this one. * * On exit: * * - If lookup is deferred to another node, leave 'sk->node' unchanged and * return 'true'. * - If lookup is successful, set the 'sk->node' and 'sk->ref' (== portid) which * represent the bound socket and return 'true'. * - If lookup fails, return 'false' * * Note that for legacy users (node configured with Z.C.N address format) the * 'closest-first' lookup algorithm must be maintained, i.e., if sk.node is 0 * we must look in the local binding list first */ bool tipc_nametbl_lookup_anycast(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk) { struct tipc_net *tn = tipc_net(net); bool legacy = tn->legacy_addr_format; u32 self = tipc_own_addr(net); u32 inst = ua->sa.instance; struct service_range *r; struct tipc_service *sc; struct publication *p; struct list_head *l; bool res = false; if (!tipc_in_scope(legacy, sk->node, self)) return true; rcu_read_lock(); sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(r, sc, inst, inst) { /* Select lookup algo: local, closest-first or round-robin */ if (sk->node == self) { l = &r->local_publ; if (list_empty(l)) continue; p = list_first_entry(l, struct publication, local_publ); list_move_tail(&p->local_publ, &r->local_publ); } else if (legacy && !sk->node && !list_empty(&r->local_publ)) { l = &r->local_publ; p = list_first_entry(l, struct publication, local_publ); list_move_tail(&p->local_publ, &r->local_publ); } else { l = &r->all_publ; p = list_first_entry(l, struct publication, all_publ); list_move_tail(&p->all_publ, &r->all_publ); } *sk = p->sk; res = true; /* Todo: as for legacy, pick the first matching range only, a * "true" round-robin will be performed as needed. */ break; } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return res; } /* tipc_nametbl_lookup_group(): lookup destinaton(s) in a communication group * Returns a list of one (== group anycast) or more (== group multicast) * destination socket/node pairs matching the given address. * The requester may or may not want to exclude himself from the list. */ bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua, struct list_head *dsts, int *dstcnt, u32 exclude, bool mcast) { u32 self = tipc_own_addr(net); u32 inst = ua->sa.instance; struct service_range *sr; struct tipc_service *sc; struct publication *p; *dstcnt = 0; rcu_read_lock(); sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); /* Todo: a full search i.e. service_range_foreach_match() instead? */ sr = service_range_match_first(sc->ranges.rb_node, inst, inst); if (!sr) goto no_match; list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != ua->scope) continue; if (p->sk.ref == exclude && p->sk.node == self) continue; tipc_dest_push(dsts, p->sk.node, p->sk.ref); (*dstcnt)++; if (mcast) continue; list_move_tail(&p->all_publ, &sr->all_publ); break; } no_match: spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return !list_empty(dsts); } /* tipc_nametbl_lookup_mcast_sockets(): look up node local destinaton sockets * matching the given address * Used on nodes which have received a multicast/broadcast message * Returns a list of local sockets */ void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, struct list_head *dports) { struct service_range *sr; struct tipc_service *sc; struct publication *p; u8 scope = ua->scope; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->local_publ, local_publ) { if (scope == p->scope || scope == TIPC_ANY_SCOPE) tipc_dest_push(dports, 0, p->sk.ref); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_lookup_mcast_nodes(): look up all destination nodes matching * the given address. Used in sending node. * Used on nodes which are sending out a multicast/broadcast message * Returns a list of nodes, including own node if applicable */ void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua, struct tipc_nlist *nodes) { struct service_range *sr; struct tipc_service *sc; struct publication *p; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->all_publ, all_publ) { tipc_nlist_add(nodes, p->sk.node); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_build_group - build list of communication group members */ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, struct tipc_uaddr *ua) { struct service_range *sr; struct tipc_service *sc; struct publication *p; struct rb_node *n; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != ua->scope) continue; tipc_group_add_member(grp, p->sk.node, p->sk.ref, p->sr.lower); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_publish - add service binding to name table */ struct publication *tipc_nametbl_publish(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct publication *p = NULL; struct sk_buff *skb = NULL; u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); if (nt->local_publ_count >= TIPC_MAX_PUBL) { pr_warn("Bind failed, max limit %u reached\n", TIPC_MAX_PUBL); goto exit; } p = tipc_nametbl_insert_publ(net, ua, sk, key); if (p) { nt->local_publ_count++; skb = tipc_named_publish(net, p); } rc_dests = nt->rc_dests; exit: spin_unlock_bh(&tn->nametbl_lock); if (skb) tipc_node_broadcast(net, skb, rc_dests); return p; } /** * tipc_nametbl_withdraw - withdraw a service binding * @net: network namespace * @ua: service address/range being unbound * @sk: address of the socket being unbound from * @key: target publication key */ void tipc_nametbl_withdraw(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct sk_buff *skb = NULL; struct publication *p; u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); p = tipc_nametbl_remove_publ(net, ua, sk, key); if (p) { nt->local_publ_count--; skb = tipc_named_withdraw(net, p); list_del_init(&p->binding_sock); kfree_rcu(p, rcu); } rc_dests = nt->rc_dests; spin_unlock_bh(&tn->nametbl_lock); if (skb) tipc_node_broadcast(net, skb, rc_dests); } /** * tipc_nametbl_subscribe - add a subscription object to the name table * @sub: subscription to add */ bool tipc_nametbl_subscribe(struct tipc_subscription *sub) { struct tipc_net *tn = tipc_net(sub->net); u32 type = sub->s.seq.type; struct tipc_service *sc; struct tipc_uaddr ua; bool res = true; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, type, sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); sc = tipc_service_find(sub->net, &ua); if (!sc) sc = tipc_service_create(sub->net, &ua); if (sc) { spin_lock_bh(&sc->lock); tipc_service_subscribe(sc, sub); spin_unlock_bh(&sc->lock); } else { pr_warn("Failed to subscribe for {%u,%u,%u}\n", type, sub->s.seq.lower, sub->s.seq.upper); res = false; } spin_unlock_bh(&tn->nametbl_lock); return res; } /** * tipc_nametbl_unsubscribe - remove a subscription object from name table * @sub: subscription to remove */ void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) { struct tipc_net *tn = tipc_net(sub->net); struct tipc_service *sc; struct tipc_uaddr ua; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, sub->s.seq.type, sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); sc = tipc_service_find(sub->net, &ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); list_del_init(&sub->service_list); tipc_sub_put(sub); /* Delete service item if no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } spin_unlock_bh(&sc->lock); exit: spin_unlock_bh(&tn->nametbl_lock); } int tipc_nametbl_init(struct net *net) { struct tipc_net *tn = tipc_net(net); struct name_table *nt; int i; nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) return -ENOMEM; for (i = 0; i < TIPC_NAMETBL_SIZE; i++) INIT_HLIST_HEAD(&nt->services[i]); INIT_LIST_HEAD(&nt->node_scope); INIT_LIST_HEAD(&nt->cluster_scope); rwlock_init(&nt->cluster_scope_lock); tn->nametbl = nt; spin_lock_init(&tn->nametbl_lock); return 0; } /** * tipc_service_delete - purge all publications for a service and delete it * @net: the associated network namespace * @sc: tipc_service to delete */ static void tipc_service_delete(struct net *net, struct tipc_service *sc) { struct service_range *sr, *tmpr; struct publication *p, *tmp; spin_lock_bh(&sc->lock); rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) { list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) { tipc_service_remove_publ(sr, &p->sk, p->key); kfree_rcu(p, rcu); } rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } hlist_del_init_rcu(&sc->service_list); spin_unlock_bh(&sc->lock); kfree_rcu(sc, rcu); } void tipc_nametbl_stop(struct net *net) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct hlist_head *service_head; struct tipc_service *service; u32 i; /* Verify name table is empty and purge any lingering * publications, then release the name table */ spin_lock_bh(&tn->nametbl_lock); for (i = 0; i < TIPC_NAMETBL_SIZE; i++) { if (hlist_empty(&nt->services[i])) continue; service_head = &nt->services[i]; hlist_for_each_entry_rcu(service, service_head, service_list) { tipc_service_delete(net, service); } } spin_unlock_bh(&tn->nametbl_lock); /* TODO: clear tn->nametbl, implement proper RCU rules ? */ kfree_rcu(nt, rcu); } static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, struct tipc_service *service, struct service_range *sr, u32 *last_key) { struct publication *p; struct nlattr *attrs; struct nlattr *b; void *hdr; if (*last_key) { list_for_each_entry(p, &sr->all_publ, all_publ) if (p->key == *last_key) break; if (list_entry_is_head(p, &sr->all_publ, all_publ)) return -EPIPE; } else { p = list_first_entry(&sr->all_publ, struct publication, all_publ); } list_for_each_entry_from(p, &sr->all_publ, all_publ) { *last_key = p->key; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NAME_TABLE_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE); if (!attrs) goto msg_full; b = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE_PUBL); if (!b) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, service->type)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sr->lower)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sr->upper)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->sk.node)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->sk.ref)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key)) goto publ_msg_full; nla_nest_end(msg->skb, b); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); } *last_key = 0; return 0; publ_msg_full: nla_nest_cancel(msg->skb, b); attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static int __tipc_nl_service_range_list(struct tipc_nl_msg *msg, struct tipc_service *sc, u32 *last_lower, u32 *last_key) { struct service_range *sr; struct rb_node *n; int err; for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); if (sr->lower < *last_lower) continue; err = __tipc_nl_add_nametable_publ(msg, sc, sr, last_key); if (err) { *last_lower = sr->lower; return err; } } *last_lower = 0; return 0; } static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg, u32 *last_type, u32 *last_lower, u32 *last_key) { struct tipc_net *tn = tipc_net(net); struct tipc_service *service = NULL; struct hlist_head *head; struct tipc_uaddr ua; int err; int i; if (*last_type) i = hash(*last_type); else i = 0; for (; i < TIPC_NAMETBL_SIZE; i++) { head = &tn->nametbl->services[i]; if (*last_type || (!i && *last_key && (*last_lower == *last_key))) { tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, *last_type, *last_lower, *last_lower); service = tipc_service_find(net, &ua); if (!service) return -EPIPE; } else { hlist_for_each_entry_rcu(service, head, service_list) break; if (!service) continue; } hlist_for_each_entry_from_rcu(service, service_list) { spin_lock_bh(&service->lock); err = __tipc_nl_service_range_list(msg, service, last_lower, last_key); if (err) { *last_type = service->type; spin_unlock_bh(&service->lock); return err; } spin_unlock_bh(&service->lock); } *last_type = 0; } return 0; } int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 last_type = cb->args[0]; u32 last_lower = cb->args[1]; u32 last_key = cb->args[2]; int done = cb->args[3]; struct tipc_nl_msg msg; int err; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); err = tipc_nl_service_list(net, &msg, &last_type, &last_lower, &last_key); if (!err) { done = 1; } else if (err != -EMSGSIZE) { /* We never set seq or call nl_dump_check_consistent() this * means that setting prev_seq here will cause the consistence * check to fail in the netlink callback handler. Resulting in * the NLMSG_DONE message having the NLM_F_DUMP_INTR flag set if * we got an error. */ cb->prev_seq = 1; } rcu_read_unlock(); cb->args[0] = last_type; cb->args[1] = last_lower; cb->args[2] = last_key; cb->args[3] = done; return skb->len; } struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; list_for_each_entry(dst, l, list) { if (dst->node == node && dst->port == port) return dst; } return NULL; } bool tipc_dest_push(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; if (tipc_dest_find(l, node, port)) return false; dst = kmalloc(sizeof(*dst), GFP_ATOMIC); if (unlikely(!dst)) return false; dst->node = node; dst->port = port; list_add(&dst->list, l); return true; } bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port) { struct tipc_dest *dst; if (list_empty(l)) return false; dst = list_first_entry(l, typeof(*dst), list); if (port) *port = dst->port; if (node) *node = dst->node; list_del(&dst->list); kfree(dst); return true; } bool tipc_dest_del(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; dst = tipc_dest_find(l, node, port); if (!dst) return false; list_del(&dst->list); kfree(dst); return true; } void tipc_dest_list_purge(struct list_head *l) { struct tipc_dest *dst, *tmp; list_for_each_entry_safe(dst, tmp, l, list) { list_del(&dst->list); kfree(dst); } }
16830 23 2 16717 24 24 22 3 24 58 48 174 86 86 86 198 6 1 91 1 86 218 217 198 90 218 8 8 6 3 540 540 7 16714 16712 16723 16710 16635 95 24 4 16712 2 16706 16712 16720 1 1 16709 16713 16712 282 280 1 1 282 16642 16638 1 1 16690 216 16690 281 16661 16698 8526 8524 8521 2 8516 5657 5658 5575 5575 5573 1 1 4 1 4 4 4 2 1 3 4 179 181 16714 16710 16717 16714 16714 47 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 // SPDX-License-Identifier: GPL-2.0-or-later /* audit.c -- Auditing support * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c * * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * Written by Rickard E. (Rik) Faith <faith@redhat.com> * * Goals: 1) Integrate fully with Security Modules. * 2) Minimal run-time overhead: * a) Minimal when syscall auditing is disabled (audit_enable=0). * b) Small when syscall auditing is enabled and no audit record * is generated (defer as much work as possible to record * generation time): * i) context is allocated, * ii) names from getname are stored without a copy, and * iii) inode information stored from path_lookup. * 3) Ability to disable syscall auditing at boot time (audit=0). * 4) Usable by other parts of the kernel (if audit_log* is called, * then a syscall record will be generated automatically for the * current syscall). * 5) Netlink interface to user-space. * 6) Support low-overhead kernel-based filtering to minimize the * information that must be passed to user-space. * * Audit userspace, documentation, tests, and bug/issue trackers: * https://github.com/linux-audit */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/file.h> #include <linux/init.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/kthread.h> #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/pid.h> #include <linux/audit.h> #include <net/sock.h> #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/security.h> #include <linux/lsm_hooks.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <net/netns/generic.h> #include "audit.h" /* No auditing will take place until audit_initialized == AUDIT_INITIALIZED. * (Initialization happens after skb_init is called.) */ #define AUDIT_DISABLED -1 #define AUDIT_UNINITIALIZED 0 #define AUDIT_INITIALIZED 1 static int audit_initialized = AUDIT_UNINITIALIZED; u32 audit_enabled = AUDIT_OFF; bool audit_ever_enabled = !!AUDIT_OFF; EXPORT_SYMBOL_GPL(audit_enabled); /* Default state when kernel boots without any parameters. */ static u32 audit_default = AUDIT_OFF; /* If auditing cannot proceed, audit_failure selects what happens. */ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; /* Number of modules that provide a security context. List of lsms that provide a security context */ static u32 audit_subj_secctx_cnt; static u32 audit_obj_secctx_cnt; static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT]; /** * struct audit_net - audit private network namespace data * @sk: communication socket */ struct audit_net { struct sock *sk; }; /** * struct auditd_connection - kernel/auditd connection state * @pid: auditd PID * @portid: netlink portid * @net: the associated network namespace * @rcu: RCU head * * Description: * This struct is RCU protected; you must either hold the RCU lock for reading * or the associated spinlock for writing. */ struct auditd_connection { struct pid *pid; u32 portid; struct net *net; struct rcu_head rcu; }; static struct auditd_connection __rcu *auditd_conn; static DEFINE_SPINLOCK(auditd_conn_lock); /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in * audit records being dropped. */ static u32 audit_rate_limit; /* Number of outstanding audit_buffers allowed. * When set to zero, this means unlimited. */ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ static kuid_t audit_sig_uid = INVALID_UID; static pid_t audit_sig_pid = -1; static struct lsm_prop audit_sig_lsm; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] 2) out of memory in audit_log_move [alloc_skb] 3) suppressed due to audit_rate_limit 4) suppressed due to audit_backlog_limit */ static atomic_t audit_lost = ATOMIC_INIT(0); /* Monotonically increasing sum of time the kernel has spent * waiting while the backlog limit is exceeded. */ static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0); /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; static struct kmem_cache *audit_buffer_cache; /* queue msgs to send via kauditd_task */ static struct sk_buff_head audit_queue; /* queue msgs due to temporary unicast send problems */ static struct sk_buff_head audit_retry_queue; /* queue msgs waiting for new auditd connection */ static struct sk_buff_head audit_hold_queue; /* queue servicing thread */ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); /* waitqueue for callers who are blocked on the audit backlog */ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, .mask = -1, .features = 0, .lock = 0,}; static char *audit_feature_names[2] = { "only_unset_loginuid", "loginuid_immutable", }; /** * struct audit_ctl_mutex - serialize requests from userspace * @lock: the mutex used for locking * @owner: the task which owns the lock * * Description: * This is the lock struct used to ensure we only process userspace requests * in an orderly fashion. We can't simply use a mutex/lock here because we * need to track lock ownership so we don't end up blocking the lock owner in * audit_log_start() or similar. */ static struct audit_ctl_mutex { struct mutex lock; void *owner; } audit_cmd_mutex; /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer * should be at least that large. */ #define AUDIT_BUFSIZ 1024 /* The audit_buffer is used when formatting an audit record. The caller * locks briefly to get the record off the freelist or to allocate the * buffer, and locks briefly to send the buffer to the netlink layer or * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { struct sk_buff *skb; /* the skb for audit_log functions */ struct sk_buff_head skb_list; /* formatted skbs, ready to send */ struct audit_context *ctx; /* NULL or associated context */ struct audit_stamp stamp; /* audit stamp for these records */ gfp_t gfp_mask; }; struct audit_reply { __u32 portid; struct net *net; struct sk_buff *skb; }; /** * auditd_test_task - Check to see if a given task is an audit daemon * @task: the task to check * * Description: * Return 1 if the task is a registered audit daemon, 0 otherwise. */ int auditd_test_task(struct task_struct *task) { int rc; struct auditd_connection *ac; rcu_read_lock(); ac = rcu_dereference(auditd_conn); rc = (ac && ac->pid == task_tgid(task) ? 1 : 0); rcu_read_unlock(); return rc; } /** * audit_ctl_lock - Take the audit control lock */ void audit_ctl_lock(void) { mutex_lock(&audit_cmd_mutex.lock); audit_cmd_mutex.owner = current; } /** * audit_ctl_unlock - Drop the audit control lock */ void audit_ctl_unlock(void) { audit_cmd_mutex.owner = NULL; mutex_unlock(&audit_cmd_mutex.lock); } /** * audit_ctl_owner_current - Test to see if the current task owns the lock * * Description: * Return true if the current task owns the audit control lock, false if it * doesn't own the lock. */ static bool audit_ctl_owner_current(void) { return (current == audit_cmd_mutex.owner); } /** * auditd_pid_vnr - Return the auditd PID relative to the namespace * * Description: * Returns the PID in relation to the namespace, 0 on failure. */ static pid_t auditd_pid_vnr(void) { pid_t pid; const struct auditd_connection *ac; rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac || !ac->pid) pid = 0; else pid = pid_vnr(ac->pid); rcu_read_unlock(); return pid; } /** * audit_cfg_lsm - Identify a security module as providing a secctx. * @lsmid: LSM identity * @flags: which contexts are provided * * Description: * Increments the count of the security modules providing a secctx. * If the LSM id is already in the list leave it alone. */ void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) { int i; if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) { for (i = 0 ; i < audit_subj_secctx_cnt; i++) if (audit_subj_lsms[i] == lsmid) return; audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; } if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) { for (i = 0 ; i < audit_obj_secctx_cnt; i++) if (audit_obj_lsms[i] == lsmid) return; audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid; } } /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace * * Description: * Returns the sock pointer if valid, NULL otherwise. The caller must ensure * that a reference is held for the network namespace while the sock is in use. */ static struct sock *audit_get_sk(const struct net *net) { struct audit_net *aunet; if (!net) return NULL; aunet = net_generic(net, audit_net_id); return aunet->sk; } void audit_panic(const char *message) { switch (audit_failure) { case AUDIT_FAIL_SILENT: break; case AUDIT_FAIL_PRINTK: if (printk_ratelimit()) pr_err("%s\n", message); break; case AUDIT_FAIL_PANIC: panic("audit: %s\n", message); break; } } static inline int audit_rate_check(void) { static unsigned long last_check = 0; static int messages = 0; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; int retval = 0; if (!audit_rate_limit) return 1; spin_lock_irqsave(&lock, flags); if (++messages < audit_rate_limit) { retval = 1; } else { now = jiffies; if (time_after(now, last_check + HZ)) { last_check = now; messages = 0; retval = 1; } } spin_unlock_irqrestore(&lock, flags); return retval; } /** * audit_log_lost - conditionally log lost audit message event * @message: the message stating reason for lost audit message * * Emit at least 1 message per second, even if audit_rate_check is * throttling. * Always increment the lost messages counter. */ void audit_log_lost(const char *message) { static unsigned long last_msg = 0; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; int print; atomic_inc(&audit_lost); print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); if (!print) { spin_lock_irqsave(&lock, flags); now = jiffies; if (time_after(now, last_msg + HZ)) { print = 1; last_msg = now; } spin_unlock_irqrestore(&lock, flags); } if (print) { if (printk_ratelimit()) pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n", atomic_read(&audit_lost), audit_rate_limit, audit_backlog_limit); audit_panic(message); } } static int audit_log_config_change(char *function_name, u32 new, u32 old, int allow_changes) { struct audit_buffer *ab; int rc = 0; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old); audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) allow_changes = 0; /* Something weird, deny request */ audit_log_format(ab, " res=%d", allow_changes); audit_log_end(ab); return rc; } static int audit_do_config_change(char *function_name, u32 *to_change, u32 new) { int allow_changes, rc = 0; u32 old = *to_change; /* check if we are locked */ if (audit_enabled == AUDIT_LOCKED) allow_changes = 0; else allow_changes = 1; if (audit_enabled != AUDIT_OFF) { rc = audit_log_config_change(function_name, new, old, allow_changes); if (rc) allow_changes = 0; } /* If we are allowed, make the change */ if (allow_changes == 1) *to_change = new; /* Not allowed, update reason */ else if (rc == 0) rc = -EPERM; return rc; } static int audit_set_rate_limit(u32 limit) { return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); } static int audit_set_backlog_limit(u32 limit) { return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); } static int audit_set_backlog_wait_time(u32 timeout) { return audit_do_config_change("audit_backlog_wait_time", &audit_backlog_wait_time, timeout); } static int audit_set_enabled(u32 state) { int rc; if (state > AUDIT_LOCKED) return -EINVAL; rc = audit_do_config_change("audit_enabled", &audit_enabled, state); if (!rc) audit_ever_enabled |= !!state; return rc; } static int audit_set_failure(u32 state) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK && state != AUDIT_FAIL_PANIC) return -EINVAL; return audit_do_config_change("audit_failure", &audit_failure, state); } /** * auditd_conn_free - RCU helper to release an auditd connection struct * @rcu: RCU head * * Description: * Drop any references inside the auditd connection tracking struct and free * the memory. */ static void auditd_conn_free(struct rcu_head *rcu) { struct auditd_connection *ac; ac = container_of(rcu, struct auditd_connection, rcu); put_pid(ac->pid); put_net(ac->net); kfree(ac); } /** * auditd_set - Set/Reset the auditd connection state * @pid: auditd PID * @portid: auditd netlink portid * @net: auditd network namespace pointer * @skb: the netlink command from the audit daemon * @ack: netlink ack flag, cleared if ack'd here * * Description: * This function will obtain and drop network namespace references as * necessary. Returns zero on success, negative values on failure. */ static int auditd_set(struct pid *pid, u32 portid, struct net *net, struct sk_buff *skb, bool *ack) { unsigned long flags; struct auditd_connection *ac_old, *ac_new; struct nlmsghdr *nlh; if (!pid || !net) return -EINVAL; ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL); if (!ac_new) return -ENOMEM; ac_new->pid = get_pid(pid); ac_new->portid = portid; ac_new->net = get_net(net); /* send the ack now to avoid a race with the queue backlog */ if (*ack) { nlh = nlmsg_hdr(skb); netlink_ack(skb, nlh, 0, NULL); *ack = false; } spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); rcu_assign_pointer(auditd_conn, ac_new); spin_unlock_irqrestore(&auditd_conn_lock, flags); if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); return 0; } /** * kauditd_printk_skb - Print the audit record to the ring buffer * @skb: audit record * * Whatever the reason, this packet may not make it to the auditd connection * so write it via printk so the information isn't completely lost. */ static void kauditd_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); char *data = nlmsg_data(nlh); if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit()) pr_notice("type=%d %s\n", nlh->nlmsg_type, data); } /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. */ static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { /* put the record back in the queue */ skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. When this * function is called we haven't given up yet on sending the record, but things * are not looking good. The first thing we want to do is try to write the * record via printk and then see if we want to try and hold on to the record * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. */ static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? */ if (!audit_default) goto drop; /* the hold queue is only for when the daemon goes away completely, * not -EAGAIN failures; if we are in a -EAGAIN state requeue the * record on the retry queue unless it's full, in which case drop it */ if (error == -EAGAIN) { if (!audit_backlog_limit || skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { skb_queue_tail(&audit_retry_queue, skb); return; } audit_log_lost("kauditd retry queue overflow"); goto drop; } /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); return; } /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. */ static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { if (!audit_backlog_limit || skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { skb_queue_tail(&audit_retry_queue, skb); return; } /* we have to drop the record, send it via printk as a last effort */ kauditd_printk_skb(skb); audit_log_lost("kauditd retry queue overflow"); kfree_skb(skb); } /** * auditd_reset - Disconnect the auditd connection * @ac: auditd connection state * * Description: * Break the auditd/kauditd connection and move all the queued records into the * hold queue in case auditd reconnects. It is important to note that the @ac * pointer should never be dereferenced inside this function as it may be NULL * or invalid, you can only compare the memory address! If @ac is NULL then * the connection will always be reset. */ static void auditd_reset(const struct auditd_connection *ac) { unsigned long flags; struct sk_buff *skb; struct auditd_connection *ac_old; /* if it isn't already broken, break the connection */ spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); if (ac && ac != ac_old) { /* someone already registered a new auditd connection */ spin_unlock_irqrestore(&auditd_conn_lock, flags); return; } rcu_assign_pointer(auditd_conn, NULL); spin_unlock_irqrestore(&auditd_conn_lock, flags); if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) kauditd_hold_skb(skb, -ECONNREFUSED); } /** * auditd_send_unicast_skb - Send a record via unicast to auditd * @skb: audit record * * Description: * Send a skb to the audit daemon, returns positive/zero values on success and * negative values on failure; in all cases the skb will be consumed by this * function. If the send results in -ECONNREFUSED the connection with auditd * will be reset. This function may sleep so callers should not hold any locks * where this would cause a problem. */ static int auditd_send_unicast_skb(struct sk_buff *skb) { int rc; u32 portid; struct net *net; struct sock *sk; struct auditd_connection *ac; /* NOTE: we can't call netlink_unicast while in the RCU section so * take a reference to the network namespace and grab local * copies of the namespace, the sock, and the portid; the * namespace and sock aren't going to go away while we hold a * reference and if the portid does become invalid after the RCU * section netlink_unicast() should safely return an error */ rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); kfree_skb(skb); rc = -ECONNREFUSED; goto err; } net = get_net(ac->net); sk = audit_get_sk(net); portid = ac->portid; rcu_read_unlock(); rc = netlink_unicast(sk, skb, portid, 0); put_net(net); if (rc < 0) goto err; return rc; err: if (ac && rc == -ECONNREFUSED) auditd_reset(ac); return rc; } /** * kauditd_send_queue - Helper for kauditd_thread to flush skb queues * @sk: the sending sock * @portid: the netlink destination * @queue: the skb queue to process * @retry_limit: limit on number of netlink unicast failures * @skb_hook: per-skb hook for additional processing * @err_hook: hook called if the skb fails the netlink unicast send * * Description: * Run through the given queue and attempt to send the audit records to auditd, * returns zero on success, negative values on failure. It is up to the caller * to ensure that the @sk is valid for the duration of this function. * */ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; struct sk_buff *skb = NULL; struct sk_buff *skb_tail; unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ skb_tail = skb_peek_tail(queue); while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); /* can we send to anyone via unicast? */ if (!sk) { if (err_hook) (*err_hook)(skb, -ECONNREFUSED); continue; } retry: /* grab an extra skb reference in case of error */ skb_get(skb); rc = netlink_unicast(sk, skb, portid, 0); if (rc < 0) { /* send failed - try a few times unless fatal error */ if (++failed >= retry_limit || rc == -ECONNREFUSED || rc == -EPERM) { sk = NULL; if (err_hook) (*err_hook)(skb, rc); if (rc == -EAGAIN) rc = 0; /* continue to drain the queue */ continue; } else goto retry; } else { /* skb sent - drop the extra reference and continue */ consume_skb(skb); failed = 0; } } return (rc >= 0 ? 0 : rc); } /* * kauditd_send_multicast_skb - Send a record to any multicast listeners * @skb: audit record * * Description: * Write a multicast message to anyone listening in the initial network * namespace. This function doesn't consume an skb as might be expected since * it has to copy it anyways. */ static void kauditd_send_multicast_skb(struct sk_buff *skb) { struct sk_buff *copy; struct sock *sock = audit_get_sk(&init_net); struct nlmsghdr *nlh; /* NOTE: we are not taking an additional reference for init_net since * we don't have to worry about it going away */ if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) return; /* * The seemingly wasteful skb_copy() rather than bumping the refcount * using skb_get() is necessary because non-standard mods are made to * the skb by the original kaudit unicast socket send routine. The * existing auditd daemon assumes this breakage. Fixing this would * require co-ordinating a change in the established protocol between * the kaudit kernel subsystem and the auditd userspace code. There is * no reason for new multicast clients to continue with this * non-compliance. */ copy = skb_copy(skb, GFP_KERNEL); if (!copy) return; nlh = nlmsg_hdr(copy); nlh->nlmsg_len = skb->len; nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); } /** * kauditd_thread - Worker thread to send audit records to userspace * @dummy: unused */ static int kauditd_thread(void *dummy) { int rc; u32 portid = 0; struct net *net = NULL; struct sock *sk = NULL; struct auditd_connection *ac; #define UNICAST_RETRIES 5 set_freezable(); while (!kthread_should_stop()) { /* NOTE: see the lock comments in auditd_send_unicast_skb() */ rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); goto main_queue; } net = get_net(ac->net); sk = audit_get_sk(net); portid = ac->portid; rcu_read_unlock(); /* attempt to flush the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_hold_queue, UNICAST_RETRIES, NULL, kauditd_rehold_skb); if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; } /* attempt to flush the retry queue */ rc = kauditd_send_queue(sk, portid, &audit_retry_queue, UNICAST_RETRIES, NULL, kauditd_hold_skb); if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; } main_queue: /* process the main queue - do the multicast send and attempt * unicast, dump failed record sends to the retry queue; if * sk == NULL due to previous failures we will just do the * multicast send and move the record to the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_queue, 1, kauditd_send_multicast_skb, (sk ? kauditd_retry_skb : kauditd_hold_skb)); if (ac && rc < 0) auditd_reset(ac); sk = NULL; /* drop our netns reference, no auditd sends past this line */ if (net) { put_net(net); net = NULL; } /* we have processed all the queues so wake everyone */ wake_up(&audit_backlog_wait); /* NOTE: we want to wake up if there is anything on the queue, * regardless of if an auditd is connected, as we need to * do the multicast send and rotate records from the * main queue to the retry/hold queues */ wait_event_freezable(kauditd_wait, (skb_queue_len(&audit_queue) ? 1 : 0)); } return 0; } int audit_send_list_thread(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; struct sock *sk = audit_get_sk(dest->net); /* wait for parent to finish and send an ACK */ audit_ctl_lock(); audit_ctl_unlock(); while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(sk, skb, dest->portid, 0); put_net(dest->net); kfree(dest); return 0; } struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size) { struct sk_buff *skb; struct nlmsghdr *nlh; void *data; int flags = multi ? NLM_F_MULTI : 0; int t = done ? NLMSG_DONE : type; skb = nlmsg_new(size, GFP_KERNEL); if (!skb) return NULL; nlh = nlmsg_put(skb, 0, seq, t, size, flags); if (!nlh) goto out_kfree_skb; data = nlmsg_data(nlh); memcpy(data, payload, size); return skb; out_kfree_skb: kfree_skb(skb); return NULL; } static void audit_free_reply(struct audit_reply *reply) { if (!reply) return; kfree_skb(reply->skb); if (reply->net) put_net(reply->net); kfree(reply); } static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; audit_ctl_lock(); audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0); reply->skb = NULL; audit_free_reply(reply); return 0; } /** * audit_send_reply - send an audit reply message via netlink * @request_skb: skb of request we are replying to (used to target the reply) * @seq: sequence number * @type: audit message type * @done: done (last) flag * @multi: multi-part message flag * @payload: payload data * @size: payload size * * Allocates a skb, builds the netlink message, and sends it to the port id. */ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { struct task_struct *tsk; struct audit_reply *reply; reply = kzalloc(sizeof(*reply), GFP_KERNEL); if (!reply) return; reply->skb = audit_make_reply(seq, type, done, multi, payload, size); if (!reply->skb) goto err; reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); reply->portid = NETLINK_CB(request_skb).portid; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); if (IS_ERR(tsk)) goto err; return; err: audit_free_reply(reply); } /* * Check for appropriate CAP_AUDIT_ capabilities on incoming audit * control messages. */ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; /* Only support initial user namespace for now. */ /* * We return ECONNREFUSED because it tricks userspace into thinking * that audit was not configured into the kernel. Lots of users * configure their PAM stack (because that's what the distro does) * to reject login if unable to send messages to audit. If we return * ECONNREFUSED the PAM stack thinks the kernel does not have audit * configured in and will let login proceed. If we return EPERM * userspace will reject all logins. This should be removed when we * support non init namespaces!! */ if (current_user_ns() != &init_user_ns) return -ECONNREFUSED; switch (msg_type) { case AUDIT_LIST: case AUDIT_ADD: case AUDIT_DEL: return -EOPNOTSUPP; case AUDIT_GET: case AUDIT_SET: case AUDIT_GET_FEATURE: case AUDIT_SET_FEATURE: case AUDIT_LIST_RULES: case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: case AUDIT_TTY_GET: case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: /* Only support auditd and auditctl in initial pid namespace * for now. */ if (task_active_pid_ns(current) != &init_pid_ns) return -EPERM; if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!netlink_capable(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ err = -EINVAL; } return err; } static void audit_log_common_recv_msg(struct audit_context *context, struct audit_buffer **ab, u16 msg_type) { uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; return; } *ab = audit_log_start(context, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return; audit_log_format(*ab, "pid=%d uid=%u ", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); } static inline void audit_log_user_recv_msg(struct audit_buffer **ab, u16 msg_type) { audit_log_common_recv_msg(NULL, ab, msg_type); } static int is_audit_feature_set(int i) { return af.features & AUDIT_FEATURE_TO_MASK(i); } static int audit_get_feature(struct sk_buff *skb) { u32 seq; seq = nlmsg_hdr(skb)->nlmsg_seq; audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af)); return 0; } static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature, u32 old_lock, u32 new_lock, int res) { struct audit_buffer *ab; if (audit_enabled == AUDIT_OFF) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE); if (!ab) return; audit_log_task_info(ab); audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, !!old_lock, !!new_lock, res); audit_log_end(ab); } static int audit_set_feature(struct audit_features *uaf) { int i; BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); /* if there is ever a version 2 we should handle that here */ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i); u32 old_feature, new_feature, old_lock, new_lock; /* if we are not changing this feature, move along */ if (!(feature & uaf->mask)) continue; old_feature = af.features & feature; new_feature = uaf->features & feature; new_lock = (uaf->lock | af.lock) & feature; old_lock = af.lock & feature; /* are we changing a locked feature? */ if (old_lock && (new_feature != old_feature)) { audit_log_feature_change(i, old_feature, new_feature, old_lock, new_lock, 0); return -EPERM; } } /* nothing invalid, do the changes */ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i); u32 old_feature, new_feature, old_lock, new_lock; /* if we are not changing this feature, move along */ if (!(feature & uaf->mask)) continue; old_feature = af.features & feature; new_feature = uaf->features & feature; old_lock = af.lock & feature; new_lock = (uaf->lock | af.lock) & feature; if (new_feature != old_feature) audit_log_feature_change(i, old_feature, new_feature, old_lock, new_lock, 1); if (new_feature) af.features |= feature; else af.features &= ~feature; af.lock |= new_lock; } return 0; } static int audit_replace(struct pid *pid) { pid_t pvnr; struct sk_buff *skb; pvnr = pid_vnr(pid); skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr)); if (!skb) return -ENOMEM; return auditd_send_unicast_skb(skb); } static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, bool *ack) { u32 seq; void *data; int data_len; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; struct audit_sig_info *sig_data; struct lsm_context lsmctx = { NULL, 0, 0 }; err = audit_netlink_ok(skb, msg_type); if (err) return err; seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); data_len = nlmsg_len(nlh); switch (msg_type) { case AUDIT_GET: { struct audit_status s; memset(&s, 0, sizeof(s)); s.enabled = audit_enabled; s.failure = audit_failure; /* NOTE: use pid_vnr() so the PID is relative to the current * namespace */ s.pid = auditd_pid_vnr(); s.rate_limit = audit_rate_limit; s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; s.backlog_wait_time = audit_backlog_wait_time; s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual); audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_SET: { struct audit_status s; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(s.failure); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_PID) { /* NOTE: we are using the vnr PID functions below * because the s.pid value is relative to the * namespace of the caller; at present this * doesn't matter much since you can really only * run auditd from the initial pid namespace, but * something to keep in mind if this changes */ pid_t new_pid = s.pid; pid_t auditd_pid; struct pid *req_pid = task_tgid(current); /* Sanity check - PID values must match. Setting * pid to 0 is how auditd ends auditing. */ if (new_pid && (new_pid != pid_vnr(req_pid))) return -EINVAL; /* test the auditd connection */ audit_replace(req_pid); auditd_pid = auditd_pid_vnr(); if (auditd_pid) { /* replacing a healthy auditd is not allowed */ if (new_pid) { audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EEXIST; } /* only current auditd can unregister itself */ if (pid_vnr(req_pid) != auditd_pid) { audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EACCES; } } if (new_pid) { /* register a new auditd connection */ err = auditd_set(req_pid, NETLINK_CB(skb).portid, sock_net(NETLINK_CB(skb).sk), skb, ack); if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, auditd_pid, err ? 0 : 1); if (err) return err; /* try to process any backlog */ wake_up_interruptible(&kauditd_wait); } else { if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, auditd_pid, 1); /* unregister the auditd connection */ auditd_reset(NULL); } } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) { err = audit_set_backlog_limit(s.backlog_limit); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) { if (sizeof(s) > (size_t)nlh->nlmsg_len) return -EINVAL; if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) return -EINVAL; err = audit_set_backlog_wait_time(s.backlog_wait_time); if (err < 0) return err; } if (s.mask == AUDIT_STATUS_LOST) { u32 lost = atomic_xchg(&audit_lost, 0); audit_log_config_change("lost", 0, lost, 1); return lost; } if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) { u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0); audit_log_config_change("backlog_wait_time_actual", 0, actual, 1); return actual; } break; } case AUDIT_GET_FEATURE: err = audit_get_feature(skb); if (err) return err; break; case AUDIT_SET_FEATURE: if (data_len < sizeof(struct audit_features)) return -EINVAL; err = audit_set_feature(data); if (err) return err; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; /* exit early if there isn't at least one character to print */ if (data_len < 2) return -EINVAL; err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ char *str = data; err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push(); if (err) break; } audit_log_user_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) { /* ensure NULL termination */ str[data_len - 1] = '\0'; audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, str); } else { audit_log_format(ab, " data="); if (str[data_len - 1] == '\0') data_len--; audit_log_n_untrustedstring(ab, str, data_len); } audit_log_end(ab); } break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: if (data_len < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=%s audit_enabled=%d res=0", msg_type == AUDIT_ADD_RULE ? "add_rule" : "remove_rule", audit_enabled); audit_log_end(ab); return -EPERM; } err = audit_rule_change(msg_type, seq, data, data_len); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: audit_trim_trees(); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); break; case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; size_t msglen = data_len; char *old, *new; err = -EINVAL; if (msglen < 2 * sizeof(u32)) break; memcpy(sizes, bufp, 2 * sizeof(u32)); bufp += 2 * sizeof(u32); msglen -= 2 * sizeof(u32); old = audit_unpack_string(&bufp, &msglen, sizes[0]); if (IS_ERR(old)) { err = PTR_ERR(old); break; } new = audit_unpack_string(&bufp, &msglen, sizes[1]); if (IS_ERR(new)) { err = PTR_ERR(new); kfree(old); break; } /* OK, here comes... */ err = audit_tag_tree(old, new); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); audit_log_format(ab, " new="); audit_log_untrustedstring(ab, new); audit_log_format(ab, " res=%d", !err); audit_log_end(ab); kfree(old); kfree(new); break; } case AUDIT_SIGNAL_INFO: if (lsmprop_is_set(&audit_sig_lsm)) { err = security_lsmprop_to_secctx(&audit_sig_lsm, &lsmctx, LSM_ID_UNDEF); if (err < 0) return err; } sig_data = kmalloc(struct_size(sig_data, ctx, lsmctx.len), GFP_KERNEL); if (!sig_data) { if (lsmprop_is_set(&audit_sig_lsm)) security_release_secctx(&lsmctx); return -ENOMEM; } sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; if (lsmprop_is_set(&audit_sig_lsm)) { memcpy(sig_data->ctx, lsmctx.context, lsmctx.len); security_release_secctx(&lsmctx); } audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, struct_size(sig_data, ctx, lsmctx.len)); kfree(sig_data); break; case AUDIT_TTY_GET: { struct audit_tty_status s; unsigned int t; t = READ_ONCE(current->signal->audit_tty); s.enabled = t & AUDIT_TTY_ENABLE; s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status s, old; struct audit_buffer *ab; unsigned int t; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) err = -EINVAL; if (err) t = READ_ONCE(current->signal->audit_tty); else { t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD); t = xchg(&current->signal->audit_tty, t); } old.enabled = t & AUDIT_TTY_ENABLE; old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" " old-log_passwd=%d new-log_passwd=%d res=%d", old.enabled, s.enabled, old.log_passwd, s.log_passwd, !err); audit_log_end(ab); break; } default: err = -EINVAL; break; } return err < 0 ? err : 0; } /** * audit_receive - receive messages from a netlink control socket * @skb: the message buffer * * Parse the provided skb and deal with any messages that may be present, * malformed skbs are discarded. */ static void audit_receive(struct sk_buff *skb) { struct nlmsghdr *nlh; bool ack; /* * len MUST be signed for nlmsg_next to be able to dec it below 0 * if the nlmsg_len was not aligned */ int len; int err; nlh = nlmsg_hdr(skb); len = skb->len; audit_ctl_lock(); while (nlmsg_ok(nlh, len)) { ack = nlh->nlmsg_flags & NLM_F_ACK; err = audit_receive_msg(skb, nlh, &ack); /* send an ack if the user asked for one and audit_receive_msg * didn't already do it, or if there was an error. */ if (ack || err) netlink_ack(skb, nlh, err, NULL); nlh = nlmsg_next(nlh, &len); } audit_ctl_unlock(); /* can't block with the ctrl lock, so penalize the sender now */ if (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { DECLARE_WAITQUEUE(wait, current); /* wake kauditd to try and flush the queue */ wake_up_interruptible(&kauditd_wait); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(audit_backlog_wait_time); remove_wait_queue(&audit_backlog_wait, &wait); } } /* Log information about who is connecting to the audit multicast socket */ static void audit_log_multicast(int group, const char *op, int err) { const struct cred *cred; struct tty_struct *tty; char comm[sizeof(current->comm)]; struct audit_buffer *ab; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER); if (!ab) return; cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", task_tgid_nr(current), from_kuid(&init_user_ns, cred->uid), from_kuid(&init_user_ns, audit_get_loginuid(current)), tty ? tty_name(tty) : "(none)", audit_get_sessionid(current)); audit_put_tty(tty); audit_log_task_context(ab); /* subj= */ audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); /* exe= */ audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err); audit_log_end(ab); } /* Run custom bind function on netlink socket group connect or bind requests. */ static int audit_multicast_bind(struct net *net, int group) { int err = 0; if (!capable(CAP_AUDIT_READ)) err = -EPERM; audit_log_multicast(group, "connect", err); return err; } static void audit_multicast_unbind(struct net *net, int group) { audit_log_multicast(group, "disconnect", 0); } static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, .bind = audit_multicast_bind, .unbind = audit_multicast_unbind, .flags = NL_CFG_F_NONROOT_RECV, .groups = AUDIT_NLGRP_MAX, }; struct audit_net *aunet = net_generic(net, audit_net_id); aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); if (aunet->sk == NULL) { audit_panic("cannot initialize netlink socket in namespace"); return -ENOMEM; } /* limit the timeout in case auditd is blocked/stopped */ aunet->sk->sk_sndtimeo = HZ / 10; return 0; } static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); /* NOTE: you would think that we would want to check the auditd * connection and potentially reset it here if it lives in this * namespace, but since the auditd connection tracking struct holds a * reference to this namespace (see auditd_set()) we are only ever * going to get here after that connection has been released */ netlink_kernel_release(aunet->sk); } static struct pernet_operations audit_net_ops __net_initdata = { .init = audit_net_init, .exit = audit_net_exit, .id = &audit_net_id, .size = sizeof(struct audit_net), }; /* Initialize audit support at boot time. */ static int __init audit_init(void) { int i; if (audit_initialized == AUDIT_DISABLED) return 0; audit_buffer_cache = KMEM_CACHE(audit_buffer, SLAB_PANIC); skb_queue_head_init(&audit_queue); skb_queue_head_init(&audit_retry_queue); skb_queue_head_init(&audit_hold_queue); for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); mutex_init(&audit_cmd_mutex.lock); audit_cmd_mutex.owner = NULL; pr_info("initializing netlink subsys (%s)\n", str_enabled_disabled(audit_default)); register_pernet_subsys(&audit_net_ops); audit_initialized = AUDIT_INITIALIZED; kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); if (IS_ERR(kauditd_task)) { int err = PTR_ERR(kauditd_task); panic("audit: failed to start the kauditd thread (%d)\n", err); } audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "state=initialized audit_enabled=%u res=1", audit_enabled); return 0; } postcore_initcall(audit_init); /* * Process kernel command-line parameter at boot time. * audit={0|off} or audit={1|on}. */ static int __init audit_enable(char *str) { if (!strcasecmp(str, "off") || !strcmp(str, "0")) audit_default = AUDIT_OFF; else if (!strcasecmp(str, "on") || !strcmp(str, "1")) audit_default = AUDIT_ON; else { pr_err("audit: invalid 'audit' parameter value (%s)\n", str); audit_default = AUDIT_ON; } if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; if (audit_set_enabled(audit_default)) pr_err("audit: error setting audit state (%d)\n", audit_default); pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); return 1; } __setup("audit=", audit_enable); /* Process kernel command-line parameter at boot time. * audit_backlog_limit=<n> */ static int __init audit_backlog_limit_set(char *str) { u32 audit_backlog_limit_arg; pr_info("audit_backlog_limit: "); if (kstrtouint(str, 0, &audit_backlog_limit_arg)) { pr_cont("using default of %u, unable to parse %s\n", audit_backlog_limit, str); return 1; } audit_backlog_limit = audit_backlog_limit_arg; pr_cont("%d\n", audit_backlog_limit); return 1; } __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { struct sk_buff *skb; if (!ab) return; while ((skb = skb_dequeue(&ab->skb_list))) kfree_skb(skb); kmem_cache_free(audit_buffer_cache, ab); } static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask); if (!ab) return NULL; skb_queue_head_init(&ab->skb_list); ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; skb_queue_tail(&ab->skb_list, ab->skb); if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; ab->ctx = ctx; ab->gfp_mask = gfp_mask; return ab; err: audit_buffer_free(ab); return NULL; } /** * audit_serial - compute a serial number for the audit record * * Compute a serial number for the audit record. Audit records are * written to user-space as soon as they are generated, so a complete * audit record may be written in several pieces. The timestamp of the * record and this serial number are used by the user-space tools to * determine which pieces belong to the same audit record. The * (timestamp,serial) tuple is unique for each syscall and is live from * syscall entry to syscall exit. * * NOTE: Another possibility is to store the formatted records off the * audit context (for those records that have a context), and emit them * all at syscall exit. However, this could delay the reporting of * significant errors until syscall exit (or never, if the system * halts). */ unsigned int audit_serial(void) { static atomic_t serial = ATOMIC_INIT(0); return atomic_inc_return(&serial); } static inline void audit_get_stamp(struct audit_context *ctx, struct audit_stamp *stamp) { if (!ctx || !auditsc_get_stamp(ctx, stamp)) { ktime_get_coarse_real_ts64(&stamp->ctime); stamp->serial = audit_serial(); } } /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) * @gfp_mask: type of allocation * @type: audit message type * * Returns audit_buffer pointer on success or NULL on error. * * Obtain an audit buffer. This routine does locking to obtain the * audit buffer, but then no locking is required for calls to * audit_log_*format. If the task (ctx) is a task that is currently in a * syscall, then the syscall is marked as auditable and an audit record * will be written at syscall exit. If there is no associated task, then * task context (ctx) should be NULL. */ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; if (audit_initialized != AUDIT_INITIALIZED) return NULL; if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE))) return NULL; /* NOTE: don't ever fail/sleep on these two conditions: * 1. auditd generated record - since we need auditd to drain the * queue; also, when we are checking for auditd, compare PIDs using * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() * using a PID anchored in the caller's namespace * 2. generator holding the audit_cmd_mutex - we don't want to block * while holding the mutex, although we do penalize the sender * later in audit_receive() when it is safe to block */ if (!(auditd_test_task(current) || audit_ctl_owner_current())) { long stime = audit_backlog_wait_time; while (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { /* wake kauditd to try and flush the queue */ wake_up_interruptible(&kauditd_wait); /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { long rtime = stime; DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); stime = schedule_timeout(rtime); atomic_add(rtime - stime, &audit_backlog_wait_time_actual); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", skb_queue_len(&audit_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); return NULL; } } } ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); return NULL; } audit_get_stamp(ab->ctx, &ab->stamp); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)ab->stamp.ctime.tv_sec, ab->stamp.ctime.tv_nsec/1000000, ab->stamp.serial); return ab; } /** * audit_expand - expand skb in the audit buffer * @ab: audit_buffer * @extra: space to add at tail of the skb * * Returns 0 (no space) on failed expansion, or available space if * successful. */ static inline int audit_expand(struct audit_buffer *ab, int extra) { struct sk_buff *skb = ab->skb; int oldtail = skb_tailroom(skb); int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask); int newtail = skb_tailroom(skb); if (ret < 0) { audit_log_lost("out of memory in audit_expand"); return 0; } skb->truesize += newtail - oldtail; return newtail; } /* * Format an audit message into the audit buffer. If there isn't enough * room in the audit buffer, more room will be allocated and vsnprint * will be called a second time. Currently, we assume that a printk * can't format message larger than 1024 bytes, so we don't either. */ static __printf(2, 0) void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) { int len, avail; struct sk_buff *skb; va_list args2; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); if (avail == 0) { avail = audit_expand(ab, AUDIT_BUFSIZ); if (!avail) goto out; } va_copy(args2, args); len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args); if (len >= avail) { /* The printk buffer is 1024 bytes long, so if we get * here and AUDIT_BUFSIZ is at least 1024, then we can * log everything that printk could have logged. */ avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); if (!avail) goto out_va_end; len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); } if (len > 0) skb_put(skb, len); out_va_end: va_end(args2); out: return; } /** * audit_log_format - format a message into the audit buffer. * @ab: audit_buffer * @fmt: format string * @...: optional parameters matching @fmt string * * All the work is done in audit_log_vformat. */ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) { va_list args; if (!ab) return; va_start(args, fmt); audit_log_vformat(ab, fmt, args); va_end(args); } /** * audit_log_n_hex - convert a buffer to hex and append it to the audit skb * @ab: the audit_buffer * @buf: buffer to convert to hex * @len: length of @buf to be converted * * No return value; failure to expand is silently ignored. * * This function will take the passed buf and convert it into a string of * ascii hex digits. The new string is placed onto the skb. */ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { int i, avail, new_len; unsigned char *ptr; struct sk_buff *skb; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); new_len = len<<1; if (new_len >= avail) { /* Round the buffer request up to the next multiple */ new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); avail = audit_expand(ab, new_len); if (!avail) return; } ptr = skb_tail_pointer(skb); for (i = 0; i < len; i++) ptr = hex_byte_pack_upper(ptr, buf[i]); *ptr = 0; skb_put(skb, len << 1); /* new string is twice the old string */ } /* * Format a string of no more than slen characters into the audit buffer, * enclosed in quote marks. */ void audit_log_n_string(struct audit_buffer *ab, const char *string, size_t slen) { int avail, new_len; unsigned char *ptr; struct sk_buff *skb; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); new_len = slen + 3; /* enclosing quotes + null terminator */ if (new_len > avail) { avail = audit_expand(ab, new_len); if (!avail) return; } ptr = skb_tail_pointer(skb); *ptr++ = '"'; memcpy(ptr, string, slen); ptr += slen; *ptr++ = '"'; *ptr = 0; skb_put(skb, slen + 2); /* don't include null terminator */ } /** * audit_string_contains_control - does a string need to be logged in hex * @string: string to be checked * @len: max length of the string to check */ bool audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len; p++) { if (*p == '"' || *p < 0x21 || *p > 0x7e) return true; } return false; } /** * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @string: string to be logged * @len: length of string (not including trailing null) * * This code will escape a string that is passed to it if the string * contains a control character, unprintable character, double quote mark, * or a space. Unescaped strings will start and end with a double quote mark. * Strings that are escaped are printed in hex (2 digits per char). * * The caller specifies the number of characters in the string to log, which may * or may not be the entire string. */ void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t len) { if (audit_string_contains_control(string, len)) audit_log_n_hex(ab, string, len); else audit_log_n_string(ab, string, len); } /** * audit_log_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @string: string to be logged * * Same as audit_log_n_untrustedstring(), except that strlen is used to * determine string length. */ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { audit_log_n_untrustedstring(ab, string, strlen(string)); } /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path) { char *p, *pathname; if (prefix) audit_log_format(ab, "%s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); if (!pathname) { audit_log_format(ab, "\"<no_memory>\""); return; } p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ audit_log_format(ab, "\"<too_long>\""); } else audit_log_untrustedstring(ab, p); kfree(pathname); } void audit_log_session_info(struct audit_buffer *ab) { unsigned int sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); audit_log_format(ab, "auid=%u ses=%u", auid, sessionid); } void audit_log_key(struct audit_buffer *ab, char *key) { audit_log_format(ab, " key="); if (key) audit_log_untrustedstring(ab, key); else audit_log_format(ab, "(null)"); } /** * audit_buffer_aux_new - Add an aux record buffer to the skb list * @ab: audit_buffer * @type: message type * * Aux records are allocated and added to the skb list of * the "main" record. The ab->skb is reset to point to the * aux record on its creation. When the aux record in complete * ab->skb has to be reset to point to the "main" record. * This allows the audit_log_ functions to be ignorant of * which kind of record it is logging to. It also avoids adding * special data for aux records. * * On success ab->skb will point to the new aux record. * Returns 0 on success, -ENOMEM should allocation fail. */ static int audit_buffer_aux_new(struct audit_buffer *ab, int type) { WARN_ON(ab->skb != skb_peek(&ab->skb_list)); ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask); if (!ab->skb) goto err; if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; skb_queue_tail(&ab->skb_list, ab->skb); audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)ab->stamp.ctime.tv_sec, ab->stamp.ctime.tv_nsec/1000000, ab->stamp.serial); return 0; err: kfree_skb(ab->skb); ab->skb = skb_peek(&ab->skb_list); return -ENOMEM; } /** * audit_buffer_aux_end - Switch back to the "main" record from an aux record * @ab: audit_buffer * * Restores the "main" audit record to ab->skb. */ static void audit_buffer_aux_end(struct audit_buffer *ab) { ab->skb = skb_peek(&ab->skb_list); } /** * audit_log_subj_ctx - Add LSM subject information * @ab: audit_buffer * @prop: LSM subject properties. * * Add a subj= field and, if necessary, a AUDIT_MAC_TASK_CONTEXTS record. */ int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { struct lsm_context ctx; char *space = ""; int error; int i; security_current_getlsmprop_subj(prop); if (!lsmprop_is_set(prop)) return 0; if (audit_subj_secctx_cnt < 2) { error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); if (error < 0) { if (error != -EINVAL) goto error_path; return 0; } audit_log_format(ab, " subj=%s", ctx.context); security_release_secctx(&ctx); return 0; } /* Multiple LSMs provide contexts. Include an aux record. */ audit_log_format(ab, " subj=?"); error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS); if (error) goto error_path; for (i = 0; i < audit_subj_secctx_cnt; i++) { error = security_lsmprop_to_secctx(prop, &ctx, audit_subj_lsms[i]->id); if (error < 0) { /* * Don't print anything. An LSM like BPF could * claim to support contexts, but only do so under * certain conditions. */ if (error == -EOPNOTSUPP) continue; if (error != -EINVAL) audit_panic("error in audit_log_subj_ctx"); } else { audit_log_format(ab, "%ssubj_%s=%s", space, audit_subj_lsms[i]->name, ctx.context); space = " "; security_release_secctx(&ctx); } } audit_buffer_aux_end(ab); return 0; error_path: audit_panic("error in audit_log_subj_ctx"); return error; } EXPORT_SYMBOL(audit_log_subj_ctx); int audit_log_task_context(struct audit_buffer *ab) { struct lsm_prop prop; security_current_getlsmprop_subj(&prop); return audit_log_subj_ctx(ab, &prop); } EXPORT_SYMBOL(audit_log_task_context); int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { int i; int rc; int error = 0; char *space = ""; struct lsm_context ctx; if (audit_obj_secctx_cnt < 2) { error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); if (error < 0) { if (error != -EINVAL) goto error_path; return error; } audit_log_format(ab, " obj=%s", ctx.context); security_release_secctx(&ctx); return 0; } audit_log_format(ab, " obj=?"); error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS); if (error) goto error_path; for (i = 0; i < audit_obj_secctx_cnt; i++) { rc = security_lsmprop_to_secctx(prop, &ctx, audit_obj_lsms[i]->id); if (rc < 0) { audit_log_format(ab, "%sobj_%s=?", space, audit_obj_lsms[i]->name); if (rc != -EINVAL) audit_panic("error in audit_log_obj_ctx"); error = rc; } else { audit_log_format(ab, "%sobj_%s=%s", space, audit_obj_lsms[i]->name, ctx.context); security_release_secctx(&ctx); } space = " "; } audit_buffer_aux_end(ab); return error; error_path: audit_panic("error in audit_log_obj_ctx"); return error; } void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { struct file *exe_file; if (!mm) goto out_null; exe_file = get_mm_exe_file(mm); if (!exe_file) goto out_null; audit_log_d_path(ab, " exe=", &exe_file->f_path); fput(exe_file); return; out_null: audit_log_format(ab, " exe=(null)"); } struct tty_struct *audit_get_tty(void) { struct tty_struct *tty = NULL; unsigned long flags; spin_lock_irqsave(&current->sighand->siglock, flags); if (current->signal) tty = tty_kref_get(current->signal->tty); spin_unlock_irqrestore(&current->sighand->siglock, flags); return tty; } void audit_put_tty(struct tty_struct *tty) { tty_kref_put(tty); } void audit_log_task_info(struct audit_buffer *ab) { const struct cred *cred; char comm[sizeof(current->comm)]; struct tty_struct *tty; if (!ab) return; cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(current), task_tgid_nr(current), from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), from_kuid(&init_user_ns, cred->euid), from_kuid(&init_user_ns, cred->suid), from_kuid(&init_user_ns, cred->fsuid), from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid), tty ? tty_name(tty) : "(none)", audit_get_sessionid(current)); audit_put_tty(tty); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); audit_log_task_context(ab); } EXPORT_SYMBOL(audit_log_task_info); /** * audit_log_path_denied - report a path restriction denial * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc) * @operation: specific operation name */ void audit_log_path_denied(int type, const char *operation) { struct audit_buffer *ab; if (!audit_enabled) return; /* Generate log with subject, operation, outcome. */ ab = audit_log_start(audit_context(), GFP_KERNEL, type); if (!ab) return; audit_log_format(ab, "op=%s", operation); audit_log_task_info(ab); audit_log_format(ab, " res=0"); audit_log_end(ab); } /* global counter which is incremented every time something logs in */ static atomic_t session_id = ATOMIC_INIT(0); static int audit_set_loginuid_perm(kuid_t loginuid) { /* if we are unset, we don't need privs */ if (!audit_loginuid_set(current)) return 0; /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) return -EPERM; /* it is set, you need permission */ if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; /* reject if this is not an unset and we don't allow that */ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) return -EPERM; return 0; } static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, unsigned int oldsessionid, unsigned int sessionid, int rc) { struct audit_buffer *ab; uid_t uid, oldloginuid, loginuid; struct tty_struct *tty; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid); tty = audit_get_tty(); audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", oldsessionid, sessionid, !rc); audit_put_tty(tty); audit_log_end(ab); } /** * audit_set_loginuid - set current task's loginuid * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ int audit_set_loginuid(kuid_t loginuid) { unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; kuid_t oldloginuid; int rc; oldloginuid = audit_get_loginuid(current); oldsessionid = audit_get_sessionid(current); rc = audit_set_loginuid_perm(loginuid); if (rc) goto out; /* are we setting or clearing? */ if (uid_valid(loginuid)) { sessionid = (unsigned int)atomic_inc_return(&session_id); if (unlikely(sessionid == AUDIT_SID_UNSET)) sessionid = (unsigned int)atomic_inc_return(&session_id); } current->sessionid = sessionid; current->loginuid = loginuid; out: audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); return rc; } /** * audit_signal_info - record signal info for shutting down audit subsystem * @sig: signal value * @t: task being signaled * * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ int audit_signal_info(int sig, struct task_struct *t) { kuid_t uid = current_uid(), auid; if (auditd_test_task(t) && (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2)) { audit_sig_pid = task_tgid_nr(current); auid = audit_get_loginuid(current); if (uid_valid(auid)) audit_sig_uid = auid; else audit_sig_uid = uid; security_current_getlsmprop_subj(&audit_sig_lsm); } return audit_signal_info_syscall(t); } /** * __audit_log_end - enqueue one audit record * @skb: the buffer to send */ static void __audit_log_end(struct sk_buff *skb) { struct nlmsghdr *nlh; if (audit_rate_check()) { /* setup the netlink header, see the comments in * kauditd_send_multicast_skb() for length quirks */ nlh = nlmsg_hdr(skb); nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; /* queue the netlink packet */ skb_queue_tail(&audit_queue, skb); } else { audit_log_lost("rate limit exceeded"); kfree_skb(skb); } } /** * audit_log_end - end one audit record * @ab: the audit_buffer * * We can not do a netlink send inside an irq context because it blocks (last * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a * queue and a kthread is scheduled to remove them from the queue outside the * irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; if (!ab) return; while ((skb = skb_dequeue(&ab->skb_list))) __audit_log_end(skb); /* poke the kauditd thread */ wake_up_interruptible(&kauditd_wait); audit_buffer_free(ab); } /** * audit_log - Log an audit record * @ctx: audit context * @gfp_mask: type of allocation * @type: audit message type * @fmt: format string to use * @...: variable parameters matching the format string * * This is a convenience function that calls audit_log_start, * audit_log_vformat, and audit_log_end. It may be called * in any context. */ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { struct audit_buffer *ab; va_list args; ab = audit_log_start(ctx, gfp_mask, type); if (ab) { va_start(args, fmt); audit_log_vformat(ab, fmt, args); va_end(args); audit_log_end(ab); } } EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); EXPORT_SYMBOL(audit_log);
11 1 4 6 5 2 5 4567 1293 4572 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/workqueue.h> #include <linux/spinlock.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> #include <net/ip.h> #include <net/flow.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_flow_table.h> struct nft_flow_offload { struct nft_flowtable *flowtable; }; static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { if (skb_sec_path(skb)) return true; if (family == NFPROTO_IPV4) { const struct ip_options *opt; opt = &(IPCB(skb)->opt); if (unlikely(opt->optlen)) return true; } return false; } static void flow_offload_ct_tcp(struct nf_conn *ct) { /* conntrack will not see all packets, disable tcp window validation. */ spin_lock_bh(&ct->lock); ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; spin_unlock_bh(&ct->lock); } static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; struct tcphdr _tcph, *tcph = NULL; struct nf_flow_route route = {}; enum ip_conntrack_info ctinfo; struct flow_offload *flow; enum ip_conntrack_dir dir; struct nf_conn *ct; int ret; if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt))) goto out; ct = nf_ct_get(pkt->skb, &ctinfo); if (!ct) goto out; switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); if (unlikely(!tcph || tcph->fin || tcph->rst || !nf_conntrack_tcp_established(ct))) goto out; break; case IPPROTO_UDP: break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: { struct nf_conntrack_tuple *tuple; if (ct->status & IPS_NAT_MASK) goto out; tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* No support for GRE v1 */ if (tuple->src.u.gre.key || tuple->dst.u.gre.key) goto out; break; } #endif default: goto out; } if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) goto out; if (!nf_ct_is_confirmed(ct)) goto out; if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) goto out; dir = CTINFO2DIR(ctinfo); if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0) goto err_flow_route; flow = flow_offload_alloc(ct); if (!flow) goto err_flow_alloc; flow_offload_route_init(flow, &route); if (tcph) flow_offload_ct_tcp(ct); __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; return; err_flow_add: flow_offload_free(flow); err_flow_alloc: dst_release(route.tuple[dir].dst); dst_release(route.tuple[!dir].dst); err_flow_route: clear_bit(IPS_OFFLOAD_BIT, &ct->status); out: regs->verdict.code = NFT_BREAK; } static int nft_flow_offload_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { unsigned int hook_mask = (1 << NF_INET_FORWARD); if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; return nft_chain_validate_hooks(ctx->chain, hook_mask); } static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = { [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_NAME_MAXLEN - 1 }, }; static int nft_flow_offload_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_flow_offload *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_flowtable *flowtable; if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; flowtable = nft_flowtable_lookup(ctx->net, ctx->table, tb[NFTA_FLOW_TABLE_NAME], genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); if (!nft_use_inc(&flowtable->use)) return -EMFILE; priv->flowtable = flowtable; return nf_ct_netns_get(ctx->net, ctx->family); } static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_flow_offload *priv = nft_expr_priv(expr); nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase); } static void nft_flow_offload_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_flow_offload *priv = nft_expr_priv(expr); nft_use_inc_restore(&priv->flowtable->use); } static void nft_flow_offload_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, ctx->family); } static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_flow_offload *priv = nft_expr_priv(expr); if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_flow_offload_type; static const struct nft_expr_ops nft_flow_offload_ops = { .type = &nft_flow_offload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), .eval = nft_flow_offload_eval, .init = nft_flow_offload_init, .activate = nft_flow_offload_activate, .deactivate = nft_flow_offload_deactivate, .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_flow_offload_type __read_mostly = { .name = "flow_offload", .ops = &nft_flow_offload_ops, .policy = nft_flow_offload_policy, .maxattr = NFTA_FLOW_MAX, .owner = THIS_MODULE, }; static int flow_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (event != NETDEV_DOWN) return NOTIFY_DONE; nf_flow_table_cleanup(dev); return NOTIFY_DONE; } static struct notifier_block flow_offload_netdev_notifier = { .notifier_call = flow_offload_netdev_event, }; static int __init nft_flow_offload_module_init(void) { int err; err = register_netdevice_notifier(&flow_offload_netdev_notifier); if (err) goto err; err = nft_register_expr(&nft_flow_offload_type); if (err < 0) goto register_expr; return 0; register_expr: unregister_netdevice_notifier(&flow_offload_netdev_notifier); err: return err; } static void __exit nft_flow_offload_module_exit(void) { nft_unregister_expr(&nft_flow_offload_type); unregister_netdevice_notifier(&flow_offload_netdev_notifier); } module_init(nft_flow_offload_module_init); module_exit(nft_flow_offload_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("flow_offload"); MODULE_DESCRIPTION("nftables hardware flow offload module");
397 10 87 412 150 399 299 191 395 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_GENERIC_NETLINK_H #define __NET_GENERIC_NETLINK_H #include <linux/net.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <uapi/linux/genetlink.h> #define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN) /* Non-parallel generic netlink requests are serialized by a global lock. */ void genl_lock(void); void genl_unlock(void); #define MODULE_ALIAS_GENL_FAMILY(family) \ MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family) /* Binding to multicast group requires %CAP_NET_ADMIN */ #define GENL_MCAST_CAP_NET_ADMIN BIT(0) /* Binding to multicast group requires %CAP_SYS_ADMIN */ #define GENL_MCAST_CAP_SYS_ADMIN BIT(1) /** * struct genl_multicast_group - generic netlink multicast group * @name: name of the multicast group, names are per-family * @flags: GENL_MCAST_* flags */ struct genl_multicast_group { char name[GENL_NAMSIZ]; u8 flags; }; struct genl_split_ops; struct genl_info; /** * struct genl_family - generic netlink family * @hdrsize: length of user specific header in bytes * @name: name of family * @version: protocol version * @maxattr: maximum number of attributes supported * @policy: netlink policy * @netnsok: set to true if the family can handle network * namespaces and should be presented in all of them * @parallel_ops: operations can be called in parallel and aren't * synchronized by the core genetlink code * @pre_doit: called before an operation's doit callback, it may * do additional, common, filtering and return an error * @post_doit: called after an operation's doit callback, it may * undo operations done by pre_doit, for example release locks * @bind: called when family multicast group is added to a netlink socket * @unbind: called when family multicast group is removed from a netlink socket * @module: pointer to the owning module (set to THIS_MODULE) * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @resv_start_op: first operation for which reserved fields of the header * can be validated and policies are required (see below); * new families should leave this field at zero * @ops: the operations supported by this family * @n_ops: number of operations supported by this family * @small_ops: the small-struct operations supported by this family * @n_small_ops: number of small-struct operations supported by this family * @split_ops: the split do/dump form of operation definition * @n_split_ops: number of entries in @split_ops, note that with split do/dump * ops the number of entries is not the same as number of commands * @sock_priv_size: the size of per-socket private memory * @sock_priv_init: the per-socket private memory initializer * @sock_priv_destroy: the per-socket private memory destructor * * Attribute policies (the combination of @policy and @maxattr fields) * can be attached at the family level or at the operation level. * If both are present the per-operation policy takes precedence. * For operations before @resv_start_op lack of policy means that the core * will perform no attribute parsing or validation. For newer operations * if policy is not provided core will reject all TLV attributes. */ struct genl_family { unsigned int hdrsize; char name[GENL_NAMSIZ]; unsigned int version; unsigned int maxattr; u8 netnsok:1; u8 parallel_ops:1; u8 n_ops; u8 n_small_ops; u8 n_split_ops; u8 n_mcgrps; u8 resv_start_op; const struct nla_policy *policy; int (*pre_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); void (*post_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); int (*bind)(int mcgrp); void (*unbind)(int mcgrp); const struct genl_ops * ops; const struct genl_small_ops *small_ops; const struct genl_split_ops *split_ops; const struct genl_multicast_group *mcgrps; struct module *module; size_t sock_priv_size; void (*sock_priv_init)(void *priv); void (*sock_priv_destroy)(void *priv); /* private: internal use only */ /* protocol family identifier */ int id; /* starting number of multicast group IDs in this family */ unsigned int mcgrp_offset; /* list of per-socket privs */ struct xarray *sock_privs; }; /** * struct genl_info - receiving information * @snd_seq: sending sequence number * @snd_portid: netlink portid of sender * @family: generic netlink family * @nlhdr: netlink message header * @genlhdr: generic netlink message header * @attrs: netlink attributes * @_net: network namespace * @ctx: storage space for the use by the family * @user_ptr: user pointers (deprecated, use ctx instead) * @extack: extended ACK report struct */ struct genl_info { u32 snd_seq; u32 snd_portid; const struct genl_family *family; const struct nlmsghdr * nlhdr; struct genlmsghdr * genlhdr; struct nlattr ** attrs; possible_net_t _net; union { u8 ctx[NETLINK_CTX_SIZE]; void * user_ptr[2]; }; struct netlink_ext_ack *extack; }; static inline struct net *genl_info_net(const struct genl_info *info) { return read_pnet(&info->_net); } static inline void genl_info_net_set(struct genl_info *info, struct net *net) { write_pnet(&info->_net, net); } static inline void *genl_info_userhdr(const struct genl_info *info) { return (u8 *)info->genlhdr + GENL_HDRLEN; } #define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg) #define GENL_SET_ERR_MSG_FMT(info, msg, args...) \ NL_SET_ERR_MSG_FMT((info)->extack, msg, ##args) /* Report that a root attribute is missing */ #define GENL_REQ_ATTR_CHECK(info, attr) ({ \ const struct genl_info *__info = (info); \ \ NL_REQ_ATTR_CHECK(__info->extack, NULL, __info->attrs, (attr)); \ }) enum genl_validate_flags { GENL_DONT_VALIDATE_STRICT = BIT(0), GENL_DONT_VALIDATE_DUMP = BIT(1), GENL_DONT_VALIDATE_DUMP_STRICT = BIT(2), }; /** * struct genl_small_ops - generic netlink operations (small version) * @cmd: command identifier * @internal_flags: flags used by the family * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @validate: validation flags from enum genl_validate_flags * @doit: standard command callback * @dumpit: callback for dumpers * * This is a cut-down version of struct genl_ops for users who don't need * most of the ancillary infra and want to save space. */ struct genl_small_ops { int (*doit)(struct sk_buff *skb, struct genl_info *info); int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); u8 cmd; u8 internal_flags; u8 flags; u8 validate; }; /** * struct genl_ops - generic netlink operations * @cmd: command identifier * @internal_flags: flags used by the family * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @maxattr: maximum number of attributes supported * @policy: netlink policy (takes precedence over family policy) * @validate: validation flags from enum genl_validate_flags * @doit: standard command callback * @start: start callback for dumps * @dumpit: callback for dumpers * @done: completion callback for dumps */ struct genl_ops { int (*doit)(struct sk_buff *skb, struct genl_info *info); int (*start)(struct netlink_callback *cb); int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); const struct nla_policy *policy; unsigned int maxattr; u8 cmd; u8 internal_flags; u8 flags; u8 validate; }; /** * struct genl_split_ops - generic netlink operations (do/dump split version) * @cmd: command identifier * @internal_flags: flags used by the family * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @validate: validation flags from enum genl_validate_flags * @policy: netlink policy (takes precedence over family policy) * @maxattr: maximum number of attributes supported * * Do callbacks: * @pre_doit: called before an operation's @doit callback, it may * do additional, common, filtering and return an error * @doit: standard command callback * @post_doit: called after an operation's @doit callback, it may * undo operations done by pre_doit, for example release locks * * Dump callbacks: * @start: start callback for dumps * @dumpit: callback for dumpers * @done: completion callback for dumps * * Do callbacks can be used if %GENL_CMD_CAP_DO is set in @flags. * Dump callbacks can be used if %GENL_CMD_CAP_DUMP is set in @flags. * Exactly one of those flags must be set. */ struct genl_split_ops { union { struct { int (*pre_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); int (*doit)(struct sk_buff *skb, struct genl_info *info); void (*post_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); }; struct { int (*start)(struct netlink_callback *cb); int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); }; }; const struct nla_policy *policy; unsigned int maxattr; u8 cmd; u8 internal_flags; u8 flags; u8 validate; }; /** * struct genl_dumpit_info - info that is available during dumpit op call * @op: generic netlink ops - for internal genl code usage * @attrs: netlink attributes * @info: struct genl_info describing the request */ struct genl_dumpit_info { struct genl_split_ops op; struct genl_info info; }; static inline const struct genl_dumpit_info * genl_dumpit_info(struct netlink_callback *cb) { return cb->data; } static inline const struct genl_info * genl_info_dump(struct netlink_callback *cb) { return &genl_dumpit_info(cb)->info; } /** * genl_info_init_ntf() - initialize genl_info for notifications * @info: genl_info struct to set up * @family: pointer to the genetlink family * @cmd: command to be used in the notification * * Initialize a locally declared struct genl_info to pass to various APIs. * Intended to be used when creating notifications. */ static inline void genl_info_init_ntf(struct genl_info *info, const struct genl_family *family, u8 cmd) { struct genlmsghdr *hdr = (void *) &info->user_ptr[0]; memset(info, 0, sizeof(*info)); info->family = family; info->genlhdr = hdr; hdr->cmd = cmd; } static inline bool genl_info_is_ntf(const struct genl_info *info) { return !info->nlhdr; } void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk); void *genl_sk_priv_get(struct genl_family *family, struct sock *sk); int genl_register_family(struct genl_family *family); int genl_unregister_family(const struct genl_family *family); void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags); void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, const struct genl_family *family, int flags, u8 cmd); static inline void * __genlmsg_iput(struct sk_buff *skb, const struct genl_info *info, int flags) { return genlmsg_put(skb, info->snd_portid, info->snd_seq, info->family, flags, info->genlhdr->cmd); } /** * genlmsg_iput - start genetlink message based on genl_info * @skb: skb in which message header will be placed * @info: genl_info as provided to do/dump handlers * * Convenience wrapper which starts a genetlink message based on * information in user request. @info should be either the struct passed * by genetlink core to do/dump handlers (when constructing replies to * such requests) or a struct initialized by genl_info_init_ntf() * when constructing notifications. * * Returns: pointer to new genetlink header. */ static inline void * genlmsg_iput(struct sk_buff *skb, const struct genl_info *info) { return __genlmsg_iput(skb, info, 0); } /** * genlmsg_nlhdr - Obtain netlink header from user specified header * @user_hdr: user header as returned from genlmsg_put() * * Returns: pointer to netlink header. */ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr) { return (struct nlmsghdr *)((char *)user_hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_parse_deprecated - parse attributes of a genetlink message * @nlh: netlink message header * @family: genetlink message family * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh, const struct genl_family *family, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * genlmsg_parse - parse attributes of a genetlink message * @nlh: netlink message header * @family: genetlink message family * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ static inline int genlmsg_parse(const struct nlmsghdr *nlh, const struct genl_family *family, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * genl_dump_check_consistent - check if sequence is consistent and advertise if not * @cb: netlink callback structure that stores the sequence number * @user_hdr: user header as returned from genlmsg_put() * * Cf. nl_dump_check_consistent(), this just provides a wrapper to make it * simpler to use with generic netlink. */ static inline void genl_dump_check_consistent(struct netlink_callback *cb, void *user_hdr) { nl_dump_check_consistent(cb, genlmsg_nlhdr(user_hdr)); } /** * genlmsg_put_reply - Add generic netlink header to a reply message * @skb: socket buffer holding the message * @info: receiver info * @family: generic netlink family * @flags: netlink message flags * @cmd: generic netlink command * * Returns: pointer to user specific header */ static inline void *genlmsg_put_reply(struct sk_buff *skb, struct genl_info *info, const struct genl_family *family, int flags, u8 cmd) { return genlmsg_put(skb, info->snd_portid, info->snd_seq, family, flags, cmd); } /** * genlmsg_end - Finalize a generic netlink message * @skb: socket buffer the message is stored in * @hdr: user specific header */ static inline void genlmsg_end(struct sk_buff *skb, void *hdr) { nlmsg_end(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_cancel - Cancel construction of a generic netlink message * @skb: socket buffer the message is stored in * @hdr: generic netlink message header */ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) { if (hdr) nlmsg_cancel(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_multicast_netns_filtered - multicast a netlink message * to a specific netns with filter * function * @family: the generic netlink family * @net: the net namespace * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags * @filter: filter function * @filter_data: filter function private data * * Return: 0 on success, negative error code for failure. */ static inline int genlmsg_multicast_netns_filtered(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags, netlink_filter_fn filter, void *filter_data) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return nlmsg_multicast_filtered(net->genl_sock, skb, portid, group, flags, filter, filter_data); } /** * genlmsg_multicast_netns - multicast a netlink message to a specific netns * @family: the generic netlink family * @net: the net namespace * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags */ static inline int genlmsg_multicast_netns(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { return genlmsg_multicast_netns_filtered(family, net, skb, portid, group, flags, NULL, NULL); } /** * genlmsg_multicast - multicast a netlink message to the default netns * @family: the generic netlink family * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags */ static inline int genlmsg_multicast(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { return genlmsg_multicast_netns(family, &init_net, skb, portid, group, flags); } /** * genlmsg_multicast_allns - multicast a netlink message to all net namespaces * @family: the generic netlink family * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * * This function must hold the RTNL or rcu_read_lock(). */ int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group); /** * genlmsg_unicast - unicast a netlink message * @net: network namespace to look up @portid in * @skb: netlink message as socket buffer * @portid: netlink portid of the destination socket */ static inline int genlmsg_unicast(struct net *net, struct sk_buff *skb, u32 portid) { return nlmsg_unicast(net->genl_sock, skb, portid); } /** * genlmsg_reply - reply to a request * @skb: netlink message to be sent back * @info: receiver information */ static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info) { return genlmsg_unicast(genl_info_net(info), skb, info->snd_portid); } /** * genlmsg_data - head of message payload * @gnlh: genetlink message header */ static inline void *genlmsg_data(const struct genlmsghdr *gnlh) { return ((unsigned char *) gnlh + GENL_HDRLEN); } /** * genlmsg_len - length of message payload * @gnlh: genetlink message header */ static inline int genlmsg_len(const struct genlmsghdr *gnlh) { struct nlmsghdr *nlh = (struct nlmsghdr *)((unsigned char *)gnlh - NLMSG_HDRLEN); return (nlh->nlmsg_len - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_msg_size - length of genetlink message not including padding * @payload: length of message payload */ static inline int genlmsg_msg_size(int payload) { return GENL_HDRLEN + payload; } /** * genlmsg_total_size - length of genetlink message including padding * @payload: length of message payload */ static inline int genlmsg_total_size(int payload) { return NLMSG_ALIGN(genlmsg_msg_size(payload)); } /** * genlmsg_new - Allocate a new generic netlink message * @payload: size of the message payload * @flags: the type of memory to allocate. */ static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags) { return nlmsg_new(genlmsg_total_size(payload), flags); } /** * genl_set_err - report error to genetlink broadcast listeners * @family: the generic netlink family * @net: the network namespace to report the error to * @portid: the PORTID of a process that we want to skip (if any) * @group: the broadcast group that will notice the error * (this is the offset of the multicast group in the groups array) * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the * NETLINK_RECV_NO_ENOBUFS socket option. */ static inline int genl_set_err(const struct genl_family *family, struct net *net, u32 portid, u32 group, int code) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return netlink_set_err(net->genl_sock, portid, group, code); } static inline int genl_has_listeners(const struct genl_family *family, struct net *net, unsigned int group) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return netlink_has_listeners(net->genl_sock, group); } #endif /* __NET_GENERIC_NETLINK_H */
4 61 3 1332 16 160 136 2057 9 5 29 6 26 7 1642 809 835 2347 1362 1019 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/l3mdev.h - L3 master device API * Copyright (c) 2015 Cumulus Networks * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> */ #ifndef _NET_L3MDEV_H_ #define _NET_L3MDEV_H_ #include <net/dst.h> #include <net/fib_rules.h> enum l3mdev_type { L3MDEV_TYPE_UNSPEC, L3MDEV_TYPE_VRF, __L3MDEV_TYPE_MAX }; #define L3MDEV_TYPE_MAX (__L3MDEV_TYPE_MAX - 1) typedef int (*lookup_by_table_id_t)(struct net *net, u32 table_d); /** * struct l3mdev_ops - l3mdev operations * * @l3mdev_fib_table: Get FIB table id to use for lookups * * @l3mdev_l3_rcv: Hook in L3 receive path * * @l3mdev_l3_out: Hook in L3 output path * * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations */ struct l3mdev_ops { u32 (*l3mdev_fib_table)(const struct net_device *dev); struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev, struct sk_buff *skb, u16 proto); struct sk_buff * (*l3mdev_l3_out)(struct net_device *dev, struct sock *sk, struct sk_buff *skb, u16 proto); /* IPv6 ops */ struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, struct flowi6 *fl6); }; #ifdef CONFIG_NET_L3_MASTER_DEV int l3mdev_table_lookup_register(enum l3mdev_type l3type, lookup_by_table_id_t fn); void l3mdev_table_lookup_unregister(enum l3mdev_type l3type, lookup_by_table_id_t fn); int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, u32 table_id); int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg); static inline bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) { return !(fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF) && fl->flowi_l3mdev == iifindex; } static inline bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) { return fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF && fl->flowi_l3mdev == oifindex; } void l3mdev_update_flow(struct net *net, struct flowi *fl); int l3mdev_master_ifindex_rcu(const struct net_device *dev); static inline int l3mdev_master_ifindex(struct net_device *dev) { int ifindex; rcu_read_lock(); ifindex = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); return ifindex; } static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) { struct net_device *dev; int rc = 0; if (ifindex) { rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) rc = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); } return rc; } static inline struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) { /* netdev_master_upper_dev_get_rcu calls * list_first_or_null_rcu to walk the upper dev list. * list_first_or_null_rcu does not handle a const arg. We aren't * making changes, just want the master device from that list so * typecast to remove the const */ struct net_device *dev = (struct net_device *)_dev; struct net_device *master; if (!dev) return NULL; if (netif_is_l3_master(dev)) master = dev; else if (netif_is_l3_slave(dev)) master = netdev_master_upper_dev_get_rcu(dev); else master = NULL; return master; } int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex); static inline int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex) { rcu_read_lock(); ifindex = l3mdev_master_upper_ifindex_by_index_rcu(net, ifindex); rcu_read_unlock(); return ifindex; } u32 l3mdev_fib_table_rcu(const struct net_device *dev); u32 l3mdev_fib_table_by_index(struct net *net, int ifindex); static inline u32 l3mdev_fib_table(const struct net_device *dev) { u32 tb_id; rcu_read_lock(); tb_id = l3mdev_fib_table_rcu(dev); rcu_read_unlock(); return tb_id; } static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { struct net_device *dev; bool rc = false; if (ifindex == 0) return false; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) rc = netif_is_l3_master(dev); rcu_read_unlock(); return rc; } struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); static inline struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto) { struct net_device *master = NULL; if (netif_is_l3_slave(skb->dev)) master = netdev_master_upper_dev_get_rcu(skb->dev); else if (netif_is_l3_master(skb->dev) || netif_has_l3_rx_handler(skb->dev)) master = skb->dev; if (master && master->l3mdev_ops->l3mdev_l3_rcv) skb = master->l3mdev_ops->l3mdev_l3_rcv(master, skb, proto); return skb; } static inline struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { return l3mdev_l3_rcv(skb, AF_INET); } static inline struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) { return l3mdev_l3_rcv(skb, AF_INET6); } static inline struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) { struct net_device *dev = skb_dst(skb)->dev; if (netif_is_l3_slave(dev)) { struct net_device *master; rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); if (master && master->l3mdev_ops->l3mdev_l3_out) skb = master->l3mdev_ops->l3mdev_l3_out(master, sk, skb, proto); rcu_read_unlock(); } return skb; } static inline struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) { return l3mdev_l3_out(sk, skb, AF_INET); } static inline struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) { return l3mdev_l3_out(sk, skb, AF_INET6); } #else static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev) { return 0; } static inline int l3mdev_master_ifindex(struct net_device *dev) { return 0; } static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) { return 0; } static inline int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) { return 0; } static inline int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex) { return 0; } static inline struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) { return NULL; } static inline u32 l3mdev_fib_table_rcu(const struct net_device *dev) { return 0; } static inline u32 l3mdev_fib_table(const struct net_device *dev) { return 0; } static inline u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) { return 0; } static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { return false; } static inline struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { return NULL; } static inline struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { return skb; } static inline struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) { return skb; } static inline struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) { return skb; } static inline struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) { return skb; } static inline int l3mdev_table_lookup_register(enum l3mdev_type l3type, lookup_by_table_id_t fn) { return -EOPNOTSUPP; } static inline void l3mdev_table_lookup_unregister(enum l3mdev_type l3type, lookup_by_table_id_t fn) { } static inline int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, u32 table_id) { return -ENODEV; } static inline int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg) { return 1; } static inline bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) { return false; } static inline bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) { return false; } static inline void l3mdev_update_flow(struct net *net, struct flowi *fl) { } #endif #endif /* _NET_L3MDEV_H_ */
249 249 41 55 607 10 5928 38 96 805 848 645 672 674 672 1690 17672 30 40 673 232 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 /* SPDX-License-Identifier: GPL-2.0 */ /* * Filesystem access notification for Linux * * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #ifndef __LINUX_FSNOTIFY_BACKEND_H #define __LINUX_FSNOTIFY_BACKEND_H #ifdef __KERNEL__ #include <linux/idr.h> /* inotify uses this */ #include <linux/fs.h> /* struct inode */ #include <linux/list.h> #include <linux/path.h> /* struct path */ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/mempool.h> #include <linux/sched/mm.h> /* * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily * convert between them. dnotify only needs conversion at watch creation * so no perf loss there. fanotify isn't defined yet, so it can use the * wholes if it needs more events. */ #define FS_ACCESS 0x00000001 /* File was accessed */ #define FS_MODIFY 0x00000002 /* File was modified */ #define FS_ATTRIB 0x00000004 /* Metadata changed */ #define FS_CLOSE_WRITE 0x00000008 /* Writable file was closed */ #define FS_CLOSE_NOWRITE 0x00000010 /* Unwritable file closed */ #define FS_OPEN 0x00000020 /* File was opened */ #define FS_MOVED_FROM 0x00000040 /* File was moved from X */ #define FS_MOVED_TO 0x00000080 /* File was moved to Y */ #define FS_CREATE 0x00000100 /* Subfile was created */ #define FS_DELETE 0x00000200 /* Subfile was deleted */ #define FS_DELETE_SELF 0x00000400 /* Self was deleted */ #define FS_MOVE_SELF 0x00000800 /* Self was moved */ #define FS_OPEN_EXEC 0x00001000 /* File was opened for exec */ #define FS_UNMOUNT 0x00002000 /* inode on umount fs */ #define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ #define FS_ERROR 0x00008000 /* Filesystem Error (fanotify) */ /* * FS_IN_IGNORED overloads FS_ERROR. It is only used internally by inotify * which does not support FS_ERROR. */ #define FS_IN_IGNORED 0x00008000 /* last inotify event here */ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ /* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ #define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */ #define FS_MNT_ATTACH 0x01000000 /* Mount was attached */ #define FS_MNT_DETACH 0x02000000 /* Mount was detached */ #define FS_MNT_MOVE (FS_MNT_ATTACH | FS_MNT_DETACH) /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. * Set on inode/sb/mount marks that care about parent/name info. */ #define FS_EVENT_ON_CHILD 0x08000000 #define FS_RENAME 0x10000000 /* File was renamed */ #define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */ #define FS_ISDIR 0x40000000 /* event occurred against dir */ #define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO) /* * Directory entry modification events - reported only to directory * where entry is modified and not to a watching parent. * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event * when a directory entry inside a child subdir changes. */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) /* Mount namespace events */ #define FSNOTIFY_MNT_EVENTS (FS_MNT_ATTACH | FS_MNT_DETACH) /* Content events can be used to inspect file content */ #define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \ FS_ACCESS_PERM) /* Pre-content events can be used to fill file content */ #define FSNOTIFY_PRE_CONTENT_EVENTS (FS_PRE_ACCESS) #define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \ FSNOTIFY_PRE_CONTENT_EVENTS) /* * This is a list of all events that may get sent to a parent that is watching * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory. */ #define FS_EVENTS_POSS_ON_CHILD (ALL_FSNOTIFY_PERM_EVENTS | \ FS_ACCESS | FS_MODIFY | FS_ATTRIB | \ FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \ FS_OPEN | FS_OPEN_EXEC) /* * This is a list of all events that may get sent with the parent inode as the * @to_tell argument of fsnotify(). * It may include events that can be sent to an inode/sb/mount mark, but cannot * be sent to a parent watching children. */ #define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD) /* Events that can be reported to backends */ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ FSNOTIFY_MNT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | \ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ FS_ERROR) /* Extra flags that may be reported with event or control handling of events */ #define ALL_FSNOTIFY_FLAGS (FS_ISDIR | FS_EVENT_ON_CHILD | FS_DN_MULTISHOT) #define ALL_FSNOTIFY_BITS (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS) struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark; struct fsnotify_event_private_data; struct fsnotify_fname; struct fsnotify_iter_info; struct mem_cgroup; /* * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. * * handle_event - main call for a group to handle an fs event * @group: group to notify * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @cookie: inotify rename cookie * @iter_info: array of marks from this group that are interested in the event * * handle_inode_event - simple variant of handle_event() for groups that only * have inode marks and don't have ignore mask * @mark: mark to notify * @mask: event type and flags * @inode: inode that event happened on * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to. * Either @inode or @dir must be non-NULL. * @file_name: optional file name associated with event * @cookie: inotify rename cookie * * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group * MUST be holding a reference on each mark and that reference must be * dropped in this function. inotify uses this function to send * userspace messages that marks have been removed. */ struct fsnotify_ops { int (*handle_event)(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info); int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event); /* called on final put+free to free memory */ void (*free_mark)(struct fsnotify_mark *mark); }; /* * all of the information about the original object we want to now send to * a group. If you want to carry more info from the accessing task to the * listener this structure is where you need to be adding fields. */ struct fsnotify_event { struct list_head list; }; /* * fsnotify group priorities. * Events are sent in order from highest priority to lowest priority. */ enum fsnotify_group_prio { FSNOTIFY_PRIO_NORMAL = 0, /* normal notifiers, no permissions */ FSNOTIFY_PRIO_CONTENT, /* fanotify permission events */ FSNOTIFY_PRIO_PRE_CONTENT, /* fanotify pre-content events */ __FSNOTIFY_PRIO_NUM }; /* * A group is a "thing" that wants to receive notification about filesystem * events. The mask holds the subset of event types this group cares about. * refcnt on a group is up to the implementor and at any moment if it goes 0 * everything will be cleaned up. */ struct fsnotify_group { const struct fsnotify_ops *ops; /* how this group handles things */ /* * How the refcnt is used is up to each group. When the refcnt hits 0 * fsnotify will clean up all of the resources associated with this group. * As an example, the dnotify group will always have a refcnt=1 and that * will never change. Inotify, on the other hand, has a group per * inotify_init() and the refcnt will hit 0 only when that fd has been * closed. */ refcount_t refcnt; /* things with interest in this group */ /* needed to send notification to userspace */ spinlock_t notification_lock; /* protect the notification_list */ struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ unsigned int q_len; /* events on the queue */ unsigned int max_events; /* maximum events allowed on the list */ enum fsnotify_group_prio priority; /* priority for sending events */ bool shutdown; /* group is being shut down, don't queue more events */ #define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ #define FSNOTIFY_GROUP_DUPS 0x02 /* allow multiple marks per object */ int flags; unsigned int owner_flags; /* stored flags of mark_mutex owner */ /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ atomic_t user_waits; /* Number of tasks waiting for user * response */ struct list_head marks_list; /* all inode marks for this group */ struct fasync_struct *fsn_fa; /* async notification */ struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ struct mem_cgroup *memcg; /* memcg to charge allocations */ struct user_namespace *user_ns; /* user ns where group was created */ /* groups can define private fields here or use the void *private */ union { void *private; #ifdef CONFIG_INOTIFY_USER struct inotify_group_private_data { spinlock_t idr_lock; struct idr idr; struct ucounts *ucounts; } inotify_data; #endif #ifdef CONFIG_FANOTIFY struct fanotify_group_private_data { /* Hash table of events for merge */ struct hlist_head *merge_hash; /* allows a group to block waiting for a userspace response */ struct list_head access_list; wait_queue_head_t access_waitq; int flags; /* flags from fanotify_init() */ int f_flags; /* event_f_flags from fanotify_init() */ struct ucounts *ucounts; mempool_t error_events_pool; /* chained on perm_group_list */ struct list_head perm_grp_list; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; }; /* * These helpers are used to prevent deadlock when reclaiming inodes with * evictable marks of the same group that is allocating a new mark. */ static inline void fsnotify_group_lock(struct fsnotify_group *group) { mutex_lock(&group->mark_mutex); group->owner_flags = memalloc_nofs_save(); } static inline void fsnotify_group_unlock(struct fsnotify_group *group) { memalloc_nofs_restore(group->owner_flags); mutex_unlock(&group->mark_mutex); } static inline void fsnotify_group_assert_locked(struct fsnotify_group *group) { WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, FSNOTIFY_EVENT_FILE_RANGE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, FSNOTIFY_EVENT_MNT, FSNOTIFY_EVENT_ERROR, }; struct fs_error_report { int error; struct inode *inode; struct super_block *sb; }; struct file_range { const struct path *path; loff_t pos; size_t count; }; static inline const struct path *file_range_path(const struct file_range *range) { return range->path; } struct fsnotify_mnt { const struct mnt_namespace *ns; u64 mnt_id; }; static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return (struct inode *)data; case FSNOTIFY_EVENT_DENTRY: return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); case FSNOTIFY_EVENT_FILE_RANGE: return d_inode(file_range_path(data)->dentry); case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *)data)->inode; default: return NULL; } } static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_DENTRY: /* Non const is needed for dget() */ return (struct dentry *)data; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry; case FSNOTIFY_EVENT_FILE_RANGE: return file_range_path(data)->dentry; default: return NULL; } } static inline const struct path *fsnotify_data_path(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_PATH: return data; case FSNOTIFY_EVENT_FILE_RANGE: return file_range_path(data); default: return NULL; } } static inline struct super_block *fsnotify_data_sb(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return ((struct inode *)data)->i_sb; case FSNOTIFY_EVENT_DENTRY: return ((struct dentry *)data)->d_sb; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry->d_sb; case FSNOTIFY_EVENT_FILE_RANGE: return file_range_path(data)->dentry->d_sb; case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *) data)->sb; default: return NULL; } } static inline const struct fsnotify_mnt *fsnotify_data_mnt(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_MNT: return data; default: return NULL; } } static inline u64 fsnotify_data_mnt_id(const void *data, int data_type) { const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); return mnt_data ? mnt_data->mnt_id : 0; } static inline struct fs_error_report *fsnotify_data_error_report( const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_ERROR: return (struct fs_error_report *) data; default: return NULL; } } static inline const struct file_range *fsnotify_data_file_range( const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_FILE_RANGE: return (struct file_range *)data; default: return NULL; } } /* * Index to merged marks iterator array that correlates to a type of watch. * The type of watched object can be deduced from the iterator type, but not * the other way around, because an event can match different watched objects * of the same object type. * For example, both parent and child are watching an object of type inode. */ enum fsnotify_iter_type { FSNOTIFY_ITER_TYPE_INODE, FSNOTIFY_ITER_TYPE_VFSMOUNT, FSNOTIFY_ITER_TYPE_SB, FSNOTIFY_ITER_TYPE_PARENT, FSNOTIFY_ITER_TYPE_INODE2, FSNOTIFY_ITER_TYPE_MNTNS, FSNOTIFY_ITER_TYPE_COUNT }; /* The type of object that a mark is attached to */ enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_ANY = -1, FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, FSNOTIFY_OBJ_TYPE_MNTNS, FSNOTIFY_OBJ_TYPE_COUNT, FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT }; static inline bool fsnotify_valid_obj_type(unsigned int obj_type) { return (obj_type < FSNOTIFY_OBJ_TYPE_COUNT); } struct fsnotify_iter_info { struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT]; struct fsnotify_group *current_group; unsigned int report_mask; int srcu_idx; }; static inline bool fsnotify_iter_should_report_type( struct fsnotify_iter_info *iter_info, int iter_type) { return (iter_info->report_mask & (1U << iter_type)); } static inline void fsnotify_iter_set_report_type( struct fsnotify_iter_info *iter_info, int iter_type) { iter_info->report_mask |= (1U << iter_type); } static inline struct fsnotify_mark *fsnotify_iter_mark( struct fsnotify_iter_info *iter_info, int iter_type) { if (fsnotify_iter_should_report_type(iter_info, iter_type)) return iter_info->marks[iter_type]; return NULL; } static inline int fsnotify_iter_step(struct fsnotify_iter_info *iter, int type, struct fsnotify_mark **markp) { while (type < FSNOTIFY_ITER_TYPE_COUNT) { *markp = fsnotify_iter_mark(iter, type); if (*markp) break; type++; } return type; } #define FSNOTIFY_ITER_FUNCS(name, NAME) \ static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \ struct fsnotify_iter_info *iter_info) \ { \ return fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_##NAME); \ } FSNOTIFY_ITER_FUNCS(inode, INODE) FSNOTIFY_ITER_FUNCS(parent, PARENT) FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT) FSNOTIFY_ITER_FUNCS(sb, SB) #define fsnotify_foreach_iter_type(type) \ for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++) #define fsnotify_foreach_iter_mark_type(iter, mark, type) \ for (type = 0; \ type = fsnotify_iter_step(iter, type, &mark), \ type < FSNOTIFY_ITER_TYPE_COUNT; \ type++) /* * Inode/vfsmount/sb point to this structure which tracks all marks attached to * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this * structure. We destroy this structure when there are no more marks attached * to it. The structure is protected by fsnotify_mark_srcu. */ struct fsnotify_mark_connector { spinlock_t lock; unsigned char type; /* Type of object [lock] */ unsigned char prio; /* Highest priority group */ #define FSNOTIFY_CONN_FLAG_IS_WATCHED 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ union { /* Object pointer [lock] */ void *obj; /* Used listing heads to free after srcu period expires */ struct fsnotify_mark_connector *destroy_next; }; struct hlist_head list; }; /* * Container for per-sb fsnotify state (sb marks and more). * Attached lazily on first marked object on the sb and freed when killing sb. */ struct fsnotify_sb_info { struct fsnotify_mark_connector __rcu *sb_marks; /* * Number of inode/mount/sb objects that are being watched in this sb. * Note that inodes objects are currently double-accounted. * * The value in watched_objects[prio] is the number of objects that are * watched by groups of priority >= prio, so watched_objects[0] is the * total number of watched objects in this sb. */ atomic_long_t watched_objects[__FSNOTIFY_PRIO_NUM]; }; static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) { #ifdef CONFIG_FSNOTIFY return READ_ONCE(sb->s_fsnotify_info); #else return NULL; #endif } static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb) { return &fsnotify_sb_info(sb)->watched_objects[0]; } /* * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events * of a type matching mask or only interested in those events. * * These are flushed when an inode is evicted from core and may be flushed * when the inode is modified (as seen by fsnotify_access). Some fsnotify * users (such as dnotify) will flush these when the open fd is closed and not * at inode eviction or modification. * * Text in brackets is showing the lock(s) protecting modifications of a * particular entry. obj_lock means either inode->i_lock or * mnt->mnt_root->d_lock depending on the mark type. */ struct fsnotify_mark { /* Mask this mark is for [mark->lock, group->mark_mutex] */ __u32 mask; /* We hold one for presence in g_list. Also one ref for each 'thing' * in kernel that found and may be using this mark. */ refcount_t refcnt; /* Group this mark is for. Set on mark creation, stable until last ref * is dropped */ struct fsnotify_group *group; /* List of marks by group->marks_list. Also reused for queueing * mark into destroy_list when it's waiting for the end of SRCU period * before it can be freed. [group->mark_mutex] */ struct list_head g_list; /* Protects inode / mnt pointers, flags, masks */ spinlock_t lock; /* List of marks for inode / vfsmount [connector->lock, mark ref] */ struct hlist_node obj_list; /* Head of list of marks for an object [mark ref] */ struct fsnotify_mark_connector *connector; /* Events types and flags to ignore [mark->lock, group->mark_mutex] */ __u32 ignore_mask; /* General fsnotify mark flags */ #define FSNOTIFY_MARK_FLAG_ALIVE 0x0001 #define FSNOTIFY_MARK_FLAG_ATTACHED 0x0002 /* inotify mark flags */ #define FSNOTIFY_MARK_FLAG_EXCL_UNLINK 0x0010 #define FSNOTIFY_MARK_FLAG_IN_ONESHOT 0x0020 /* fanotify mark flags */ #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 #define FSNOTIFY_MARK_FLAG_HAS_FSID 0x0800 #define FSNOTIFY_MARK_FLAG_WEAK_FSID 0x1000 unsigned int flags; /* flags [mark->lock] */ }; #ifdef CONFIG_FSNOTIFY /* called from the vfs helpers */ /* main fsnotify call to send events */ extern int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie); extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type); extern void __fsnotify_inode_delete(struct inode *inode); extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt); extern void fsnotify_sb_delete(struct super_block *sb); extern void __fsnotify_mntns_delete(struct mnt_namespace *mntns); extern void fsnotify_sb_free(struct super_block *sb); extern u32 fsnotify_get_cookie(void); extern void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt); static inline __u32 fsnotify_parent_needed_mask(__u32 mask) { /* FS_EVENT_ON_CHILD is set on marks that want parent/name info */ if (!(mask & FS_EVENT_ON_CHILD)) return 0; /* * This object might be watched by a mark that cares about parent/name * info, does it care about the specific set of events that can be * reported with parent/name info? */ return mask & FS_EVENTS_POSS_TO_PARENT; } static inline int fsnotify_inode_watches_children(struct inode *inode) { __u32 parent_mask = READ_ONCE(inode->i_fsnotify_mask); /* FS_EVENT_ON_CHILD is set if the inode may care */ if (!(parent_mask & FS_EVENT_ON_CHILD)) return 0; /* this inode might care about child events, does it care about the * specific set of events that can happen on a child? */ return parent_mask & FS_EVENTS_POSS_ON_CHILD; } /* * Update the dentry with a flag indicating the interest of its parent to receive * filesystem events when those events happens to this dentry->d_inode. */ static inline void fsnotify_update_flags(struct dentry *dentry) { assert_spin_locked(&dentry->d_lock); /* * Serialisation of setting PARENT_WATCHED on the dentries is provided * by d_lock. If inotify_inode_watched changes after we have taken * d_lock, the following fsnotify_set_children_dentry_flags call will * find our entry, so it will spin until we complete here, and update * us with the new state. */ if (fsnotify_inode_watches_children(dentry->d_parent->d_inode)) dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; else dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; } /* called from fsnotify listeners, such as fanotify or dnotify */ /* create a new group */ extern struct fsnotify_group *fsnotify_alloc_group( const struct fsnotify_ops *ops, int flags); /* get reference to a group */ extern void fsnotify_get_group(struct fsnotify_group *group); /* drop reference on a group from fsnotify_alloc_group */ extern void fsnotify_put_group(struct fsnotify_group *group); /* group destruction begins, stop queuing new events */ extern void fsnotify_group_stop_queueing(struct fsnotify_group *group); /* destroy group */ extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ extern int fsnotify_fasync(int fd, struct file *file, int on); /* Free event from memory */ extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ extern int fsnotify_insert_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *), void (*insert)(struct fsnotify_group *, struct fsnotify_event *)); static inline int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *)) { return fsnotify_insert_event(group, event, merge, NULL); } /* Queue overflow event to a notification group */ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) { fsnotify_add_event(group, group->overflow_event, NULL); } static inline bool fsnotify_is_overflow_event(u32 mask) { return mask & FS_Q_OVERFLOW; } static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) { assert_spin_locked(&group->notification_lock); return list_empty(&group->notification_list); } extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group); /* return AND dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group); /* Remove event queued in the notification list */ extern void fsnotify_remove_queued_event(struct fsnotify_group *group, struct fsnotify_event *event); /* functions used to manipulate the marks attached to inodes */ /* * Canonical "ignore mask" including event flags. * * Note the subtle semantic difference from the legacy ->ignored_mask. * ->ignored_mask traditionally only meant which events should be ignored, * while ->ignore_mask also includes flags regarding the type of objects on * which events should be ignored. */ static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark) { __u32 ignore_mask = mark->ignore_mask; /* The event flags in ignore mask take effect */ if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) return ignore_mask; /* * Legacy behavior: * - Always ignore events on dir * - Ignore events on child if parent is watching children */ ignore_mask |= FS_ISDIR; ignore_mask &= ~FS_EVENT_ON_CHILD; ignore_mask |= mark->mask & FS_EVENT_ON_CHILD; return ignore_mask; } /* Legacy ignored_mask - only event types to ignore */ static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark) { return mark->ignore_mask & ALL_FSNOTIFY_EVENTS; } /* * Check if mask (or ignore mask) should be applied depending if victim is a * directory and whether it is reported to a watching parent. */ static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir, int iter_type) { /* Should mask be applied to a directory? */ if (is_dir && !(mask & FS_ISDIR)) return false; /* Should mask be applied to a child? */ if (iter_type == FSNOTIFY_ITER_TYPE_PARENT && !(mask & FS_EVENT_ON_CHILD)) return false; return true; } /* * Effective ignore mask taking into account if event victim is a * directory and whether it is reported to a watching parent. */ static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark, bool is_dir, int iter_type) { __u32 ignore_mask = fsnotify_ignored_events(mark); if (!ignore_mask) return 0; /* For non-dir and non-child, no need to consult the event flags */ if (!is_dir && iter_type != FSNOTIFY_ITER_TYPE_PARENT) return ignore_mask; ignore_mask = fsnotify_ignore_mask(mark); if (!fsnotify_mask_applicable(ignore_mask, is_dir, iter_type)) return 0; return ignore_mask & ALL_FSNOTIFY_EVENTS; } /* Get mask for calculating object interest taking ignore mask into account */ static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark) { __u32 mask = mark->mask; if (!fsnotify_ignored_events(mark)) return mask; /* Interest in FS_MODIFY may be needed for clearing ignore mask */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mask |= FS_MODIFY; /* * If mark is interested in ignoring events on children, the object must * show interest in those events for fsnotify_parent() to notice it. */ return mask | mark->ignore_mask; } /* Get mask of events for a list of marks */ extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn); /* Calculate mask of events for a list of marks */ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); extern void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* Find mark belonging to given group in the list of marks */ struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, struct fsnotify_group *group); /* attach the mark to the object */ int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags); int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, add_flags); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark_locked(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, add_flags); } static inline struct fsnotify_mark *fsnotify_find_inode_mark( struct inode *inode, struct fsnotify_group *group) { return fsnotify_find_mark(inode, FSNOTIFY_OBJ_TYPE_INODE, group); } /* given a group and a mark, flag mark to be freed when all references are dropped */ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* detach mark from inode / mount list, group list, drop inode reference */ extern void fsnotify_detach_mark(struct fsnotify_mark *mark); /* free mark */ extern void fsnotify_free_mark(struct fsnotify_mark *mark); /* Wait until all marks queued for destruction are destroyed */ extern void fsnotify_wait_marks_destroyed(void); /* Clear all of the marks of a group attached to a given object type */ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type); extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); static inline void fsnotify_init_event(struct fsnotify_event *event) { INIT_LIST_HEAD(&event->list); } int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count); #else static inline int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count) { return 0; } static inline int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie) { return 0; } static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { return 0; } static inline void __fsnotify_inode_delete(struct inode *inode) {} static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt) {} static inline void fsnotify_sb_delete(struct super_block *sb) {} static inline void __fsnotify_mntns_delete(struct mnt_namespace *mntns) {} static inline void fsnotify_sb_free(struct super_block *sb) {} static inline void fsnotify_update_flags(struct dentry *dentry) {} static inline u32 fsnotify_get_cookie(void) { return 0; } static inline void fsnotify_unmount_inodes(struct super_block *sb) {} static inline void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) {} #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ #endif /* __LINUX_FSNOTIFY_BACKEND_H */
86 38 61 50 76 41 53 53 53 45 45 17 44 144 78 69 16 6 53 40 52 69 95 50 45 25 45 55 53 2 80 24 56 55 56 97 83 30 97 414 467 14 2 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET3: Garbage Collector For AF_UNIX sockets * * Garbage Collector: * Copyright (C) Barak A. Pearlmutter. * * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem. * If it doesn't work blame me, it worked when Barak sent it. * * Assumptions: * * - object w/ a bit * - free list * * Current optimizations: * * - explicit stack instead of recursion * - tail recurse on first born instead of immediate push/pop * - we gather the stuff that should not be killed into tree * and stack is just a path from root to the current pointer. * * Future optimizations: * * - don't just push entire root set; process in place * * Fixes: * Alan Cox 07 Sept 1997 Vmalloc internal stack as needed. * Cope with changing max_files. * Al Viro 11 Oct 1998 * Graph may have cycles. That is, we can send the descriptor * of foo to bar and vice versa. Current code chokes on that. * Fix: move SCM_RIGHTS ones into the separate list and then * skb_free() them all instead of doing explicit fput's. * Another problem: since fput() may block somebody may * create a new unix_socket when we are in the middle of sweep * phase. Fix: revert the logic wrt MARKED. Mark everything * upon the beginning and unmark non-junk ones. * * [12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS * sent to connect()'ed but still not accept()'ed sockets. * Fixed. Old code had slightly different problem here: * extra fput() in situation when we passed the descriptor via * such socket and closed it (descriptor). That would happen on * each unix_gc() until the accept(). Since the struct file in * question would go to the free list and might be reused... * That might be the reason of random oopses on filp_close() * in unrelated processes. * * AV 28 Feb 1999 * Kill the explicit allocation of stack. Now we keep the tree * with root in dummy + pointer (gc_current) to one of the nodes. * Stack is represented as path from gc_current to dummy. Unmark * now means "add to tree". Push == "make it a son of gc_current". * Pop == "move gc_current to parent". We keep only pointers to * parents (->gc_tree). * AV 1 Mar 1999 * Damn. Added missing check for ->dead in listen queues scanning. * * Miklos Szeredi 25 Jun 2007 * Reimplement with a cycle collecting algorithm. This should * solve several problems with the previous code, like being racy * wrt receive and holding up unrelated socket operations. */ #include <linux/fs.h> #include <linux/list.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/workqueue.h> #include <net/af_unix.h> #include <net/scm.h> #include <net/tcp_states.h> #include "af_unix.h" struct unix_vertex { struct list_head edges; struct list_head entry; struct list_head scc_entry; unsigned long out_degree; unsigned long index; unsigned long scc_index; }; struct unix_edge { struct unix_sock *predecessor; struct unix_sock *successor; struct list_head vertex_entry; struct list_head stack_entry; }; struct unix_sock *unix_get_socket(struct file *filp) { struct inode *inode = file_inode(filp); /* Socket ? */ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { struct socket *sock = SOCKET_I(inode); const struct proto_ops *ops; struct sock *sk = sock->sk; ops = READ_ONCE(sock->ops); /* PF_UNIX ? */ if (sk && ops && ops->family == PF_UNIX) return unix_sk(sk); } return NULL; } static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) { /* If an embryo socket has a fd, * the listener indirectly holds the fd's refcnt. */ if (edge->successor->listener) return unix_sk(edge->successor->listener)->vertex; return edge->successor->vertex; } enum { UNIX_GRAPH_NOT_CYCLIC, UNIX_GRAPH_MAYBE_CYCLIC, UNIX_GRAPH_CYCLIC, }; static unsigned char unix_graph_state; static void unix_update_graph(struct unix_vertex *vertex) { /* If the receiver socket is not inflight, no cyclic * reference could be formed. */ if (!vertex) return; WRITE_ONCE(unix_graph_state, UNIX_GRAPH_MAYBE_CYCLIC); } static LIST_HEAD(unix_unvisited_vertices); enum unix_vertex_index { UNIX_VERTEX_INDEX_MARK1, UNIX_VERTEX_INDEX_MARK2, UNIX_VERTEX_INDEX_START, }; static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; static unsigned long unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START; static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; if (!vertex) { vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); vertex->index = unix_vertex_unvisited_index; vertex->scc_index = ++unix_vertex_max_scc_index; vertex->out_degree = 0; INIT_LIST_HEAD(&vertex->edges); INIT_LIST_HEAD(&vertex->scc_entry); list_move_tail(&vertex->entry, &unix_unvisited_vertices); edge->predecessor->vertex = vertex; } vertex->out_degree++; list_add_tail(&edge->vertex_entry, &vertex->edges); unix_update_graph(unix_edge_successor(edge)); } static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; if (!fpl->dead) unix_update_graph(unix_edge_successor(edge)); list_del(&edge->vertex_entry); vertex->out_degree--; if (!vertex->out_degree) { edge->predecessor->vertex = NULL; list_move_tail(&vertex->entry, &fpl->vertices); } } static void unix_free_vertices(struct scm_fp_list *fpl) { struct unix_vertex *vertex, *next_vertex; list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) { list_del(&vertex->entry); kfree(vertex); } } static __cacheline_aligned_in_smp DEFINE_SPINLOCK(unix_gc_lock); void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) { int i = 0, j = 0; spin_lock(&unix_gc_lock); if (!fpl->count_unix) goto out; do { struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]); struct unix_edge *edge; if (!inflight) continue; edge = fpl->edges + i++; edge->predecessor = inflight; edge->successor = receiver; unix_add_edge(fpl, edge); } while (i < fpl->count_unix); receiver->scm_stat.nr_unix_fds += fpl->count_unix; out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); spin_unlock(&unix_gc_lock); fpl->inflight = true; unix_free_vertices(fpl); } void unix_del_edges(struct scm_fp_list *fpl) { struct unix_sock *receiver; int i = 0; spin_lock(&unix_gc_lock); if (!fpl->count_unix) goto out; do { struct unix_edge *edge = fpl->edges + i++; unix_del_edge(fpl, edge); } while (i < fpl->count_unix); if (!fpl->dead) { receiver = fpl->edges[0].successor; receiver->scm_stat.nr_unix_fds -= fpl->count_unix; } out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); spin_unlock(&unix_gc_lock); fpl->inflight = false; } void unix_update_edges(struct unix_sock *receiver) { /* nr_unix_fds is only updated under unix_state_lock(). * If it's 0 here, the embryo socket is not part of the * inflight graph, and GC will not see it, so no lock needed. */ if (!receiver->scm_stat.nr_unix_fds) { receiver->listener = NULL; } else { spin_lock(&unix_gc_lock); unix_update_graph(unix_sk(receiver->listener)->vertex); receiver->listener = NULL; spin_unlock(&unix_gc_lock); } } int unix_prepare_fpl(struct scm_fp_list *fpl) { struct unix_vertex *vertex; int i; if (!fpl->count_unix) return 0; for (i = 0; i < fpl->count_unix; i++) { vertex = kmalloc(sizeof(*vertex), GFP_KERNEL); if (!vertex) goto err; list_add(&vertex->entry, &fpl->vertices); } fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges), GFP_KERNEL_ACCOUNT); if (!fpl->edges) goto err; unix_schedule_gc(fpl->user); return 0; err: unix_free_vertices(fpl); return -ENOMEM; } void unix_destroy_fpl(struct scm_fp_list *fpl) { if (fpl->inflight) unix_del_edges(fpl); kvfree(fpl->edges); unix_free_vertices(fpl); } static bool unix_vertex_dead(struct unix_vertex *vertex) { struct unix_edge *edge; struct unix_sock *u; long total_ref; list_for_each_entry(edge, &vertex->edges, vertex_entry) { struct unix_vertex *next_vertex = unix_edge_successor(edge); /* The vertex's fd can be received by a non-inflight socket. */ if (!next_vertex) return false; /* The vertex's fd can be received by an inflight socket in * another SCC. */ if (next_vertex->scc_index != vertex->scc_index) return false; } /* No receiver exists out of the same SCC. */ edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); u = edge->predecessor; total_ref = file_count(u->sk.sk_socket->file); /* If not close()d, total_ref > out_degree. */ if (total_ref != vertex->out_degree) return false; return true; } static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) { struct unix_vertex *vertex; list_for_each_entry_reverse(vertex, scc, scc_entry) { struct sk_buff_head *queue; struct unix_edge *edge; struct unix_sock *u; edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); u = edge->predecessor; queue = &u->sk.sk_receive_queue; spin_lock(&queue->lock); if (u->sk.sk_state == TCP_LISTEN) { struct sk_buff *skb; skb_queue_walk(queue, skb) { struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; spin_lock(&embryo_queue->lock); skb_queue_splice_init(embryo_queue, hitlist); spin_unlock(&embryo_queue->lock); } } else { skb_queue_splice_init(queue, hitlist); } spin_unlock(&queue->lock); } } static bool unix_scc_cyclic(struct list_head *scc) { struct unix_vertex *vertex; struct unix_edge *edge; /* SCC containing multiple vertices ? */ if (!list_is_singular(scc)) return true; vertex = list_first_entry(scc, typeof(*vertex), scc_entry); /* Self-reference or a embryo-listener circle ? */ list_for_each_entry(edge, &vertex->edges, vertex_entry) { if (unix_edge_successor(edge) == vertex) return true; } return false; } static LIST_HEAD(unix_visited_vertices); static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; static unsigned long __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, struct sk_buff_head *hitlist) { unsigned long cyclic_sccs = 0; LIST_HEAD(vertex_stack); struct unix_edge *edge; LIST_HEAD(edge_stack); next_vertex: /* Push vertex to vertex_stack and mark it as on-stack * (index >= UNIX_VERTEX_INDEX_START). * The vertex will be popped when finalising SCC later. */ list_add(&vertex->scc_entry, &vertex_stack); vertex->index = *last_index; vertex->scc_index = *last_index; (*last_index)++; /* Explore neighbour vertices (receivers of the current vertex's fd). */ list_for_each_entry(edge, &vertex->edges, vertex_entry) { struct unix_vertex *next_vertex = unix_edge_successor(edge); if (!next_vertex) continue; if (next_vertex->index == unix_vertex_unvisited_index) { /* Iterative deepening depth first search * * 1. Push a forward edge to edge_stack and set * the successor to vertex for the next iteration. */ list_add(&edge->stack_entry, &edge_stack); vertex = next_vertex; goto next_vertex; /* 2. Pop the edge directed to the current vertex * and restore the ancestor for backtracking. */ prev_vertex: edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); list_del_init(&edge->stack_entry); next_vertex = vertex; vertex = edge->predecessor->vertex; /* If the successor has a smaller scc_index, two vertices * are in the same SCC, so propagate the smaller scc_index * to skip SCC finalisation. */ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); } else if (next_vertex->index != unix_vertex_grouped_index) { /* Loop detected by a back/cross edge. * * The successor is on vertex_stack, so two vertices are in * the same SCC. If the successor has a smaller *scc_index*, * propagate it to skip SCC finalisation. */ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); } else { /* The successor was already grouped as another SCC */ } } if (vertex->index == vertex->scc_index) { struct unix_vertex *v; struct list_head scc; bool scc_dead = true; /* SCC finalised. * * If the scc_index was not updated, all the vertices above on * vertex_stack are in the same SCC. Group them using scc_entry. */ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); list_for_each_entry_reverse(v, &scc, scc_entry) { /* Don't restart DFS from this vertex in unix_walk_scc(). */ list_move_tail(&v->entry, &unix_visited_vertices); /* Mark vertex as off-stack. */ v->index = unix_vertex_grouped_index; if (scc_dead) scc_dead = unix_vertex_dead(v); } if (scc_dead) { unix_collect_skb(&scc, hitlist); } else { if (unix_vertex_max_scc_index < vertex->scc_index) unix_vertex_max_scc_index = vertex->scc_index; if (unix_scc_cyclic(&scc)) cyclic_sccs++; } list_del(&scc); } /* Need backtracking ? */ if (!list_empty(&edge_stack)) goto prev_vertex; return cyclic_sccs; } static unsigned long unix_graph_cyclic_sccs; static void unix_walk_scc(struct sk_buff_head *hitlist) { unsigned long last_index = UNIX_VERTEX_INDEX_START; unsigned long cyclic_sccs = 0; unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START; /* Visit every vertex exactly once. * __unix_walk_scc() moves visited vertices to unix_visited_vertices. */ while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); cyclic_sccs += __unix_walk_scc(vertex, &last_index, hitlist); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); WRITE_ONCE(unix_graph_cyclic_sccs, cyclic_sccs); WRITE_ONCE(unix_graph_state, cyclic_sccs ? UNIX_GRAPH_CYCLIC : UNIX_GRAPH_NOT_CYCLIC); } static void unix_walk_scc_fast(struct sk_buff_head *hitlist) { unsigned long cyclic_sccs = unix_graph_cyclic_sccs; while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; struct list_head scc; bool scc_dead = true; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); list_add(&scc, &vertex->scc_entry); list_for_each_entry_reverse(vertex, &scc, scc_entry) { list_move_tail(&vertex->entry, &unix_visited_vertices); if (scc_dead) scc_dead = unix_vertex_dead(vertex); } if (scc_dead) { cyclic_sccs--; unix_collect_skb(&scc, hitlist); } list_del(&scc); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); WRITE_ONCE(unix_graph_cyclic_sccs, cyclic_sccs); WRITE_ONCE(unix_graph_state, cyclic_sccs ? UNIX_GRAPH_CYCLIC : UNIX_GRAPH_NOT_CYCLIC); } static bool gc_in_progress; static void unix_gc(struct work_struct *work) { struct sk_buff_head hitlist; struct sk_buff *skb; spin_lock(&unix_gc_lock); if (unix_graph_state == UNIX_GRAPH_NOT_CYCLIC) { spin_unlock(&unix_gc_lock); goto skip_gc; } __skb_queue_head_init(&hitlist); if (unix_graph_state == UNIX_GRAPH_CYCLIC) unix_walk_scc_fast(&hitlist); else unix_walk_scc(&hitlist); spin_unlock(&unix_gc_lock); skb_queue_walk(&hitlist, skb) { if (UNIXCB(skb).fp) UNIXCB(skb).fp->dead = true; } __skb_queue_purge_reason(&hitlist, SKB_DROP_REASON_SOCKET_CLOSE); skip_gc: WRITE_ONCE(gc_in_progress, false); } static DECLARE_WORK(unix_gc_work, unix_gc); #define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) void unix_schedule_gc(struct user_struct *user) { if (READ_ONCE(unix_graph_state) == UNIX_GRAPH_NOT_CYCLIC) return; /* Penalise users who want to send AF_UNIX sockets * but whose sockets have not been received yet. */ if (user && READ_ONCE(user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) return; if (!READ_ONCE(gc_in_progress)) { WRITE_ONCE(gc_in_progress, true); queue_work(system_dfl_wq, &unix_gc_work); } if (user && READ_ONCE(unix_graph_cyclic_sccs)) flush_work(&unix_gc_work); }
122 122 122 122 122 122 122 122 122 122 122 122 122 122 121 122 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 // SPDX-License-Identifier: GPL-2.0-or-later /* Verify the signature on a PKCS#7 message. * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "PKCS7: "fmt #include <linux/kernel.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/asn1.h> #include <crypto/hash.h> #include <crypto/hash_info.h> #include <crypto/public_key.h> #include "pkcs7_parser.h" /* * Digest the relevant parts of the PKCS#7 data */ static int pkcs7_digest(struct pkcs7_message *pkcs7, struct pkcs7_signed_info *sinfo) { struct public_key_signature *sig = sinfo->sig; struct crypto_shash *tfm; struct shash_desc *desc; size_t desc_size; int ret; kenter(",%u,%s", sinfo->index, sinfo->sig->hash_algo); /* The digest was calculated already. */ if (sig->digest) return 0; if (!sinfo->sig->hash_algo) return -ENOPKG; /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. */ tfm = crypto_alloc_shash(sinfo->sig->hash_algo, 0, 0); if (IS_ERR(tfm)) return (PTR_ERR(tfm) == -ENOENT) ? -ENOPKG : PTR_ERR(tfm); desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); sig->digest_size = crypto_shash_digestsize(tfm); ret = -ENOMEM; sig->digest = kmalloc(sig->digest_size, GFP_KERNEL); if (!sig->digest) goto error_no_desc; desc = kzalloc(desc_size, GFP_KERNEL); if (!desc) goto error_no_desc; desc->tfm = tfm; /* Digest the message [RFC2315 9.3] */ ret = crypto_shash_digest(desc, pkcs7->data, pkcs7->data_len, sig->digest); if (ret < 0) goto error; pr_devel("MsgDigest = [%*ph]\n", 8, sig->digest); /* However, if there are authenticated attributes, there must be a * message digest attribute amongst them which corresponds to the * digest we just calculated. */ if (sinfo->authattrs) { u8 tag; if (!sinfo->msgdigest) { pr_warn("Sig %u: No messageDigest\n", sinfo->index); ret = -EKEYREJECTED; goto error; } if (sinfo->msgdigest_len != sig->digest_size) { pr_warn("Sig %u: Invalid digest size (%u)\n", sinfo->index, sinfo->msgdigest_len); ret = -EBADMSG; goto error; } if (memcmp(sig->digest, sinfo->msgdigest, sinfo->msgdigest_len) != 0) { pr_warn("Sig %u: Message digest doesn't match\n", sinfo->index); ret = -EKEYREJECTED; goto error; } /* We then calculate anew, using the authenticated attributes * as the contents of the digest instead. Note that we need to * convert the attributes from a CONT.0 into a SET before we * hash it. */ memset(sig->digest, 0, sig->digest_size); ret = crypto_shash_init(desc); if (ret < 0) goto error; tag = ASN1_CONS_BIT | ASN1_SET; ret = crypto_shash_update(desc, &tag, 1); if (ret < 0) goto error; ret = crypto_shash_finup(desc, sinfo->authattrs, sinfo->authattrs_len, sig->digest); if (ret < 0) goto error; pr_devel("AADigest = [%*ph]\n", 8, sig->digest); } error: kfree(desc); error_no_desc: crypto_free_shash(tfm); kleave(" = %d", ret); return ret; } int pkcs7_get_digest(struct pkcs7_message *pkcs7, const u8 **buf, u32 *len, enum hash_algo *hash_algo) { struct pkcs7_signed_info *sinfo = pkcs7->signed_infos; int i, ret; /* * This function doesn't support messages with more than one signature. */ if (sinfo == NULL || sinfo->next != NULL) return -EBADMSG; ret = pkcs7_digest(pkcs7, sinfo); if (ret) return ret; *buf = sinfo->sig->digest; *len = sinfo->sig->digest_size; i = match_string(hash_algo_name, HASH_ALGO__LAST, sinfo->sig->hash_algo); if (i >= 0) *hash_algo = i; return 0; } /* * Find the key (X.509 certificate) to use to verify a PKCS#7 message. PKCS#7 * uses the issuer's name and the issuing certificate serial number for * matching purposes. These must match the certificate issuer's name (not * subject's name) and the certificate serial number [RFC 2315 6.7]. */ static int pkcs7_find_key(struct pkcs7_message *pkcs7, struct pkcs7_signed_info *sinfo) { struct x509_certificate *x509; unsigned certix = 1; kenter("%u", sinfo->index); for (x509 = pkcs7->certs; x509; x509 = x509->next, certix++) { /* I'm _assuming_ that the generator of the PKCS#7 message will * encode the fields from the X.509 cert in the same way in the * PKCS#7 message - but I can't be 100% sure of that. It's * possible this will need element-by-element comparison. */ if (!asymmetric_key_id_same(x509->id, sinfo->sig->auth_ids[0])) continue; pr_devel("Sig %u: Found cert serial match X.509[%u]\n", sinfo->index, certix); sinfo->signer = x509; return 0; } /* The relevant X.509 cert isn't found here, but it might be found in * the trust keyring. */ pr_debug("Sig %u: Issuing X.509 cert not found (#%*phN)\n", sinfo->index, sinfo->sig->auth_ids[0]->len, sinfo->sig->auth_ids[0]->data); return 0; } /* * Verify the internal certificate chain as best we can. */ static int pkcs7_verify_sig_chain(struct pkcs7_message *pkcs7, struct pkcs7_signed_info *sinfo) { struct public_key_signature *sig; struct x509_certificate *x509 = sinfo->signer, *p; struct asymmetric_key_id *auth; int ret; kenter(""); for (p = pkcs7->certs; p; p = p->next) p->seen = false; for (;;) { pr_debug("verify %s: %*phN\n", x509->subject, x509->raw_serial_size, x509->raw_serial); x509->seen = true; if (x509->blacklisted) { /* If this cert is blacklisted, then mark everything * that depends on this as blacklisted too. */ sinfo->blacklisted = true; for (p = sinfo->signer; p != x509; p = p->signer) p->blacklisted = true; pr_debug("- blacklisted\n"); return 0; } pr_debug("- issuer %s\n", x509->issuer); sig = x509->sig; if (sig->auth_ids[0]) pr_debug("- authkeyid.id %*phN\n", sig->auth_ids[0]->len, sig->auth_ids[0]->data); if (sig->auth_ids[1]) pr_debug("- authkeyid.skid %*phN\n", sig->auth_ids[1]->len, sig->auth_ids[1]->data); if (x509->self_signed) { /* If there's no authority certificate specified, then * the certificate must be self-signed and is the root * of the chain. Likewise if the cert is its own * authority. */ if (x509->unsupported_sig) goto unsupported_sig_in_x509; x509->signer = x509; pr_debug("- self-signed\n"); return 0; } /* Look through the X.509 certificates in the PKCS#7 message's * list to see if the next one is there. */ auth = sig->auth_ids[0]; if (auth) { pr_debug("- want %*phN\n", auth->len, auth->data); for (p = pkcs7->certs; p; p = p->next) { pr_debug("- cmp [%u] %*phN\n", p->index, p->id->len, p->id->data); if (asymmetric_key_id_same(p->id, auth)) goto found_issuer_check_skid; } } else if (sig->auth_ids[1]) { auth = sig->auth_ids[1]; pr_debug("- want %*phN\n", auth->len, auth->data); for (p = pkcs7->certs; p; p = p->next) { if (!p->skid) continue; pr_debug("- cmp [%u] %*phN\n", p->index, p->skid->len, p->skid->data); if (asymmetric_key_id_same(p->skid, auth)) goto found_issuer; } } /* We didn't find the root of this chain */ pr_debug("- top\n"); return 0; found_issuer_check_skid: /* We matched issuer + serialNumber, but if there's an * authKeyId.keyId, that must match the CA subjKeyId also. */ if (sig->auth_ids[1] && !asymmetric_key_id_same(p->skid, sig->auth_ids[1])) { pr_warn("Sig %u: X.509 chain contains auth-skid nonmatch (%u->%u)\n", sinfo->index, x509->index, p->index); return -EKEYREJECTED; } found_issuer: pr_debug("- subject %s\n", p->subject); if (p->seen) { pr_warn("Sig %u: X.509 chain contains loop\n", sinfo->index); return 0; } ret = public_key_verify_signature(p->pub, x509->sig); if (ret < 0) return ret; x509->signer = p; if (x509 == p) { pr_debug("- self-signed\n"); return 0; } x509 = p; might_sleep(); } unsupported_sig_in_x509: /* Just prune the certificate chain at this point if we lack some * crypto module to go further. Note, however, we don't want to set * sinfo->unsupported_crypto as the signed info block may still be * validatable against an X.509 cert lower in the chain that we have a * trusted copy of. */ return 0; } /* * Verify one signed information block from a PKCS#7 message. */ static int pkcs7_verify_one(struct pkcs7_message *pkcs7, struct pkcs7_signed_info *sinfo) { int ret; kenter(",%u", sinfo->index); /* First of all, digest the data in the PKCS#7 message and the * signed information block */ ret = pkcs7_digest(pkcs7, sinfo); if (ret < 0) return ret; /* Find the key for the signature if there is one */ ret = pkcs7_find_key(pkcs7, sinfo); if (ret < 0) return ret; if (!sinfo->signer) return 0; pr_devel("Using X.509[%u] for sig %u\n", sinfo->signer->index, sinfo->index); /* Check that the PKCS#7 signing time is valid according to the X.509 * certificate. We can't, however, check against the system clock * since that may not have been set yet and may be wrong. */ if (test_bit(sinfo_has_signing_time, &sinfo->aa_set)) { if (sinfo->signing_time < sinfo->signer->valid_from || sinfo->signing_time > sinfo->signer->valid_to) { pr_warn("Message signed outside of X.509 validity window\n"); return -EKEYREJECTED; } } /* Verify the PKCS#7 binary against the key */ ret = public_key_verify_signature(sinfo->signer->pub, sinfo->sig); if (ret < 0) return ret; pr_devel("Verified signature %u\n", sinfo->index); /* Verify the internal certificate chain */ return pkcs7_verify_sig_chain(pkcs7, sinfo); } /** * pkcs7_verify - Verify a PKCS#7 message * @pkcs7: The PKCS#7 message to be verified * @usage: The use to which the key is being put * * Verify a PKCS#7 message is internally consistent - that is, the data digest * matches the digest in the AuthAttrs and any signature in the message or one * of the X.509 certificates it carries that matches another X.509 cert in the * message can be verified. * * This does not look to match the contents of the PKCS#7 message against any * external public keys. * * Returns, in order of descending priority: * * (*) -EKEYREJECTED if a key was selected that had a usage restriction at * odds with the specified usage, or: * * (*) -EKEYREJECTED if a signature failed to match for which we found an * appropriate X.509 certificate, or: * * (*) -EBADMSG if some part of the message was invalid, or: * * (*) 0 if a signature chain passed verification, or: * * (*) -EKEYREJECTED if a blacklisted key was encountered, or: * * (*) -ENOPKG if none of the signature chains are verifiable because suitable * crypto modules couldn't be found. */ int pkcs7_verify(struct pkcs7_message *pkcs7, enum key_being_used_for usage) { struct pkcs7_signed_info *sinfo; int actual_ret = -ENOPKG; int ret; kenter(""); switch (usage) { case VERIFYING_MODULE_SIGNATURE: if (pkcs7->data_type != OID_data) { pr_warn("Invalid module sig (not pkcs7-data)\n"); return -EKEYREJECTED; } if (pkcs7->have_authattrs) { pr_warn("Invalid module sig (has authattrs)\n"); return -EKEYREJECTED; } break; case VERIFYING_FIRMWARE_SIGNATURE: if (pkcs7->data_type != OID_data) { pr_warn("Invalid firmware sig (not pkcs7-data)\n"); return -EKEYREJECTED; } if (!pkcs7->have_authattrs) { pr_warn("Invalid firmware sig (missing authattrs)\n"); return -EKEYREJECTED; } break; case VERIFYING_KEXEC_PE_SIGNATURE: if (pkcs7->data_type != OID_msIndirectData) { pr_warn("Invalid kexec sig (not Authenticode)\n"); return -EKEYREJECTED; } /* Authattr presence checked in parser */ break; case VERIFYING_UNSPECIFIED_SIGNATURE: case VERIFYING_BPF_SIGNATURE: if (pkcs7->data_type != OID_data) { pr_warn("Invalid unspecified sig (not pkcs7-data)\n"); return -EKEYREJECTED; } break; default: return -EINVAL; } for (sinfo = pkcs7->signed_infos; sinfo; sinfo = sinfo->next) { ret = pkcs7_verify_one(pkcs7, sinfo); if (sinfo->blacklisted) { if (actual_ret == -ENOPKG) actual_ret = -EKEYREJECTED; continue; } if (ret < 0) { if (ret == -ENOPKG) { sinfo->unsupported_crypto = true; continue; } kleave(" = %d", ret); return ret; } actual_ret = 0; } kleave(" = %d", actual_ret); return actual_ret; } EXPORT_SYMBOL_GPL(pkcs7_verify); /** * pkcs7_supply_detached_data - Supply the data needed to verify a PKCS#7 message * @pkcs7: The PKCS#7 message * @data: The data to be verified * @datalen: The amount of data * * Supply the detached data needed to verify a PKCS#7 message. Note that no * attempt to retain/pin the data is made. That is left to the caller. The * data will not be modified by pkcs7_verify() and will not be freed when the * PKCS#7 message is freed. * * Returns -EINVAL if data is already supplied in the message, 0 otherwise. */ int pkcs7_supply_detached_data(struct pkcs7_message *pkcs7, const void *data, size_t datalen) { if (pkcs7->data) { pr_warn("Data already supplied\n"); return -EINVAL; } pkcs7->data = data; pkcs7->data_len = datalen; return 0; } EXPORT_SYMBOL_GPL(pkcs7_supply_detached_data);
346 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * x86-optimized SHA-512 block function * * Copyright 2025 Google LLC */ #include <asm/fpu/api.h> #include <linux/static_call.h> DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic); #define DEFINE_X86_SHA512_FN(c_fn, asm_fn) \ asmlinkage void asm_fn(struct sha512_block_state *state, \ const u8 *data, size_t nblocks); \ static void c_fn(struct sha512_block_state *state, const u8 *data, \ size_t nblocks) \ { \ if (likely(irq_fpu_usable())) { \ kernel_fpu_begin(); \ asm_fn(state, data, nblocks); \ kernel_fpu_end(); \ } else { \ sha512_blocks_generic(state, data, nblocks); \ } \ } DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3); DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx); DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx); static void sha512_blocks(struct sha512_block_state *state, const u8 *data, size_t nblocks) { static_call(sha512_blocks_x86)(state, data, nblocks); } #define sha512_mod_init_arch sha512_mod_init_arch static void sha512_mod_init_arch(void) { if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) static_call_update(sha512_blocks_x86, sha512_blocks_avx2); else static_call_update(sha512_blocks_x86, sha512_blocks_avx); } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { static_call_update(sha512_blocks_x86, sha512_blocks_ssse3); } }
28 1 1 1 30 30 8 29 12 12 12 12 7 5 5 5 58 9 21 9 7 1 9 34 3 32 32 4 28 28 20 28 28 28 19 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 // SPDX-License-Identifier: GPL-2.0 /* * Support for async notification of waitid */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "cancel.h" #include "waitid.h" #include "../kernel/exit.h" static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw); #define IO_WAITID_CANCEL_FLAG BIT(31) #define IO_WAITID_REF_MASK GENMASK(30, 0) struct io_waitid { struct file *file; int which; pid_t upid; int options; atomic_t refs; struct wait_queue_head *head; struct siginfo __user *infop; struct waitid_info info; }; static void io_waitid_free(struct io_kiocb *req) { struct io_waitid_async *iwa = req->async_data; put_pid(iwa->wo.wo_pid); io_req_async_data_free(req); } static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) { struct compat_siginfo __user *infop; bool ret; infop = (struct compat_siginfo __user *) iw->infop; if (!user_write_access_begin(infop, sizeof(*infop))) return false; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(iw->info.cause, &infop->si_code, Efault); unsafe_put_user(iw->info.pid, &infop->si_pid, Efault); unsafe_put_user(iw->info.uid, &infop->si_uid, Efault); unsafe_put_user(iw->info.status, &infop->si_status, Efault); ret = true; done: user_write_access_end(); return ret; Efault: ret = false; goto done; } static bool io_waitid_copy_si(struct io_kiocb *req, int signo) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); bool ret; if (!iw->infop) return true; if (io_is_compat(req->ctx)) return io_waitid_compat_copy_si(iw, signo); if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) return false; unsafe_put_user(signo, &iw->infop->si_signo, Efault); unsafe_put_user(0, &iw->infop->si_errno, Efault); unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault); unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault); unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault); unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault); ret = true; done: user_write_access_end(); return ret; Efault: ret = false; goto done; } static int io_waitid_finish(struct io_kiocb *req, int ret) { int signo = 0; if (ret > 0) { signo = SIGCHLD; ret = 0; } if (!io_waitid_copy_si(req, signo)) ret = -EFAULT; io_waitid_free(req); return ret; } static void io_waitid_remove_wq(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct wait_queue_head *head; head = READ_ONCE(iw->head); if (head) { struct io_waitid_async *iwa = req->async_data; iw->head = NULL; spin_lock_irq(&head->lock); list_del_init(&iwa->wo.child_wait.entry); spin_unlock_irq(&head->lock); } } static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); lockdep_assert_held(&req->ctx->uring_lock); hlist_del_init(&req->hash_node); io_waitid_remove_wq(req); ret = io_waitid_finish(req, ret); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); } static bool __io_waitid_cancel(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); lockdep_assert_held(&req->ctx->uring_lock); /* * Mark us canceled regardless of ownership. This will prevent a * potential retry from a spurious wakeup. */ atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs); /* claim ownership */ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) return false; io_waitid_complete(req, -ECANCELED); io_req_queue_tw_complete(req, -ECANCELED); return true; } int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { return io_cancel_remove(ctx, cd, issue_flags, &ctx->waitid_list, __io_waitid_cancel); } bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { return io_cancel_remove_all(ctx, tctx, &ctx->waitid_list, cancel_all, __io_waitid_cancel); } static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); if (!atomic_sub_return(1, &iw->refs)) return false; io_waitid_remove_wq(req); /* * Wakeup triggered, racing with us. It was prevented from * completing because of that, queue up the tw to do that. */ req->io_task_work.func = io_waitid_cb; io_req_task_work_add(req); return true; } static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw) { struct io_kiocb *req = tw_req.req; struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; io_tw_lock(ctx, tw); ret = __do_wait(&iwa->wo); /* * If we get -ERESTARTSYS here, we need to re-arm and check again * to ensure we get another callback. If the retry works, then we can * just remove ourselves from the waitqueue again and finish the * request. */ if (unlikely(ret == -ERESTARTSYS)) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); /* Don't retry if cancel found it meanwhile */ ret = -ECANCELED; if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) { iw->head = &current->signal->wait_chldexit; add_wait_queue(iw->head, &iwa->wo.child_wait); ret = __do_wait(&iwa->wo); if (ret == -ERESTARTSYS) { /* retry armed, drop our ref */ io_waitid_drop_issue_ref(req); return; } /* fall through to complete, will kill waitqueue */ } } io_waitid_complete(req, ret); io_req_task_complete(tw_req, tw); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo); struct io_kiocb *req = iwa->req; struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct task_struct *p = key; if (!pid_child_should_wake(wo, p)) return 0; list_del_init(&wait->entry); iw->head = NULL; /* cancel is in progress */ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) return 1; req->io_task_work.func = io_waitid_cb; io_req_task_work_add(req); return 1; } int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_waitid_async *iwa; if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags) return -EINVAL; iwa = io_uring_alloc_async_data(NULL, req); if (unlikely(!iwa)) return -ENOMEM; iwa->req = req; iw->which = READ_ONCE(sqe->len); iw->upid = READ_ONCE(sqe->fd); iw->options = READ_ONCE(sqe->file_index); iw->head = NULL; iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2)); return 0; } int io_waitid(struct io_kiocb *req, unsigned int issue_flags) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, iw->options, NULL); if (ret) goto done; /* * Mark the request as busy upfront, in case we're racing with the * wakeup. If we are, then we'll notice when we drop this initial * reference again after arming. */ atomic_set(&iw->refs, 1); /* * Cancel must hold the ctx lock, so there's no risk of cancelation * finding us until a) we remain on the list, and b) the lock is * dropped. We only need to worry about racing with the wakeup * callback. */ io_ring_submit_lock(ctx, issue_flags); /* * iw->head is valid under the ring lock, and as long as the request * is on the waitid_list where cancelations may find it. */ iw->head = &current->signal->wait_chldexit; hlist_add_head(&req->hash_node, &ctx->waitid_list); init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); iwa->wo.child_wait.private = req->tctx->task; add_wait_queue(iw->head, &iwa->wo.child_wait); ret = __do_wait(&iwa->wo); if (ret == -ERESTARTSYS) { /* * Nobody else grabbed a reference, it'll complete when we get * a waitqueue callback, or if someone cancels it. */ if (!io_waitid_drop_issue_ref(req)) { io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } /* * Wakeup triggered, racing with us. It was prevented from * completing because of that, queue up the tw to do that. */ io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } hlist_del_init(&req->hash_node); io_waitid_remove_wq(req); ret = io_waitid_finish(req, ret); io_ring_submit_unlock(ctx, issue_flags); done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_COMPLETE; }
20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2021 Intel Corporation */ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include "hci_codec.h" static int hci_codec_list_add(struct list_head *list, struct hci_op_read_local_codec_caps *sent, struct hci_rp_read_local_codec_caps *rp, void *caps, __u32 len) { struct codec_list *entry; entry = kzalloc(sizeof(*entry) + len, GFP_KERNEL); if (!entry) return -ENOMEM; entry->id = sent->id; if (sent->id == 0xFF) { entry->cid = __le16_to_cpu(sent->cid); entry->vid = __le16_to_cpu(sent->vid); } entry->transport = sent->transport; entry->len = len; entry->num_caps = 0; if (rp) { entry->num_caps = rp->num_caps; memcpy(entry->caps, caps, len); } list_add(&entry->list, list); return 0; } void hci_codec_list_clear(struct list_head *codec_list) { struct codec_list *c, *n; list_for_each_entry_safe(c, n, codec_list, list) { list_del(&c->list); kfree(c); } } static void hci_read_codec_capabilities(struct hci_dev *hdev, __u8 transport, struct hci_op_read_local_codec_caps *cmd) { __u8 i; for (i = 0; i < TRANSPORT_TYPE_MAX; i++) { if (transport & BIT(i)) { struct hci_rp_read_local_codec_caps *rp; struct hci_codec_caps *caps; struct sk_buff *skb; __u8 j; __u32 len; cmd->transport = i; /* If Read_Codec_Capabilities command is not supported * then just add codec to the list without caps */ if (!(hdev->commands[45] & 0x08)) { hci_dev_lock(hdev); hci_codec_list_add(&hdev->local_codecs, cmd, NULL, NULL, 0); hci_dev_unlock(hdev); continue; } skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODEC_CAPS, sizeof(*cmd), cmd, 0, HCI_CMD_TIMEOUT, NULL); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read codec capabilities (%ld)", PTR_ERR(skb)); continue; } if (skb->len < sizeof(*rp)) goto error; rp = (void *)skb->data; if (rp->status) goto error; if (!rp->num_caps) { len = 0; /* this codec doesn't have capabilities */ goto skip_caps_parse; } skb_pull(skb, sizeof(*rp)); for (j = 0, len = 0; j < rp->num_caps; j++) { caps = (void *)skb->data; if (skb->len < sizeof(*caps)) goto error; if (skb->len < caps->len) goto error; len += sizeof(caps->len) + caps->len; skb_pull(skb, sizeof(caps->len) + caps->len); } skip_caps_parse: hci_dev_lock(hdev); hci_codec_list_add(&hdev->local_codecs, cmd, rp, (__u8 *)rp + sizeof(*rp), len); hci_dev_unlock(hdev); error: kfree_skb(skb); } } } void hci_read_supported_codecs(struct hci_dev *hdev) { struct sk_buff *skb; struct hci_rp_read_local_supported_codecs *rp; struct hci_std_codecs *std_codecs; struct hci_vnd_codecs *vnd_codecs; struct hci_op_read_local_codec_caps caps; __u8 i; skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS, 0, NULL, 0, HCI_CMD_TIMEOUT, NULL); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read local supported codecs (%ld)", PTR_ERR(skb)); return; } if (skb->len < sizeof(*rp)) goto error; rp = (void *)skb->data; if (rp->status) goto error; skb_pull(skb, sizeof(rp->status)); std_codecs = (void *)skb->data; /* validate codecs length before accessing */ if (skb->len < flex_array_size(std_codecs, codec, std_codecs->num) + sizeof(std_codecs->num)) goto error; /* enumerate codec capabilities of standard codecs */ memset(&caps, 0, sizeof(caps)); for (i = 0; i < std_codecs->num; i++) { caps.id = std_codecs->codec[i]; caps.direction = 0x00; hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps); } skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num) + sizeof(std_codecs->num)); vnd_codecs = (void *)skb->data; /* validate vendor codecs length before accessing */ if (skb->len < flex_array_size(vnd_codecs, codec, vnd_codecs->num) + sizeof(vnd_codecs->num)) goto error; /* enumerate vendor codec capabilities */ for (i = 0; i < vnd_codecs->num; i++) { caps.id = 0xFF; caps.cid = vnd_codecs->codec[i].cid; caps.vid = vnd_codecs->codec[i].vid; caps.direction = 0x00; hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps); } error: kfree_skb(skb); } void hci_read_supported_codecs_v2(struct hci_dev *hdev) { struct sk_buff *skb; struct hci_rp_read_local_supported_codecs_v2 *rp; struct hci_std_codecs_v2 *std_codecs; struct hci_vnd_codecs_v2 *vnd_codecs; struct hci_op_read_local_codec_caps caps; __u8 i; skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS_V2, 0, NULL, 0, HCI_CMD_TIMEOUT, NULL); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read local supported codecs (%ld)", PTR_ERR(skb)); return; } if (skb->len < sizeof(*rp)) goto error; rp = (void *)skb->data; if (rp->status) goto error; skb_pull(skb, sizeof(rp->status)); std_codecs = (void *)skb->data; /* check for payload data length before accessing */ if (skb->len < flex_array_size(std_codecs, codec, std_codecs->num) + sizeof(std_codecs->num)) goto error; memset(&caps, 0, sizeof(caps)); for (i = 0; i < std_codecs->num; i++) { caps.id = std_codecs->codec[i].id; hci_read_codec_capabilities(hdev, std_codecs->codec[i].transport, &caps); } skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num) + sizeof(std_codecs->num)); vnd_codecs = (void *)skb->data; /* check for payload data length before accessing */ if (skb->len < flex_array_size(vnd_codecs, codec, vnd_codecs->num) + sizeof(vnd_codecs->num)) goto error; for (i = 0; i < vnd_codecs->num; i++) { caps.id = 0xFF; caps.cid = vnd_codecs->codec[i].cid; caps.vid = vnd_codecs->codec[i].vid; hci_read_codec_capabilities(hdev, vnd_codecs->codec[i].transport, &caps); } error: kfree_skb(skb); }
399 463 228 18 2165 27 25115 87 88 7 23413 868 407 4230 2285 373 6 1 25437 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Berkeley style UIO structures - Alan Cox 1994. */ #ifndef __LINUX_UIO_H #define __LINUX_UIO_H #include <linux/kernel.h> #include <linux/mm_types.h> #include <linux/ucopysize.h> #include <uapi/linux/uio.h> struct page; struct folio_queue; typedef unsigned int __bitwise iov_iter_extraction_t; struct kvec { void *iov_base; /* and that should *never* hold a userland pointer */ size_t iov_len; }; enum iter_type { /* iter types */ ITER_UBUF, ITER_IOVEC, ITER_BVEC, ITER_KVEC, ITER_FOLIOQ, ITER_XARRAY, ITER_DISCARD, }; #define ITER_SOURCE 1 // == WRITE #define ITER_DEST 0 // == READ struct iov_iter_state { size_t iov_offset; size_t count; unsigned long nr_segs; }; struct iov_iter { u8 iter_type; bool nofault; bool data_source; size_t iov_offset; /* * Hack alert: overlay ubuf_iovec with iovec + count, so * that the members resolve correctly regardless of the type * of iterator used. This means that you can use: * * &iter->__ubuf_iovec or iter->__iov * * interchangably for the user_backed cases, hence simplifying * some of the cases that need to deal with both. */ union { /* * This really should be a const, but we cannot do that without * also modifying any of the zero-filling iter init functions. * Leave it non-const for now, but it should be treated as such. */ struct iovec __ubuf_iovec; struct { union { /* use iter_iov() to get the current vec */ const struct iovec *__iov; const struct kvec *kvec; const struct bio_vec *bvec; const struct folio_queue *folioq; struct xarray *xarray; void __user *ubuf; }; size_t count; }; }; union { unsigned long nr_segs; u8 folioq_slot; loff_t xarray_start; }; }; typedef __u16 uio_meta_flags_t; struct uio_meta { uio_meta_flags_t flags; u16 app_tag; u64 seed; struct iov_iter iter; }; static inline const struct iovec *iter_iov(const struct iov_iter *iter) { if (iter->iter_type == ITER_UBUF) return (const struct iovec *) &iter->__ubuf_iovec; return iter->__iov; } #define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset) static inline size_t iter_iov_len(const struct iov_iter *i) { if (i->iter_type == ITER_UBUF) return i->count; return iter_iov(i)->iov_len - i->iov_offset; } static inline enum iter_type iov_iter_type(const struct iov_iter *i) { return i->iter_type; } static inline void iov_iter_save_state(struct iov_iter *iter, struct iov_iter_state *state) { state->iov_offset = iter->iov_offset; state->count = iter->count; state->nr_segs = iter->nr_segs; } static inline bool iter_is_ubuf(const struct iov_iter *i) { return iov_iter_type(i) == ITER_UBUF; } static inline bool iter_is_iovec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_IOVEC; } static inline bool iov_iter_is_kvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_KVEC; } static inline bool iov_iter_is_bvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_BVEC; } static inline bool iov_iter_is_discard(const struct iov_iter *i) { return iov_iter_type(i) == ITER_DISCARD; } static inline bool iov_iter_is_folioq(const struct iov_iter *i) { return iov_iter_type(i) == ITER_FOLIOQ; } static inline bool iov_iter_is_xarray(const struct iov_iter *i) { return iov_iter_type(i) == ITER_XARRAY; } static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->data_source ? WRITE : READ; } static inline bool user_backed_iter(const struct iov_iter *i) { return iter_is_ubuf(i) || iter_is_iovec(i); } /* * Total number of bytes covered by an iovec. * * NOTE that it is not safe to use this function until all the iovec's * segment lengths have been validated. Because the individual lengths can * overflow a size_t when added together. */ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) { unsigned long seg; size_t ret = 0; for (seg = 0; seg < nr_segs; seg++) ret += iov[seg].iov_len; return ret; } void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_to_iter(&folio->page, offset, bytes, i); } static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_from_iter(&folio->page, offset, bytes, i); } size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, true)) return _copy_to_iter(addr, bytes, i); return 0; } static __always_inline __must_check size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_to_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter_nocache(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter_nocache(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /* * Note, users like pmem that depend on the stricter semantics of * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the * destination is flushed from the cache on return. */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_from_iter_flushcache _copy_from_iter_nocache #endif #ifdef CONFIG_ARCH_HAS_COPY_MC size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_mc_to_iter _copy_to_iter #endif size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, const struct folio_queue *folioq, unsigned int first_slot, unsigned int offset, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags); static inline size_t iov_iter_count(const struct iov_iter *i) { return i->count; } /* * Cap the iov_iter by given limit; note that the second argument is * *not* the new size - it's upper limit for such. Passing it a value * greater than the amount of data in iov_iter is fine - it'll just do * nothing in that case. */ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) { /* * count doesn't have to fit in size_t - comparison extends both * operands to u64 here and any value that would be truncated by * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ if (i->count > count) i->count = count; } /* * reexpand a previously truncated iterator; count must be no more than how much * we had shrunk it. */ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { i->count = count; } static inline int iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes) { size_t shorted = 0; int npages; if (iov_iter_count(i) > max_bytes) { shorted = iov_iter_count(i) - max_bytes; iov_iter_truncate(i, max_bytes); } npages = iov_iter_npages(i, maxpages); if (shorted) iov_iter_reexpand(i, iov_iter_count(i) + shorted); return npages; } struct iovec *iovec_from_user(const struct iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat); ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i); ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, void __user *buf, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_UBUF, .data_source = direction, .ubuf = buf, .count = count, .nr_segs = 1 }; } /* Flags for iov_iter_get/extract_pages*() */ /* Allow P2PDMA on the extracted pages */ #define ITER_ALLOW_P2PDMA ((__force iov_iter_extraction_t)0x01) ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0); /** * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained * @iter: The iterator * * Examine the iterator and indicate by returning true or false as to how, if * at all, pages extracted from the iterator will be retained by the extraction * function. * * %true indicates that the pages will have a pin placed in them that the * caller must unpin. This is must be done for DMA/async DIO to force fork() * to forcibly copy a page for the child (the parent must retain the original * page). * * %false indicates that no measures are taken and that it's up to the caller * to retain the pages. */ static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter) { return user_backed_iter(iter); } struct sg_table; ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags); #endif
19 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 // SPDX-License-Identifier: GPL-2.0-or-later /* * mount.c - operations for initializing and mounting configfs. * * Based on sysfs: * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel * * configfs Copyright (C) 2005 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/configfs.h> #include "configfs_internal.h" /* Random magic number */ #define CONFIGFS_MAGIC 0x62656570 static struct vfsmount *configfs_mount = NULL; struct kmem_cache *configfs_dir_cachep; static int configfs_mnt_count = 0; static void configfs_free_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); free_inode_nonrcu(inode); } static const struct super_operations configfs_ops = { .statfs = simple_statfs, .drop_inode = inode_just_drop, .free_inode = configfs_free_inode, }; static struct config_group configfs_root_group = { .cg_item = { .ci_namebuf = "root", .ci_name = configfs_root_group.cg_item.ci_namebuf, }, }; int configfs_is_root(struct config_item *item) { return item == &configfs_root_group.cg_item; } static struct configfs_dirent configfs_root = { .s_sibling = LIST_HEAD_INIT(configfs_root.s_sibling), .s_children = LIST_HEAD_INIT(configfs_root.s_children), .s_element = &configfs_root_group.cg_item, .s_type = CONFIGFS_ROOT, .s_iattr = NULL, }; static int configfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct dentry *root; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = CONFIGFS_MAGIC; sb->s_op = &configfs_ops; sb->s_time_gran = 1; inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, &configfs_root, sb); if (inode) { inode->i_op = &configfs_root_inode_operations; inode->i_fop = &configfs_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); } else { pr_debug("could not get root inode\n"); return -ENOMEM; } root = d_make_root(inode); if (!root) { pr_debug("%s: could not get root dentry!\n",__func__); return -ENOMEM; } config_group_init(&configfs_root_group); configfs_root_group.cg_item.ci_dentry = root; root->d_fsdata = &configfs_root; sb->s_root = root; set_default_d_op(sb, &configfs_dentry_ops); /* the rest get that */ sb->s_d_flags |= DCACHE_DONTCACHE; return 0; } static int configfs_get_tree(struct fs_context *fc) { return get_tree_single(fc, configfs_fill_super); } static const struct fs_context_operations configfs_context_ops = { .get_tree = configfs_get_tree, }; static int configfs_init_fs_context(struct fs_context *fc) { fc->ops = &configfs_context_ops; return 0; } static struct file_system_type configfs_fs_type = { .owner = THIS_MODULE, .name = "configfs", .init_fs_context = configfs_init_fs_context, .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("configfs"); struct dentry *configfs_pin_fs(void) { int err = simple_pin_fs(&configfs_fs_type, &configfs_mount, &configfs_mnt_count); return err ? ERR_PTR(err) : configfs_mount->mnt_root; } void configfs_release_fs(void) { simple_release_fs(&configfs_mount, &configfs_mnt_count); } static int __init configfs_init(void) { int err = -ENOMEM; configfs_dir_cachep = kmem_cache_create("configfs_dir_cache", sizeof(struct configfs_dirent), 0, 0, NULL); if (!configfs_dir_cachep) goto out; err = sysfs_create_mount_point(kernel_kobj, "config"); if (err) goto out2; err = register_filesystem(&configfs_fs_type); if (err) goto out3; return 0; out3: pr_err("Unable to register filesystem!\n"); sysfs_remove_mount_point(kernel_kobj, "config"); out2: kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; out: return err; } static void __exit configfs_exit(void) { unregister_filesystem(&configfs_fs_type); sysfs_remove_mount_point(kernel_kobj, "config"); kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; } MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); MODULE_VERSION("0.0.2"); MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); core_initcall(configfs_init); module_exit(configfs_exit);
234 272 291 291 272 60 234 212 64 26 26 7027 7026 7027 551 553 552 469 113 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 // SPDX-License-Identifier: GPL-2.0 /* * event tracer * * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> * * - Added format output of fields of the trace point. * This was based off of work by Tom Zanussi <tzanussi@gmail.com>. * */ #define pr_fmt(fmt) fmt #include <linux/workqueue.h> #include <linux/security.h> #include <linux/spinlock.h> #include <linux/kthread.h> #include <linux/tracefs.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/sort.h> #include <linux/slab.h> #include <linux/delay.h> #include <trace/events/sched.h> #include <trace/syscall.h> #include <asm/setup.h> #include "trace_output.h" #undef TRACE_SYSTEM #define TRACE_SYSTEM "TRACE_SYSTEM" DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); static bool eventdir_initialized; static LIST_HEAD(module_strings); struct module_string { struct list_head next; struct module *module; char *str; }; #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) static struct kmem_cache *field_cachep; static struct kmem_cache *file_cachep; static inline int system_refcount(struct event_subsystem *system) { return system->ref_count; } static int system_refcount_inc(struct event_subsystem *system) { return system->ref_count++; } static int system_refcount_dec(struct event_subsystem *system) { return --system->ref_count; } /* Double loops, do not use break, only goto's work */ #define do_for_each_event_file(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ list_for_each_entry(file, &tr->events, list) #define do_for_each_event_file_safe(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ struct trace_event_file *___n; \ list_for_each_entry_safe(file, ___n, &tr->events, list) #define while_for_each_event_file() \ } static struct ftrace_event_field * __find_event_field(struct list_head *head, const char *name) { struct ftrace_event_field *field; list_for_each_entry(field, head, link) { if (!strcmp(field->name, name)) return field; } return NULL; } struct ftrace_event_field * trace_find_event_field(struct trace_event_call *call, char *name) { struct ftrace_event_field *field; struct list_head *head; head = trace_get_fields(call); field = __find_event_field(head, name); if (field) return field; field = __find_event_field(&ftrace_generic_fields, name); if (field) return field; return __find_event_field(&ftrace_common_fields, name); } static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, int is_signed, int filter_type, int len, int need_test) { struct ftrace_event_field *field; field = kmem_cache_alloc(field_cachep, GFP_TRACE); if (!field) return -ENOMEM; field->name = name; field->type = type; if (filter_type == FILTER_OTHER) field->filter_type = filter_assign_type(type); else field->filter_type = filter_type; field->offset = offset; field->size = size; field->is_signed = is_signed; field->needs_test = need_test; field->len = len; list_add(&field->link, head); return 0; } int trace_define_field(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) { struct list_head *head; if (WARN_ON(!call->class)) return 0; head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, is_signed, filter_type, 0, 0); } EXPORT_SYMBOL_GPL(trace_define_field); static int trace_define_field_ext(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type, int len, int need_test) { struct list_head *head; if (WARN_ON(!call->class)) return 0; head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, is_signed, filter_type, len, need_test); } #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ #item, 0, 0, is_signed_type(type), \ filter_type, 0, 0); \ if (ret) \ return ret; #define __common_field(type, item) \ ret = __trace_define_field(&ftrace_common_fields, #type, \ "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ is_signed_type(type), FILTER_OTHER, \ 0, 0); \ if (ret) \ return ret; static int trace_define_generic_fields(void) { int ret; __generic_field(int, CPU, FILTER_CPU); __generic_field(int, cpu, FILTER_CPU); __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); __generic_field(char *, stacktrace, FILTER_STACKTRACE); __generic_field(char *, STACKTRACE, FILTER_STACKTRACE); return ret; } static int trace_define_common_fields(void) { int ret; struct trace_entry ent; __common_field(unsigned short, type); __common_field(unsigned char, flags); /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); return ret; } static void trace_destroy_fields(struct trace_event_call *call) { struct ftrace_event_field *field, *next; struct list_head *head; head = trace_get_fields(call); list_for_each_entry_safe(field, next, head, link) { list_del(&field->link); kmem_cache_free(field_cachep, field); } } /* * run-time version of trace_event_get_offsets_<call>() that returns the last * accessible offset of trace fields excluding __dynamic_array bytes */ int trace_event_get_offsets(struct trace_event_call *call) { struct ftrace_event_field *tail; struct list_head *head; head = trace_get_fields(call); /* * head->next points to the last field with the largest offset, * since it was added last by trace_define_field() */ tail = list_first_entry(head, struct ftrace_event_field, link); return tail->offset + tail->size; } static struct trace_event_fields *find_event_field(const char *fmt, struct trace_event_call *call) { struct trace_event_fields *field = call->class->fields_array; const char *p = fmt; int len; if (!(len = str_has_prefix(fmt, "REC->"))) return NULL; fmt += len; for (p = fmt; *p; p++) { if (!isalnum(*p) && *p != '_') break; } len = p - fmt; for (; field->type; field++) { if (strncmp(field->name, fmt, len) || field->name[len]) continue; return field; } return NULL; } /* * Check if the referenced field is an array and return true, * as arrays are OK to dereference. */ static bool test_field(const char *fmt, struct trace_event_call *call) { struct trace_event_fields *field; field = find_event_field(fmt, call); if (!field) return false; /* This is an array and is OK to dereference. */ return strchr(field->type, '[') != NULL; } /* Look for a string within an argument */ static bool find_print_string(const char *arg, const char *str, const char *end) { const char *r; r = strstr(arg, str); return r && r < end; } /* Return true if the argument pointer is safe */ static bool process_pointer(const char *fmt, int len, struct trace_event_call *call) { const char *r, *e, *a; e = fmt + len; /* Find the REC-> in the argument */ r = strstr(fmt, "REC->"); if (r && r < e) { /* * Addresses of events on the buffer, or an array on the buffer is * OK to dereference. There's ways to fool this, but * this is to catch common mistakes, not malicious code. */ a = strchr(fmt, '&'); if ((a && (a < r)) || test_field(r, call)) return true; } else if (find_print_string(fmt, "__get_dynamic_array(", e)) { return true; } else if (find_print_string(fmt, "__get_rel_dynamic_array(", e)) { return true; } else if (find_print_string(fmt, "__get_dynamic_array_len(", e)) { return true; } else if (find_print_string(fmt, "__get_rel_dynamic_array_len(", e)) { return true; } else if (find_print_string(fmt, "__get_sockaddr(", e)) { return true; } else if (find_print_string(fmt, "__get_rel_sockaddr(", e)) { return true; } return false; } /* Return true if the string is safe */ static bool process_string(const char *fmt, int len, struct trace_event_call *call) { struct trace_event_fields *field; const char *r, *e, *s; e = fmt + len; /* * There are several helper functions that return strings. * If the argument contains a function, then assume its field is valid. * It is considered that the argument has a function if it has: * alphanumeric or '_' before a parenthesis. */ s = fmt; do { r = strstr(s, "("); if (!r || r >= e) break; for (int i = 1; r - i >= s; i++) { char ch = *(r - i); if (isspace(ch)) continue; if (isalnum(ch) || ch == '_') return true; /* Anything else, this isn't a function */ break; } /* A function could be wrapped in parenthesis, try the next one */ s = r + 1; } while (s < e); /* * Check for arrays. If the argument has: foo[REC->val] * then it is very likely that foo is an array of strings * that are safe to use. */ r = strstr(s, "["); if (r && r < e) { r = strstr(r, "REC->"); if (r && r < e) return true; } /* * If there's any strings in the argument consider this arg OK as it * could be: REC->field ? "foo" : "bar" and we don't want to get into * verifying that logic here. */ if (find_print_string(fmt, "\"", e)) return true; /* Dereferenced strings are also valid like any other pointer */ if (process_pointer(fmt, len, call)) return true; /* Make sure the field is found */ field = find_event_field(fmt, call); if (!field) return false; /* Test this field's string before printing the event */ call->flags |= TRACE_EVENT_FL_TEST_STR; field->needs_test = 1; return true; } static void handle_dereference_arg(const char *arg_str, u64 string_flags, int len, u64 *dereference_flags, int arg, struct trace_event_call *call) { if (string_flags & (1ULL << arg)) { if (process_string(arg_str, len, call)) *dereference_flags &= ~(1ULL << arg); } else if (process_pointer(arg_str, len, call)) *dereference_flags &= ~(1ULL << arg); else pr_warn("TRACE EVENT ERROR: Bad dereference argument: '%.*s'\n", len, arg_str); } /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and * much later referenced after the pointer was freed. Dereferencing * pointers are OK, if it is dereferenced into the event itself. */ static void test_event_printk(struct trace_event_call *call) { u64 dereference_flags = 0; u64 string_flags = 0; bool first = true; const char *fmt; int parens = 0; char in_quote = 0; int start_arg = 0; int arg = 0; int i, e; fmt = call->print_fmt; if (!fmt) return; for (i = 0; fmt[i]; i++) { switch (fmt[i]) { case '\\': i++; if (!fmt[i]) return; continue; case '"': case '\'': /* * The print fmt starts with a string that * is processed first to find %p* usage, * then after the first string, the print fmt * contains arguments that are used to check * if the dereferenced %p* usage is safe. */ if (first) { if (fmt[i] == '\'') continue; if (in_quote) { arg = 0; first = false; /* * If there was no %p* uses * the fmt is OK. */ if (!dereference_flags) return; } } if (in_quote) { if (in_quote == fmt[i]) in_quote = 0; } else { in_quote = fmt[i]; } continue; case '%': if (!first || !in_quote) continue; i++; if (!fmt[i]) return; switch (fmt[i]) { case '%': continue; case 'p': do_pointer: /* Find dereferencing fields */ switch (fmt[i + 1]) { case 'B': case 'R': case 'r': case 'b': case 'M': case 'm': case 'I': case 'i': case 'E': case 'U': case 'V': case 'N': case 'a': case 'd': case 'D': case 'g': case 't': case 'C': case 'O': case 'f': if (WARN_ONCE(arg == 63, "Too many args for event: %s", trace_event_name(call))) return; dereference_flags |= 1ULL << arg; } break; default: { bool star = false; int j; /* Increment arg if %*s exists. */ for (j = 0; fmt[i + j]; j++) { if (isdigit(fmt[i + j]) || fmt[i + j] == '.') continue; if (fmt[i + j] == '*') { star = true; /* Handle %*pbl case */ if (!j && fmt[i + 1] == 'p') { arg++; i++; goto do_pointer; } continue; } if ((fmt[i + j] == 's')) { if (star) arg++; if (WARN_ONCE(arg == 63, "Too many args for event: %s", trace_event_name(call))) return; dereference_flags |= 1ULL << arg; string_flags |= 1ULL << arg; } break; } break; } /* default */ } /* switch */ arg++; continue; case '(': if (in_quote) continue; parens++; continue; case ')': if (in_quote) continue; parens--; if (WARN_ONCE(parens < 0, "Paren mismatch for event: %s\narg='%s'\n%*s", trace_event_name(call), fmt + start_arg, (i - start_arg) + 5, "^")) return; continue; case ',': if (in_quote || parens) continue; e = i; i++; while (isspace(fmt[i])) i++; /* * If start_arg is zero, then this is the start of the * first argument. The processing of the argument happens * when the end of the argument is found, as it needs to * handle parenthesis and such. */ if (!start_arg) { start_arg = i; /* Balance out the i++ in the for loop */ i--; continue; } if (dereference_flags & (1ULL << arg)) { handle_dereference_arg(fmt + start_arg, string_flags, e - start_arg, &dereference_flags, arg, call); } start_arg = i; arg++; /* Balance out the i++ in the for loop */ i--; } } if (dereference_flags & (1ULL << arg)) { handle_dereference_arg(fmt + start_arg, string_flags, i - start_arg, &dereference_flags, arg, call); } /* * If you triggered the below warning, the trace event reported * uses an unsafe dereference pointer %p*. As the data stored * at the trace event time may no longer exist when the trace * event is printed, dereferencing to the original source is * unsafe. The source of the dereference must be copied into the * event itself, and the dereference must access the copy instead. */ if (WARN_ON_ONCE(dereference_flags)) { arg = 1; while (!(dereference_flags & 1)) { dereference_flags >>= 1; arg++; } pr_warn("event %s has unsafe dereference of argument %d\n", trace_event_name(call), arg); pr_warn("print_fmt: %s\n", fmt); } } int trace_event_raw_init(struct trace_event_call *call) { int id; id = register_trace_event(&call->event); if (!id) return -ENODEV; test_event_printk(call); return 0; } EXPORT_SYMBOL_GPL(trace_event_raw_init); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) { struct trace_array *tr = trace_file->tr; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; pid_list = rcu_dereference_raw(tr->filtered_pids); no_pid_list = rcu_dereference_raw(tr->filtered_no_pids); if (!pid_list && !no_pid_list) return false; /* * This is recorded at every sched_switch for this task. * Thus, even if the task migrates the ignore value will be the same. */ return this_cpu_read(tr->array_buffer.data->ignore_pid) != 0; } EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, struct trace_event_file *trace_file, unsigned long len) { struct trace_event_call *event_call = trace_file->event_call; if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) && trace_event_ignore_this_pid(trace_file)) return NULL; /* * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables * preemption (adding one to the preempt_count). Since we are * interested in the preempt_count at the time the tracepoint was * hit, we need to subtract one to offset the increment. */ fbuffer->trace_ctx = tracing_gen_ctx_dec(); fbuffer->trace_file = trace_file; fbuffer->event = trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file, event_call->event.type, len, fbuffer->trace_ctx); if (!fbuffer->event) return NULL; fbuffer->regs = NULL; fbuffer->entry = ring_buffer_event_data(fbuffer->event); return fbuffer->entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_reserve); int trace_event_reg(struct trace_event_call *call, enum trace_reg type, void *data) { struct trace_event_file *file = data; WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT)); switch (type) { case TRACE_REG_REGISTER: return tracepoint_probe_register(call->tp, call->class->probe, file); case TRACE_REG_UNREGISTER: tracepoint_probe_unregister(call->tp, call->class->probe, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: if (!call->class->perf_probe) return -ENODEV; return tracepoint_probe_register(call->tp, call->class->perf_probe, call); case TRACE_REG_PERF_UNREGISTER: tracepoint_probe_unregister(call->tp, call->class->perf_probe, call); return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: case TRACE_REG_PERF_DEL: return 0; #endif } return 0; } EXPORT_SYMBOL_GPL(trace_event_reg); void trace_event_enable_cmd_record(bool enable) { struct trace_event_file *file; struct trace_array *tr; lockdep_assert_held(&event_mutex); do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) continue; if (enable) { tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } else { tracing_stop_cmdline_record(); clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } } while_for_each_event_file(); } void trace_event_enable_tgid_record(bool enable) { struct trace_event_file *file; struct trace_array *tr; lockdep_assert_held(&event_mutex); do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) continue; if (enable) { tracing_start_tgid_record(); set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } else { tracing_stop_tgid_record(); clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } } while_for_each_event_file(); } static int __ftrace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; bool soft_mode = atomic_read(&file->sm_ref) != 0; int ret = 0; int disable; switch (enable) { case 0: /* * When soft_disable is set and enable is cleared, the sm_ref * reference counter is decremented. If it reaches 0, we want * to clear the SOFT_DISABLED flag but leave the event in the * state that it was. That is, if the event was enabled and * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED * is set we do not want the event to be enabled before we * clear the bit. * * When soft_disable is not set but the soft_mode is, * we do nothing. Do not disable the tracepoint, otherwise * "soft enable"s (clearing the SOFT_DISABLED bit) won't work. */ if (soft_disable) { if (atomic_dec_return(&file->sm_ref) > 0) break; disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; soft_mode = false; /* Disable use of trace_buffered_event */ trace_buffered_event_disable(); } else disable = !soft_mode; if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) { clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); if (file->flags & EVENT_FILE_FL_RECORDED_CMD) { tracing_stop_cmdline_record(); clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { tracing_stop_tgid_record(); clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } ret = call->class->reg(call, TRACE_REG_UNREGISTER, file); WARN_ON_ONCE(ret); } /* If in soft mode, just set the SOFT_DISABLE_BIT, else clear it */ if (soft_mode) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: /* * When soft_disable is set and enable is set, we want to * register the tracepoint for the event, but leave the event * as is. That means, if the event was already enabled, we do * nothing. If the event is disabled, we set SOFT_DISABLED * before enabling the event tracepoint, so it still seems * to be disabled. */ if (!soft_disable) clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else { if (atomic_inc_return(&file->sm_ref) > 1) break; /* Enable use of trace_buffered_event */ trace_buffered_event_enable(); } if (!(file->flags & EVENT_FILE_FL_ENABLED)) { bool cmd = false, tgid = false; /* Keep the event disabled, when going to soft mode. */ if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); if (tr->trace_flags & TRACE_ITER(RECORD_CMD)) { cmd = true; tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) { tgid = true; tracing_start_tgid_record(); set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { if (cmd) tracing_stop_cmdline_record(); if (tgid) tracing_stop_tgid_record(); pr_info("event trace: Could not enable event " "%s\n", trace_event_name(call)); break; } set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); /* WAS_ENABLED gets set but never cleared. */ set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags); } break; } return ret; } int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { return __ftrace_event_enable_disable(file, enable, soft_disable); } static int ftrace_event_enable_disable(struct trace_event_file *file, int enable) { return __ftrace_event_enable_disable(file, enable, 0); } #ifdef CONFIG_MODULES struct event_mod_load { struct list_head list; char *module; char *match; char *system; char *event; }; static void free_event_mod(struct event_mod_load *event_mod) { list_del(&event_mod->list); kfree(event_mod->module); kfree(event_mod->match); kfree(event_mod->system); kfree(event_mod->event); kfree(event_mod); } static void clear_mod_events(struct trace_array *tr) { struct event_mod_load *event_mod, *n; list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { free_event_mod(event_mod); } } static int remove_cache_mod(struct trace_array *tr, const char *mod, const char *match, const char *system, const char *event) { struct event_mod_load *event_mod, *n; int ret = -EINVAL; list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { if (strcmp(event_mod->module, mod) != 0) continue; if (match && strcmp(event_mod->match, match) != 0) continue; if (system && (!event_mod->system || strcmp(event_mod->system, system) != 0)) continue; if (event && (!event_mod->event || strcmp(event_mod->event, event) != 0)) continue; free_event_mod(event_mod); ret = 0; } return ret; } static int cache_mod(struct trace_array *tr, const char *mod, int set, const char *match, const char *system, const char *event) { struct event_mod_load *event_mod; /* If the module exists, then this just failed to find an event */ if (module_exists(mod)) return -EINVAL; /* See if this is to remove a cached filter */ if (!set) return remove_cache_mod(tr, mod, match, system, event); event_mod = kzalloc(sizeof(*event_mod), GFP_KERNEL); if (!event_mod) return -ENOMEM; INIT_LIST_HEAD(&event_mod->list); event_mod->module = kstrdup(mod, GFP_KERNEL); if (!event_mod->module) goto out_free; if (match) { event_mod->match = kstrdup(match, GFP_KERNEL); if (!event_mod->match) goto out_free; } if (system) { event_mod->system = kstrdup(system, GFP_KERNEL); if (!event_mod->system) goto out_free; } if (event) { event_mod->event = kstrdup(event, GFP_KERNEL); if (!event_mod->event) goto out_free; } list_add(&event_mod->list, &tr->mod_events); return 0; out_free: free_event_mod(event_mod); return -ENOMEM; } #else /* CONFIG_MODULES */ static inline void clear_mod_events(struct trace_array *tr) { } static int cache_mod(struct trace_array *tr, const char *mod, int set, const char *match, const char *system, const char *event) { return -EINVAL; } #endif static void ftrace_clear_events(struct trace_array *tr) { struct trace_event_file *file; mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { ftrace_event_enable_disable(file, 0); } clear_mod_events(tr); mutex_unlock(&event_mutex); } static void event_filter_pid_sched_process_exit(void *data, struct task_struct *task) { struct trace_pid_list *pid_list; struct trace_array *tr = data; pid_list = rcu_dereference_raw(tr->filtered_pids); trace_filter_add_remove_task(pid_list, NULL, task); pid_list = rcu_dereference_raw(tr->filtered_no_pids); trace_filter_add_remove_task(pid_list, NULL, task); } static void event_filter_pid_sched_process_fork(void *data, struct task_struct *self, struct task_struct *task) { struct trace_pid_list *pid_list; struct trace_array *tr = data; pid_list = rcu_dereference_sched(tr->filtered_pids); trace_filter_add_remove_task(pid_list, self, task); pid_list = rcu_dereference_sched(tr->filtered_no_pids); trace_filter_add_remove_task(pid_list, self, task); } void trace_event_follow_fork(struct trace_array *tr, bool enable) { if (enable) { register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork, tr, INT_MIN); register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit, tr, INT_MAX); } else { unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork, tr); unregister_trace_sched_process_free(event_filter_pid_sched_process_exit, tr); } } static void event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; bool ret; pid_list = rcu_dereference_sched(tr->filtered_pids); no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); /* * Sched switch is funny, as we only want to ignore it * in the notrace case if both prev and next should be ignored. */ ret = trace_ignore_this_task(NULL, no_pid_list, prev) && trace_ignore_this_task(NULL, no_pid_list, next); this_cpu_write(tr->array_buffer.data->ignore_pid, ret || (trace_ignore_this_task(pid_list, NULL, prev) && trace_ignore_this_task(pid_list, NULL, next))); } static void event_filter_pid_sched_switch_probe_post(void *data, bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; pid_list = rcu_dereference_sched(tr->filtered_pids); no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, no_pid_list, next)); } static void event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; /* Nothing to do if we are already tracing */ if (!this_cpu_read(tr->array_buffer.data->ignore_pid)) return; pid_list = rcu_dereference_sched(tr->filtered_pids); no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, no_pid_list, task)); } static void event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; /* Nothing to do if we are not tracing */ if (this_cpu_read(tr->array_buffer.data->ignore_pid)) return; pid_list = rcu_dereference_sched(tr->filtered_pids); no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); /* Set tracing if current is enabled */ this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, no_pid_list, current)); } static void unregister_pid_events(struct trace_array *tr) { unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr); unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr); unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr); unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr); unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr); } static void __ftrace_clear_event_pids(struct trace_array *tr, int type) { struct trace_pid_list *pid_list; struct trace_pid_list *no_pid_list; struct trace_event_file *file; int cpu; pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, lockdep_is_held(&event_mutex)); /* Make sure there's something to do */ if (!pid_type_enabled(type, pid_list, no_pid_list)) return; if (!still_need_pid_events(type, pid_list, no_pid_list)) { unregister_pid_events(tr); list_for_each_entry(file, &tr->events, list) { clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); } for_each_possible_cpu(cpu) per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false; } if (type & TRACE_PIDS) rcu_assign_pointer(tr->filtered_pids, NULL); if (type & TRACE_NO_PIDS) rcu_assign_pointer(tr->filtered_no_pids, NULL); /* Wait till all users are no longer using pid filtering */ tracepoint_synchronize_unregister(); if ((type & TRACE_PIDS) && pid_list) trace_pid_list_free(pid_list); if ((type & TRACE_NO_PIDS) && no_pid_list) trace_pid_list_free(no_pid_list); } static void ftrace_clear_event_pids(struct trace_array *tr, int type) { mutex_lock(&event_mutex); __ftrace_clear_event_pids(tr, type); mutex_unlock(&event_mutex); } static void __put_system(struct event_subsystem *system) { struct event_filter *filter = system->filter; WARN_ON_ONCE(system_refcount(system) == 0); if (system_refcount_dec(system)) return; list_del(&system->list); if (filter) { kfree(filter->filter_string); kfree(filter); } kfree_const(system->name); kfree(system); } static void __get_system(struct event_subsystem *system) { WARN_ON_ONCE(system_refcount(system) == 0); system_refcount_inc(system); } static void __get_system_dir(struct trace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); dir->ref_count++; __get_system(dir->subsystem); } static void __put_system_dir(struct trace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); /* If the subsystem is about to be freed, the dir must be too */ WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1); __put_system(dir->subsystem); if (!--dir->ref_count) kfree(dir); } static void put_system(struct trace_subsystem_dir *dir) { mutex_lock(&event_mutex); __put_system_dir(dir); mutex_unlock(&event_mutex); } static void remove_subsystem(struct trace_subsystem_dir *dir) { if (!dir) return; if (!--dir->nr_events) { eventfs_remove_dir(dir->ei); list_del(&dir->list); __put_system_dir(dir); } } void event_file_get(struct trace_event_file *file) { refcount_inc(&file->ref); } void event_file_put(struct trace_event_file *file) { if (WARN_ON_ONCE(!refcount_read(&file->ref))) { if (file->flags & EVENT_FILE_FL_FREED) kmem_cache_free(file_cachep, file); return; } if (refcount_dec_and_test(&file->ref)) { /* Count should only go to zero when it is freed */ if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED))) return; kmem_cache_free(file_cachep, file); } } static void remove_event_file_dir(struct trace_event_file *file) { eventfs_remove_dir(file->ei); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); file->flags |= EVENT_FILE_FL_FREED; event_file_put(file); } /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ static int __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, const char *sub, const char *event, int set, const char *mod) { struct trace_event_file *file; struct trace_event_call *call; char *module __free(kfree) = NULL; const char *name; int ret = -EINVAL; int eret = 0; if (mod) { char *p; module = kstrdup(mod, GFP_KERNEL); if (!module) return -ENOMEM; /* Replace all '-' with '_' as that's what modules do */ for (p = strchr(module, '-'); p; p = strchr(p + 1, '-')) *p = '_'; } list_for_each_entry(file, &tr->events, list) { call = file->event_call; /* If a module is specified, skip events that are not that module */ if (module && (!call->module || strcmp(module_name(call->module), module))) continue; name = trace_event_name(call); if (!name || !call->class || !call->class->reg) continue; if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) continue; if (match && strcmp(match, name) != 0 && strcmp(match, call->class->system) != 0) continue; if (sub && strcmp(sub, call->class->system) != 0) continue; if (event && strcmp(event, name) != 0) continue; ret = ftrace_event_enable_disable(file, set); /* * Save the first error and return that. Some events * may still have been enabled, but let the user * know that something went wrong. */ if (ret && !eret) eret = ret; ret = eret; } /* * If this is a module setting and nothing was found, * check if the module was loaded. If it wasn't cache it. */ if (module && ret == -EINVAL && !eret) ret = cache_mod(tr, module, set, match, sub, event); return ret; } static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, const char *sub, const char *event, int set, const char *mod) { int ret; mutex_lock(&event_mutex); ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod); mutex_unlock(&event_mutex); return ret; } int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match, *mod; int ret; if (!tr) return -ENOENT; /* Modules events can be appended with :mod:<module> */ mod = strstr(buf, ":mod:"); if (mod) { *mod = '\0'; /* move to the module name */ mod += 5; } /* * The buf format can be <subsystem>:<event-name> * *:<event-name> means any event by that name. * :<event-name> is the same. * * <subsystem>:* means all events in that subsystem * <subsystem>: means the same. * * <name> (no ':') means all events in a subsystem with * the name <name> or any event that matches <name> */ match = strsep(&buf, ":"); if (buf) { sub = match; event = buf; match = NULL; if (!strlen(sub) || strcmp(sub, "*") == 0) sub = NULL; if (!strlen(event) || strcmp(event, "*") == 0) event = NULL; } else if (mod) { /* Allow wildcard for no length or star */ if (!strlen(match) || strcmp(match, "*") == 0) match = NULL; } ret = __ftrace_set_clr_event(tr, match, sub, event, set, mod); /* Put back the colon to allow this to be called again */ if (buf) *(buf - 1) = ':'; return ret; } /** * trace_set_clr_event - enable or disable an event * @system: system name to match (NULL for any system) * @event: event name to match (NULL for all events, within system) * @set: 1 to enable, 0 to disable * * This is a way for other parts of the kernel to enable or disable * event recording. * * Returns 0 on success, -EINVAL if the parameters do not match any * registered events. */ int trace_set_clr_event(const char *system, const char *event, int set) { struct trace_array *tr = top_trace_array(); if (!tr) return -ENODEV; return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL); } EXPORT_SYMBOL_GPL(trace_set_clr_event); /** * trace_array_set_clr_event - enable or disable an event for a trace array. * @tr: concerned trace array. * @system: system name to match (NULL for any system) * @event: event name to match (NULL for all events, within system) * @enable: true to enable, false to disable * * This is a way for other parts of the kernel to enable or disable * event recording. * * Returns 0 on success, -EINVAL if the parameters do not match any * registered events. */ int trace_array_set_clr_event(struct trace_array *tr, const char *system, const char *event, bool enable) { int set; if (!tr) return -ENOENT; set = (enable == true) ? 1 : 0; return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL); } EXPORT_SYMBOL_GPL(trace_array_set_clr_event); /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 static ssize_t ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; struct seq_file *m = file->private_data; struct trace_array *tr = m->private; ssize_t read, ret; if (!cnt) return 0; ret = tracing_update_buffers(tr); if (ret < 0) return ret; if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) return -ENOMEM; read = trace_get_user(&parser, ubuf, cnt, ppos); if (read >= 0 && trace_parser_loaded((&parser))) { int set = 1; if (*parser.buffer == '!') set = 0; ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); if (ret) goto out_put; } ret = read; out_put: trace_parser_put(&parser); return ret; } static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_event_file *file = v; struct trace_event_call *call; struct trace_array *tr = m->private; (*pos)++; list_for_each_entry_continue(file, &tr->events, list) { call = file->event_call; /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ if (call->class && call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } return NULL; } static void *t_start(struct seq_file *m, loff_t *pos) { struct trace_event_file *file; struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); file = list_entry(&tr->events, struct trace_event_file, list); for (l = 0; l <= *pos; ) { file = t_next(m, file, &l); if (!file) break; } return file; } enum set_event_iter_type { SET_EVENT_FILE, SET_EVENT_MOD, }; struct set_event_iter { enum set_event_iter_type type; union { struct trace_event_file *file; struct event_mod_load *event_mod; }; }; static void * s_next(struct seq_file *m, void *v, loff_t *pos) { struct set_event_iter *iter = v; struct trace_event_file *file; struct trace_array *tr = m->private; (*pos)++; if (iter->type == SET_EVENT_FILE) { file = iter->file; list_for_each_entry_continue(file, &tr->events, list) { if (file->flags & EVENT_FILE_FL_ENABLED) { iter->file = file; return iter; } } #ifdef CONFIG_MODULES iter->type = SET_EVENT_MOD; iter->event_mod = list_entry(&tr->mod_events, struct event_mod_load, list); #endif } #ifdef CONFIG_MODULES list_for_each_entry_continue(iter->event_mod, &tr->mod_events, list) return iter; #endif /* * The iter is allocated in s_start() and passed via the 'v' * parameter. To stop the iterator, NULL must be returned. But * the return value is what the 'v' parameter in s_stop() receives * and frees. Free iter here as it will no longer be used. */ kfree(iter); return NULL; } static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_array *tr = m->private; struct set_event_iter *iter; loff_t l; iter = kzalloc(sizeof(*iter), GFP_KERNEL); mutex_lock(&event_mutex); if (!iter) return NULL; iter->type = SET_EVENT_FILE; iter->file = list_entry(&tr->events, struct trace_event_file, list); for (l = 0; l <= *pos; ) { iter = s_next(m, iter, &l); if (!iter) break; } return iter; } static int t_show(struct seq_file *m, void *v) { struct trace_event_file *file = v; struct trace_event_call *call = file->event_call; if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); seq_printf(m, "%s\n", trace_event_name(call)); return 0; } static void t_stop(struct seq_file *m, void *p) { mutex_unlock(&event_mutex); } #ifdef CONFIG_MODULES static int s_show(struct seq_file *m, void *v) { struct set_event_iter *iter = v; const char *system; const char *event; if (iter->type == SET_EVENT_FILE) return t_show(m, iter->file); /* When match is set, system and event are not */ if (iter->event_mod->match) { seq_printf(m, "%s:mod:%s\n", iter->event_mod->match, iter->event_mod->module); return 0; } system = iter->event_mod->system ? : "*"; event = iter->event_mod->event ? : "*"; seq_printf(m, "%s:%s:mod:%s\n", system, event, iter->event_mod->module); return 0; } #else /* CONFIG_MODULES */ static int s_show(struct seq_file *m, void *v) { struct set_event_iter *iter = v; return t_show(m, iter->file); } #endif static void s_stop(struct seq_file *m, void *v) { kfree(v); t_stop(m, NULL); } static void * __next(struct seq_file *m, void *v, loff_t *pos, int type) { struct trace_array *tr = m->private; struct trace_pid_list *pid_list; if (type == TRACE_PIDS) pid_list = rcu_dereference_sched(tr->filtered_pids); else pid_list = rcu_dereference_sched(tr->filtered_no_pids); return trace_pid_next(pid_list, v, pos); } static void * p_next(struct seq_file *m, void *v, loff_t *pos) { return __next(m, v, pos, TRACE_PIDS); } static void * np_next(struct seq_file *m, void *v, loff_t *pos) { return __next(m, v, pos, TRACE_NO_PIDS); } static void *__start(struct seq_file *m, loff_t *pos, int type) __acquires(RCU) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; /* * Grab the mutex, to keep calls to p_next() having the same * tr->filtered_pids as p_start() has. * If we just passed the tr->filtered_pids around, then RCU would * have been enough, but doing that makes things more complex. */ mutex_lock(&event_mutex); rcu_read_lock_sched(); if (type == TRACE_PIDS) pid_list = rcu_dereference_sched(tr->filtered_pids); else pid_list = rcu_dereference_sched(tr->filtered_no_pids); if (!pid_list) return NULL; return trace_pid_start(pid_list, pos); } static void *p_start(struct seq_file *m, loff_t *pos) __acquires(RCU) { return __start(m, pos, TRACE_PIDS); } static void *np_start(struct seq_file *m, loff_t *pos) __acquires(RCU) { return __start(m, pos, TRACE_NO_PIDS); } static void p_stop(struct seq_file *m, void *p) __releases(RCU) { rcu_read_unlock_sched(); mutex_unlock(&event_mutex); } static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; unsigned long flags; char buf[4] = "0"; mutex_lock(&event_mutex); file = event_file_file(filp); if (likely(file)) flags = file->flags; mutex_unlock(&event_mutex); if (!file) return -ENODEV; if (flags & EVENT_FILE_FL_ENABLED && !(flags & EVENT_FILE_FL_SOFT_DISABLED)) strcpy(buf, "1"); if (atomic_read(&file->sm_ref) != 0) strcat(buf, "*"); strcat(buf, "\n"); return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); } static ssize_t event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; guard(mutex)(&event_mutex); switch (val) { case 0: case 1: file = event_file_file(filp); if (!file) return -ENODEV; ret = tracing_update_buffers(file->tr); if (ret < 0) return ret; ret = ftrace_event_enable_disable(file, val); if (ret < 0) return ret; break; default: return -EINVAL; } *ppos += cnt; return cnt; } /* * Returns: * 0 : no events exist? * 1 : all events are disabled * 2 : all events are enabled * 3 : some events are enabled and some are enabled */ int trace_events_enabled(struct trace_array *tr, const char *system) { struct trace_event_call *call; struct trace_event_file *file; int set = 0; guard(mutex)(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || !trace_event_name(call) || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system) != 0) continue; /* * We need to find out if all the events are set * or if all events or cleared, or if we have * a mixture. */ set |= (1 << !!(file->flags & EVENT_FILE_FL_ENABLED)); /* * If we have a mixture, no need to look further. */ if (set == 3) break; } return set; } static ssize_t system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; struct trace_array *tr = dir->tr; char buf[2]; int set; int ret; set = trace_events_enabled(tr, system ? system->name : NULL); buf[0] = set_to_char[set]; buf[1] = '\n'; ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); return ret; } static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; const char *name = NULL; unsigned long val; ssize_t ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; ret = tracing_update_buffers(dir->tr); if (ret < 0) return ret; if (val != 0 && val != 1) return -EINVAL; /* * Opening of "enable" adds a ref count to system, * so the name is safe to use. */ if (system) name = system->name; ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val, NULL); if (ret) goto out; ret = cnt; out: *ppos += cnt; return ret; } enum { FORMAT_HEADER = 1, FORMAT_FIELD_SEPERATOR = 2, FORMAT_PRINTFMT = 3, }; static void *f_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_event_file *file = event_file_data(m->private); struct trace_event_call *call = file->event_call; struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); struct list_head *node = v; (*pos)++; switch ((unsigned long)v) { case FORMAT_HEADER: node = common_head; break; case FORMAT_FIELD_SEPERATOR: node = head; break; case FORMAT_PRINTFMT: /* all done */ return NULL; } node = node->prev; if (node == common_head) return (void *)FORMAT_FIELD_SEPERATOR; else if (node == head) return (void *)FORMAT_PRINTFMT; else return node; } static int f_show(struct seq_file *m, void *v) { struct trace_event_file *file = event_file_data(m->private); struct trace_event_call *call = file->event_call; struct ftrace_event_field *field; const char *array_descriptor; switch ((unsigned long)v) { case FORMAT_HEADER: seq_printf(m, "name: %s\n", trace_event_name(call)); seq_printf(m, "ID: %d\n", call->event.type); seq_puts(m, "format:\n"); return 0; case FORMAT_FIELD_SEPERATOR: seq_putc(m, '\n'); return 0; case FORMAT_PRINTFMT: seq_printf(m, "\nprint fmt: %s\n", call->print_fmt); return 0; } field = list_entry(v, struct ftrace_event_field, link); /* * Smartly shows the array type(except dynamic array). * Normal: * field:TYPE VAR * If TYPE := TYPE[LEN], it is shown: * field:TYPE VAR[LEN] */ array_descriptor = strchr(field->type, '['); if (str_has_prefix(field->type, "__data_loc")) array_descriptor = NULL; if (!array_descriptor) seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", field->type, field->name, field->offset, field->size, !!field->is_signed); else if (field->len) seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n", (int)(array_descriptor - field->type), field->type, field->name, field->len, field->offset, field->size, !!field->is_signed); else seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n", (int)(array_descriptor - field->type), field->type, field->name, field->offset, field->size, !!field->is_signed); return 0; } static void *f_start(struct seq_file *m, loff_t *pos) { struct trace_event_file *file; void *p = (void *)FORMAT_HEADER; loff_t l = 0; /* ->stop() is called even if ->start() fails */ mutex_lock(&event_mutex); file = event_file_file(m->private); if (!file) return ERR_PTR(-ENODEV); while (l < *pos && p) p = f_next(m, p, &l); return p; } static void f_stop(struct seq_file *m, void *p) { mutex_unlock(&event_mutex); } static const struct seq_operations trace_format_seq_ops = { .start = f_start, .next = f_next, .stop = f_stop, .show = f_show, }; static int trace_format_open(struct inode *inode, struct file *file) { struct seq_file *m; int ret; /* Do we want to hide event format files on tracefs lockdown? */ ret = seq_open(file, &trace_format_seq_ops); if (ret < 0) return ret; m = file->private_data; m->private = file; return 0; } #ifdef CONFIG_PERF_EVENTS static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { int id = (long)event_file_data(filp); char buf[32]; int len; if (unlikely(!id)) return -ENODEV; len = sprintf(buf, "%d\n", id); return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } #endif static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; struct trace_seq *s; int r = -ENODEV; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); mutex_lock(&event_mutex); file = event_file_file(filp); if (file) print_event_filter(file, s); mutex_unlock(&event_mutex); if (file) r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static ssize_t event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; char *buf; int err = -ENODEV; if (cnt >= PAGE_SIZE) return -EINVAL; buf = memdup_user_nul(ubuf, cnt); if (IS_ERR(buf)) return PTR_ERR(buf); mutex_lock(&event_mutex); file = event_file_file(filp); if (file) { if (file->flags & EVENT_FILE_FL_FREED) err = -ENODEV; else err = apply_event_filter(file, buf); } mutex_unlock(&event_mutex); kfree(buf); if (err < 0) return err; *ppos += cnt; return cnt; } static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { struct trace_subsystem_dir *dir = NULL, *iter_dir; struct trace_array *tr = NULL, *iter_tr; struct event_subsystem *system = NULL; int ret; if (tracing_is_disabled()) return -ENODEV; /* Make sure the system still exists */ mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) { list_for_each_entry(iter_dir, &iter_tr->systems, list) { if (iter_dir == inode->i_private) { /* Don't open systems with no events */ tr = iter_tr; dir = iter_dir; if (dir->nr_events) { __get_system_dir(dir); system = dir->subsystem; } goto exit_loop; } } } exit_loop: mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); if (!system) return -ENODEV; /* Still need to increment the ref count of the system */ if (trace_array_get(tr) < 0) { put_system(dir); return -ENODEV; } ret = tracing_open_generic(inode, filp); if (ret < 0) { trace_array_put(tr); put_system(dir); } return ret; } static int system_tr_open(struct inode *inode, struct file *filp) { struct trace_subsystem_dir *dir; struct trace_array *tr = inode->i_private; int ret; /* Make a temporary dir that has no system but points to tr */ dir = kzalloc(sizeof(*dir), GFP_KERNEL); if (!dir) return -ENOMEM; ret = tracing_open_generic_tr(inode, filp); if (ret < 0) { kfree(dir); return ret; } dir->tr = tr; filp->private_data = dir; return 0; } static int subsystem_release(struct inode *inode, struct file *file) { struct trace_subsystem_dir *dir = file->private_data; trace_array_put(dir->tr); /* * If dir->subsystem is NULL, then this is a temporary * descriptor that was made for a trace_array to enable * all subsystems. */ if (dir->subsystem) put_system(dir); else kfree(dir); return 0; } static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; struct trace_seq *s; int r; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); print_subsystem_event_filter(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static ssize_t subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_subsystem_dir *dir = filp->private_data; char *buf; int err; if (cnt >= PAGE_SIZE) return -EINVAL; buf = memdup_user_nul(ubuf, cnt); if (IS_ERR(buf)) return PTR_ERR(buf); err = apply_subsystem_event_filter(dir, buf); kfree(buf); if (err < 0) return err; *ppos += cnt; return cnt; } static ssize_t show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; struct trace_seq *s; int r; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); ring_buffer_print_page_header(tr->array_buffer.buffer, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static ssize_t show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_seq *s; int r; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); ring_buffer_print_entry_header(s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static void ignore_task_cpu(void *data) { struct trace_array *tr = data; struct trace_pid_list *pid_list; struct trace_pid_list *no_pid_list; /* * This function is called by on_each_cpu() while the * event_mutex is held. */ pid_list = rcu_dereference_protected(tr->filtered_pids, mutex_is_locked(&event_mutex)); no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, mutex_is_locked(&event_mutex)); this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, no_pid_list, current)); } static void register_pid_events(struct trace_array *tr) { /* * Register a probe that is called before all other probes * to set ignore_pid if next or prev do not match. * Register a probe this is called after all other probes * to only keep ignore_pid set if next pid matches. */ register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre, tr, INT_MAX); register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post, tr, 0); register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr, INT_MAX); register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr, 0); register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr, INT_MAX); register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr, 0); register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr, INT_MAX); register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr, 0); } static ssize_t event_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos, int type) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; struct trace_pid_list *filtered_pids = NULL; struct trace_pid_list *other_pids = NULL; struct trace_pid_list *pid_list; struct trace_event_file *file; ssize_t ret; if (!cnt) return 0; ret = tracing_update_buffers(tr); if (ret < 0) return ret; guard(mutex)(&event_mutex); if (type == TRACE_PIDS) { filtered_pids = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); other_pids = rcu_dereference_protected(tr->filtered_no_pids, lockdep_is_held(&event_mutex)); } else { filtered_pids = rcu_dereference_protected(tr->filtered_no_pids, lockdep_is_held(&event_mutex)); other_pids = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) return ret; if (type == TRACE_PIDS) rcu_assign_pointer(tr->filtered_pids, pid_list); else rcu_assign_pointer(tr->filtered_no_pids, pid_list); list_for_each_entry(file, &tr->events, list) { set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); } if (filtered_pids) { tracepoint_synchronize_unregister(); trace_pid_list_free(filtered_pids); } else if (pid_list && !other_pids) { register_pid_events(tr); } /* * Ignoring of pids is done at task switch. But we have to * check for those tasks that are currently running. * Always do this in case a pid was appended or removed. */ on_each_cpu(ignore_task_cpu, tr, 1); *ppos += ret; return ret; } static ssize_t ftrace_event_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { return event_pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS); } static ssize_t ftrace_event_npid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { return event_pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS); } static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); static int ftrace_event_set_pid_open(struct inode *inode, struct file *file); static int ftrace_event_set_npid_open(struct inode *inode, struct file *file); static int ftrace_event_release(struct inode *inode, struct file *file); static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, .show = t_show, .stop = t_stop, }; static const struct seq_operations show_set_event_seq_ops = { .start = s_start, .next = s_next, .show = s_show, .stop = s_stop, }; static const struct seq_operations show_set_pid_seq_ops = { .start = p_start, .next = p_next, .show = trace_pid_show, .stop = p_stop, }; static const struct seq_operations show_set_no_pid_seq_ops = { .start = np_start, .next = np_next, .show = trace_pid_show, .stop = p_stop, }; static const struct file_operations ftrace_avail_fops = { .open = ftrace_event_avail_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static const struct file_operations ftrace_set_event_fops = { .open = ftrace_event_set_open, .read = seq_read, .write = ftrace_event_write, .llseek = seq_lseek, .release = ftrace_event_release, }; static const struct file_operations ftrace_set_event_pid_fops = { .open = ftrace_event_set_pid_open, .read = seq_read, .write = ftrace_event_pid_write, .llseek = seq_lseek, .release = ftrace_event_release, }; static const struct file_operations ftrace_set_event_notrace_pid_fops = { .open = ftrace_event_set_npid_open, .read = seq_read, .write = ftrace_event_npid_write, .llseek = seq_lseek, .release = ftrace_event_release, }; static const struct file_operations ftrace_enable_fops = { .open = tracing_open_file_tr, .read = event_enable_read, .write = event_enable_write, .release = tracing_release_file_tr, .llseek = default_llseek, }; static const struct file_operations ftrace_event_format_fops = { .open = trace_format_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; #ifdef CONFIG_PERF_EVENTS static const struct file_operations ftrace_event_id_fops = { .read = event_id_read, .llseek = default_llseek, }; #endif static const struct file_operations ftrace_event_filter_fops = { .open = tracing_open_file_tr, .read = event_filter_read, .write = event_filter_write, .release = tracing_release_file_tr, .llseek = default_llseek, }; static const struct file_operations ftrace_subsystem_filter_fops = { .open = subsystem_open, .read = subsystem_filter_read, .write = subsystem_filter_write, .llseek = default_llseek, .release = subsystem_release, }; static const struct file_operations ftrace_system_enable_fops = { .open = subsystem_open, .read = system_enable_read, .write = system_enable_write, .llseek = default_llseek, .release = subsystem_release, }; static const struct file_operations ftrace_tr_enable_fops = { .open = system_tr_open, .read = system_enable_read, .write = system_enable_write, .llseek = default_llseek, .release = subsystem_release, }; static const struct file_operations ftrace_show_header_page_fops = { .open = tracing_open_generic_tr, .read = show_header_page_file, .llseek = default_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations ftrace_show_header_event_fops = { .open = tracing_open_generic_tr, .read = show_header_event_file, .llseek = default_llseek, .release = tracing_release_generic_tr, }; static int ftrace_event_open(struct inode *inode, struct file *file, const struct seq_operations *seq_ops) { struct seq_file *m; int ret; ret = security_locked_down(LOCKDOWN_TRACEFS); if (ret) return ret; ret = seq_open(file, seq_ops); if (ret < 0) return ret; m = file->private_data; /* copy tr over to seq ops */ m->private = inode->i_private; return ret; } static int ftrace_event_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); return seq_release(inode, file); } static int ftrace_event_avail_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_event_seq_ops; /* Checks for tracefs lockdown */ return ftrace_event_open(inode, file, seq_ops); } static int ftrace_event_set_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_event_seq_ops; struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_events(tr); ret = ftrace_event_open(inode, file, seq_ops); if (ret < 0) trace_array_put(tr); return ret; } static int ftrace_event_set_pid_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_pid_seq_ops; struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_event_pids(tr, TRACE_PIDS); ret = ftrace_event_open(inode, file, seq_ops); if (ret < 0) trace_array_put(tr); return ret; } static int ftrace_event_set_npid_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_no_pid_seq_ops; struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_event_pids(tr, TRACE_NO_PIDS); ret = ftrace_event_open(inode, file, seq_ops); if (ret < 0) trace_array_put(tr); return ret; } static struct event_subsystem * create_new_subsystem(const char *name) { struct event_subsystem *system; /* need to create new entry */ system = kmalloc(sizeof(*system), GFP_KERNEL); if (!system) return NULL; system->ref_count = 1; /* Only allocate if dynamic (kprobes and modules) */ system->name = kstrdup_const(name, GFP_KERNEL); if (!system->name) goto out_free; system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); if (!system->filter) goto out_free; list_add(&system->list, &event_subsystems); return system; out_free: kfree_const(system->name); kfree(system); return NULL; } static int system_callback(const char *name, umode_t *mode, void **data, const struct file_operations **fops) { if (strcmp(name, "filter") == 0) *fops = &ftrace_subsystem_filter_fops; else if (strcmp(name, "enable") == 0) *fops = &ftrace_system_enable_fops; else return 0; *mode = TRACE_MODE_WRITE; return 1; } static struct eventfs_inode * event_subsystem_dir(struct trace_array *tr, const char *name, struct trace_event_file *file, struct eventfs_inode *parent) { struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; struct eventfs_inode *ei; int nr_entries; static struct eventfs_entry system_entries[] = { { .name = "filter", .callback = system_callback, }, { .name = "enable", .callback = system_callback, } }; /* First see if we did not already create this dir */ list_for_each_entry(dir, &tr->systems, list) { system = dir->subsystem; if (strcmp(system->name, name) == 0) { dir->nr_events++; file->system = dir; return dir->ei; } } /* Now see if the system itself exists. */ system = NULL; list_for_each_entry(iter, &event_subsystems, list) { if (strcmp(iter->name, name) == 0) { system = iter; break; } } dir = kmalloc(sizeof(*dir), GFP_KERNEL); if (!dir) goto out_fail; if (!system) { system = create_new_subsystem(name); if (!system) goto out_free; } else __get_system(system); /* ftrace only has directories no files */ if (strcmp(name, "ftrace") == 0) nr_entries = 0; else nr_entries = ARRAY_SIZE(system_entries); ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir); if (IS_ERR(ei)) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; } dir->ei = ei; dir->tr = tr; dir->ref_count = 1; dir->nr_events = 1; dir->subsystem = system; file->system = dir; list_add(&dir->list, &tr->systems); return dir->ei; out_free: kfree(dir); out_fail: /* Only print this message if failed on memory allocation */ if (!dir || !system) pr_warn("No memory to create event subsystem %s\n", name); return NULL; } static int event_define_fields(struct trace_event_call *call) { struct list_head *head; int ret = 0; /* * Other events may have the same class. Only update * the fields if they are not already defined. */ head = trace_get_fields(call); if (list_empty(head)) { struct trace_event_fields *field = call->class->fields_array; unsigned int offset = sizeof(struct trace_entry); for (; field->type; field++) { if (field->type == TRACE_FUNCTION_TYPE) { field->define_fields(call); break; } offset = ALIGN(offset, field->align); ret = trace_define_field_ext(call, field->type, field->name, offset, field->size, field->is_signed, field->filter_type, field->len, field->needs_test); if (WARN_ON_ONCE(ret)) { pr_err("error code is %d\n", ret); break; } offset += field->size; } } return ret; } static int event_callback(const char *name, umode_t *mode, void **data, const struct file_operations **fops) { struct trace_event_file *file = *data; struct trace_event_call *call = file->event_call; if (strcmp(name, "format") == 0) { *mode = TRACE_MODE_READ; *fops = &ftrace_event_format_fops; return 1; } /* * Only event directories that can be enabled should have * triggers or filters, with the exception of the "print" * event that can have a "trigger" file. */ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { if (call->class->reg && strcmp(name, "enable") == 0) { *mode = TRACE_MODE_WRITE; *fops = &ftrace_enable_fops; return 1; } if (strcmp(name, "filter") == 0) { *mode = TRACE_MODE_WRITE; *fops = &ftrace_event_filter_fops; return 1; } } if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || strcmp(trace_event_name(call), "print") == 0) { if (strcmp(name, "trigger") == 0) { *mode = TRACE_MODE_WRITE; *fops = &event_trigger_fops; return 1; } } #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg && strcmp(name, "id") == 0) { *mode = TRACE_MODE_READ; *data = (void *)(long)call->event.type; *fops = &ftrace_event_id_fops; return 1; } #endif #ifdef CONFIG_HIST_TRIGGERS if (strcmp(name, "hist") == 0) { *mode = TRACE_MODE_READ; *fops = &event_hist_fops; return 1; } #endif #ifdef CONFIG_HIST_TRIGGERS_DEBUG if (strcmp(name, "hist_debug") == 0) { *mode = TRACE_MODE_READ; *fops = &event_hist_debug_fops; return 1; } #endif #ifdef CONFIG_TRACE_EVENT_INJECT if (call->event.type && call->class->reg && strcmp(name, "inject") == 0) { *mode = 0200; *fops = &event_inject_fops; return 1; } #endif return 0; } /* The file is incremented on creation and freeing the enable file decrements it */ static void event_release(const char *name, void *data) { struct trace_event_file *file = data; event_file_put(file); } static int event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; struct eventfs_inode *e_events; struct eventfs_inode *ei; const char *name; int nr_entries; int ret; static struct eventfs_entry event_entries[] = { { .name = "enable", .callback = event_callback, .release = event_release, }, { .name = "filter", .callback = event_callback, }, { .name = "trigger", .callback = event_callback, }, { .name = "format", .callback = event_callback, }, #ifdef CONFIG_PERF_EVENTS { .name = "id", .callback = event_callback, }, #endif #ifdef CONFIG_HIST_TRIGGERS { .name = "hist", .callback = event_callback, }, #endif #ifdef CONFIG_HIST_TRIGGERS_DEBUG { .name = "hist_debug", .callback = event_callback, }, #endif #ifdef CONFIG_TRACE_EVENT_INJECT { .name = "inject", .callback = event_callback, }, #endif }; /* * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". This should * never happen. */ if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0)) return -ENODEV; e_events = event_subsystem_dir(tr, call->class->system, file, parent); if (!e_events) return -ENOMEM; nr_entries = ARRAY_SIZE(event_entries); name = trace_event_name(call); ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file); if (IS_ERR(ei)) { pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } file->ei = ei; ret = event_define_fields(call); if (ret < 0) { pr_warn("Could not initialize trace point events/%s\n", name); return ret; } /* Gets decremented on freeing of the "enable" file */ event_file_get(file); return 0; } static void remove_event_from_tracers(struct trace_event_call *call) { struct trace_event_file *file; struct trace_array *tr; do_for_each_event_file_safe(tr, file) { if (file->event_call != call) continue; remove_event_file_dir(file); /* * The do_for_each_event_file_safe() is * a double loop. After finding the call for this * trace_array, we use break to jump to the next * trace_array. */ break; } while_for_each_event_file(); } static void event_remove(struct trace_event_call *call) { struct trace_array *tr; struct trace_event_file *file; do_for_each_event_file(tr, file) { if (file->event_call != call) continue; if (file->flags & EVENT_FILE_FL_WAS_ENABLED) tr->clear_trace = true; ftrace_event_enable_disable(file, 0); /* * The do_for_each_event_file() is * a double loop. After finding the call for this * trace_array, we use break to jump to the next * trace_array. */ break; } while_for_each_event_file(); if (call->event.funcs) __unregister_trace_event(&call->event); remove_event_from_tracers(call); list_del(&call->list); } static int event_init(struct trace_event_call *call) { int ret = 0; const char *name; name = trace_event_name(call); if (WARN_ON(!name)) return -EINVAL; if (call->class->raw_init) { ret = call->class->raw_init(call); if (ret < 0 && ret != -ENOSYS) pr_warn("Could not initialize trace events/%s\n", name); } return ret; } static int __register_event(struct trace_event_call *call, struct module *mod) { int ret; ret = event_init(call); if (ret < 0) return ret; down_write(&trace_event_sem); list_add(&call->list, &ftrace_events); up_write(&trace_event_sem); if (call->flags & TRACE_EVENT_FL_DYNAMIC) atomic_set(&call->refcnt, 0); else call->module = mod; return 0; } static char *eval_replace(char *ptr, struct trace_eval_map *map, int len) { int rlen; int elen; /* Find the length of the eval value as a string */ elen = snprintf(ptr, 0, "%ld", map->eval_value); /* Make sure there's enough room to replace the string with the value */ if (len < elen) return NULL; snprintf(ptr, elen + 1, "%ld", map->eval_value); /* Get the rest of the string of ptr */ rlen = strlen(ptr + len); memmove(ptr + elen, ptr + len, rlen); /* Make sure we end the new string */ ptr[elen + rlen] = 0; return ptr + elen; } static void update_event_printk(struct trace_event_call *call, struct trace_eval_map *map) { char *ptr; int quote = 0; int len = strlen(map->eval_string); for (ptr = call->print_fmt; *ptr; ptr++) { if (*ptr == '\\') { ptr++; /* paranoid */ if (!*ptr) break; continue; } if (*ptr == '"') { quote ^= 1; continue; } if (quote) continue; if (isdigit(*ptr)) { /* skip numbers */ do { ptr++; /* Check for alpha chars like ULL */ } while (isalnum(*ptr)); if (!*ptr) break; /* * A number must have some kind of delimiter after * it, and we can ignore that too. */ continue; } if (isalpha(*ptr) || *ptr == '_') { if (strncmp(map->eval_string, ptr, len) == 0 && !isalnum(ptr[len]) && ptr[len] != '_') { ptr = eval_replace(ptr, map, len); /* enum/sizeof string smaller than value */ if (WARN_ON_ONCE(!ptr)) return; /* * No need to decrement here, as eval_replace() * returns the pointer to the character passed * the eval, and two evals can not be placed * back to back without something in between. * We can skip that something in between. */ continue; } skip_more: do { ptr++; } while (isalnum(*ptr) || *ptr == '_'); if (!*ptr) break; /* * If what comes after this variable is a '.' or * '->' then we can continue to ignore that string. */ if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { ptr += *ptr == '.' ? 1 : 2; if (!*ptr) break; goto skip_more; } /* * Once again, we can skip the delimiter that came * after the string. */ continue; } } } static void add_str_to_module(struct module *module, char *str) { struct module_string *modstr; modstr = kmalloc(sizeof(*modstr), GFP_KERNEL); /* * If we failed to allocate memory here, then we'll just * let the str memory leak when the module is removed. * If this fails to allocate, there's worse problems than * a leaked string on module removal. */ if (WARN_ON_ONCE(!modstr)) return; modstr->module = module; modstr->str = str; list_add(&modstr->next, &module_strings); } #define ATTRIBUTE_STR "__attribute__(" #define ATTRIBUTE_STR_LEN (sizeof(ATTRIBUTE_STR) - 1) /* Remove all __attribute__() from @type. Return allocated string or @type. */ static char *sanitize_field_type(const char *type) { char *attr, *tmp, *next, *ret = (char *)type; int depth; next = (char *)type; while ((attr = strstr(next, ATTRIBUTE_STR))) { /* Retry if "__attribute__(" is a part of another word. */ if (attr != next && !isspace(attr[-1])) { next = attr + ATTRIBUTE_STR_LEN; continue; } if (ret == type) { ret = kstrdup(type, GFP_KERNEL); if (WARN_ON_ONCE(!ret)) return NULL; attr = ret + (attr - type); } /* the ATTRIBUTE_STR already has the first '(' */ depth = 1; next = attr + ATTRIBUTE_STR_LEN; do { tmp = strpbrk(next, "()"); /* There is unbalanced parentheses */ if (WARN_ON_ONCE(!tmp)) { kfree(ret); return (char *)type; } if (*tmp == '(') depth++; else depth--; next = tmp + 1; } while (depth > 0); next = skip_spaces(next); strcpy(attr, next); next = attr; } return ret; } static char *find_replacable_eval(const char *type, const char *eval_string, int len) { char *ptr; if (!eval_string) return NULL; ptr = strchr(type, '['); if (!ptr) return NULL; ptr++; if (!isalpha(*ptr) && *ptr != '_') return NULL; if (strncmp(eval_string, ptr, len) != 0) return NULL; return ptr; } static void update_event_fields(struct trace_event_call *call, struct trace_eval_map *map) { struct ftrace_event_field *field; const char *eval_string = NULL; struct list_head *head; int len = 0; char *ptr; char *str; /* Dynamic events should never have field maps */ if (call->flags & TRACE_EVENT_FL_DYNAMIC) return; if (map) { eval_string = map->eval_string; len = strlen(map->eval_string); } head = trace_get_fields(call); list_for_each_entry(field, head, link) { str = sanitize_field_type(field->type); if (!str) return; ptr = find_replacable_eval(str, eval_string, len); if (ptr) { if (str == field->type) { str = kstrdup(field->type, GFP_KERNEL); if (WARN_ON_ONCE(!str)) return; ptr = str + (ptr - field->type); } ptr = eval_replace(ptr, map, len); /* enum/sizeof string smaller than value */ if (WARN_ON_ONCE(!ptr)) { kfree(str); continue; } } if (str == field->type) continue; /* * If the event is part of a module, then we need to free the string * when the module is removed. Otherwise, it will stay allocated * until a reboot. */ if (call->module) add_str_to_module(call->module, str); field->type = str; if (field->filter_type == FILTER_OTHER) field->filter_type = filter_assign_type(field->type); } } /* Update all events for replacing eval and sanitizing */ void trace_event_update_all(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; bool first = false; bool updated; int last_i; int i; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { /* events are usually grouped together with systems */ if (!last_system || call->class->system != last_system) { first = true; last_i = 0; last_system = call->class->system; } updated = false; /* * Since calls are grouped by systems, the likelihood that the * next call in the iteration belongs to the same system as the * previous call is high. As an optimization, we skip searching * for a map[] that matches the call's system if the last call * was from the same system. That's what last_i is for. If the * call has the same system as the previous call, then last_i * will be the index of the first map[] that has a matching * system. */ for (i = last_i; i < len; i++) { if (call->class->system == map[i]->system) { /* Save the first system if need be */ if (first) { last_i = i; first = false; } update_event_printk(call, map[i]); update_event_fields(call, map[i]); updated = true; } } /* If not updated yet, update field for sanitizing. */ if (!updated) update_event_fields(call, NULL); cond_resched(); } up_write(&trace_event_sem); } static bool event_in_systems(struct trace_event_call *call, const char *systems) { const char *system; const char *p; if (!systems) return true; system = call->class->system; p = strstr(systems, system); if (!p) return false; if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',') return false; p += strlen(system); return !*p || isspace(*p) || *p == ','; } #ifdef CONFIG_HIST_TRIGGERS /* * Wake up waiter on the hist_poll_wq from irq_work because the hist trigger * may happen in any context. */ static void hist_poll_event_irq_work(struct irq_work *work) { wake_up_all(&hist_poll_wq); } DEFINE_IRQ_WORK(hist_poll_work, hist_poll_event_irq_work); DECLARE_WAIT_QUEUE_HEAD(hist_poll_wq); #endif static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; struct trace_event_file *file; unsigned int first; if (!event_in_systems(call, tr->system_names)) return NULL; file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) return ERR_PTR(-ENOMEM); pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, lockdep_is_held(&event_mutex)); if (!trace_pid_list_first(pid_list, &first) || !trace_pid_list_first(no_pid_list, &first)) file->flags |= EVENT_FILE_FL_PID_FILTER; file->event_call = call; file->tr = tr; atomic_set(&file->sm_ref, 0); atomic_set(&file->tm_ref, 0); INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); refcount_set(&file->ref, 1); return file; } #define MAX_BOOT_TRIGGERS 32 static struct boot_triggers { const char *event; char *trigger; } bootup_triggers[MAX_BOOT_TRIGGERS]; static char bootup_trigger_buf[COMMAND_LINE_SIZE]; static int nr_boot_triggers; static __init int setup_trace_triggers(char *str) { char *trigger; char *buf; int i; strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event triggers"); buf = bootup_trigger_buf; for (i = 0; i < MAX_BOOT_TRIGGERS; i++) { trigger = strsep(&buf, ","); if (!trigger) break; bootup_triggers[i].event = strsep(&trigger, "."); bootup_triggers[i].trigger = trigger; if (!bootup_triggers[i].trigger) break; } nr_boot_triggers = i; return 1; } __setup("trace_trigger=", setup_trace_triggers); /* Add an event to a trace directory */ static int __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_event_file *file; file = trace_create_new_event(call, tr); /* * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed * allocation, or NULL if the event is not part of the tr->system_names. * When the event is not part of the tr->system_names, return zero, not * an error. */ if (!file) return 0; if (IS_ERR(file)) return PTR_ERR(file); if (eventdir_initialized) return event_create_dir(tr->event_dir, file); else return event_define_fields(call); } static void trace_early_triggers(struct trace_event_file *file, const char *name) { int ret; int i; for (i = 0; i < nr_boot_triggers; i++) { if (strcmp(name, bootup_triggers[i].event)) continue; mutex_lock(&event_mutex); ret = trigger_process_regex(file, bootup_triggers[i].trigger); mutex_unlock(&event_mutex); if (ret) pr_err("Failed to register trigger '%s' on event %s\n", bootup_triggers[i].trigger, bootup_triggers[i].event); } } /* * Just create a descriptor for early init. A descriptor is required * for enabling events at boot. We want to enable events before * the filesystem is initialized. */ static int __trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_event_file *file; int ret; file = trace_create_new_event(call, tr); /* * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed * allocation, or NULL if the event is not part of the tr->system_names. * When the event is not part of the tr->system_names, return zero, not * an error. */ if (!file) return 0; if (IS_ERR(file)) return PTR_ERR(file); ret = event_define_fields(call); if (ret) return ret; trace_early_triggers(file, trace_event_name(call)); return 0; } struct ftrace_module_file_ops; static void __add_event_to_tracers(struct trace_event_call *call); /* Add an additional event_call dynamically */ int trace_add_event_call(struct trace_event_call *call) { int ret; lockdep_assert_held(&event_mutex); guard(mutex)(&trace_types_lock); ret = __register_event(call, NULL); if (ret < 0) return ret; __add_event_to_tracers(call); return ret; } EXPORT_SYMBOL_GPL(trace_add_event_call); /* * Must be called under locking of trace_types_lock, event_mutex and * trace_event_sem. */ static void __trace_remove_event_call(struct trace_event_call *call) { event_remove(call); trace_destroy_fields(call); } static int probe_remove_event_call(struct trace_event_call *call) { struct trace_array *tr; struct trace_event_file *file; #ifdef CONFIG_PERF_EVENTS if (call->perf_refcount) return -EBUSY; #endif do_for_each_event_file(tr, file) { if (file->event_call != call) continue; /* * We can't rely on ftrace_event_enable_disable(enable => 0) * we are going to do, soft mode can suppress * TRACE_REG_UNREGISTER. */ if (file->flags & EVENT_FILE_FL_ENABLED) goto busy; if (file->flags & EVENT_FILE_FL_WAS_ENABLED) tr->clear_trace = true; /* * The do_for_each_event_file_safe() is * a double loop. After finding the call for this * trace_array, we use break to jump to the next * trace_array. */ break; } while_for_each_event_file(); __trace_remove_event_call(call); return 0; busy: /* No need to clear the trace now */ list_for_each_entry(tr, &ftrace_trace_arrays, list) { tr->clear_trace = false; } return -EBUSY; } /* Remove an event_call */ int trace_remove_event_call(struct trace_event_call *call) { int ret; lockdep_assert_held(&event_mutex); mutex_lock(&trace_types_lock); down_write(&trace_event_sem); ret = probe_remove_event_call(call); up_write(&trace_event_sem); mutex_unlock(&trace_types_lock); return ret; } EXPORT_SYMBOL_GPL(trace_remove_event_call); #define for_each_event(event, start, end) \ for (event = start; \ (unsigned long)event < (unsigned long)end; \ event++) #ifdef CONFIG_MODULES static void update_mod_cache(struct trace_array *tr, struct module *mod) { struct event_mod_load *event_mod, *n; list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { if (strcmp(event_mod->module, mod->name) != 0) continue; __ftrace_set_clr_event_nolock(tr, event_mod->match, event_mod->system, event_mod->event, 1, mod->name); free_event_mod(event_mod); } } static void update_cache_events(struct module *mod) { struct trace_array *tr; list_for_each_entry(tr, &ftrace_trace_arrays, list) update_mod_cache(tr, mod); } static void trace_module_add_events(struct module *mod) { struct trace_event_call **call, **start, **end; if (!mod->num_trace_events) return; /* Don't add infrastructure for mods without tracepoints */ if (trace_module_has_bad_taint(mod)) { pr_err("%s: module has bad taint, not creating trace events\n", mod->name); return; } start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; for_each_event(call, start, end) { __register_event(*call, mod); __add_event_to_tracers(*call); } update_cache_events(mod); } static void trace_module_remove_events(struct module *mod) { struct trace_event_call *call, *p; struct module_string *modstr, *m; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module) continue; if (call->module == mod) __trace_remove_event_call(call); } /* Check for any strings allocated for this module */ list_for_each_entry_safe(modstr, m, &module_strings, next) { if (modstr->module != mod) continue; list_del(&modstr->next); kfree(modstr->str); kfree(modstr); } up_write(&trace_event_sem); /* * It is safest to reset the ring buffer if the module being unloaded * registered any events that were used. The only worry is if * a new module gets loaded, and takes on the same id as the events * of this module. When printing out the buffer, traced events left * over from this module may be passed to the new module events and * unexpected results may occur. */ tracing_reset_all_online_cpus_unlocked(); } static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); switch (val) { case MODULE_STATE_COMING: trace_module_add_events(mod); break; case MODULE_STATE_GOING: trace_module_remove_events(mod); break; } mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); return NOTIFY_OK; } static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 1, /* higher than trace.c module notify */ }; #endif /* CONFIG_MODULES */ /* Create a new event directory structure for a trace directory. */ static void __trace_add_event_dirs(struct trace_array *tr) { struct trace_event_call *call; int ret; lockdep_assert_held(&trace_event_sem); list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) pr_warn("Could not create directory for event %s\n", trace_event_name(call)); } } /* Returns any file that matches the system and event */ struct trace_event_file * __find_event_file(struct trace_array *tr, const char *system, const char *event) { struct trace_event_file *file; struct trace_event_call *call; const char *name; list_for_each_entry(file, &tr->events, list) { call = file->event_call; name = trace_event_name(call); if (!name || !call->class) continue; if (strcmp(event, name) == 0 && strcmp(system, call->class->system) == 0) return file; } return NULL; } /* Returns valid trace event files that match system and event */ struct trace_event_file * find_event_file(struct trace_array *tr, const char *system, const char *event) { struct trace_event_file *file; file = __find_event_file(tr, system, event); if (!file || !file->event_call->class->reg || file->event_call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) return NULL; return file; } /** * trace_get_event_file - Find and return a trace event file * @instance: The name of the trace instance containing the event * @system: The name of the system containing the event * @event: The name of the event * * Return a trace event file given the trace instance name, trace * system, and trace event name. If the instance name is NULL, it * refers to the top-level trace array. * * This function will look it up and return it if found, after calling * trace_array_get() to prevent the instance from going away, and * increment the event's module refcount to prevent it from being * removed. * * To release the file, call trace_put_event_file(), which will call * trace_array_put() and decrement the event's module refcount. * * Return: The trace event on success, ERR_PTR otherwise. */ struct trace_event_file *trace_get_event_file(const char *instance, const char *system, const char *event) { struct trace_array *tr = top_trace_array(); struct trace_event_file *file = NULL; int ret = -EINVAL; if (instance) { tr = trace_array_find_get(instance); if (!tr) return ERR_PTR(-ENOENT); } else { ret = trace_array_get(tr); if (ret) return ERR_PTR(ret); } guard(mutex)(&event_mutex); file = find_event_file(tr, system, event); if (!file) { trace_array_put(tr); return ERR_PTR(-EINVAL); } /* Don't let event modules unload while in use */ ret = trace_event_try_get_ref(file->event_call); if (!ret) { trace_array_put(tr); return ERR_PTR(-EBUSY); } return file; } EXPORT_SYMBOL_GPL(trace_get_event_file); /** * trace_put_event_file - Release a file from trace_get_event_file() * @file: The trace event file * * If a file was retrieved using trace_get_event_file(), this should * be called when it's no longer needed. It will cancel the previous * trace_array_get() called by that function, and decrement the * event's module refcount. */ void trace_put_event_file(struct trace_event_file *file) { mutex_lock(&event_mutex); trace_event_put_ref(file->event_call); mutex_unlock(&event_mutex); trace_array_put(file->tr); } EXPORT_SYMBOL_GPL(trace_put_event_file); #ifdef CONFIG_DYNAMIC_FTRACE /* Avoid typos */ #define ENABLE_EVENT_STR "enable_event" #define DISABLE_EVENT_STR "disable_event" struct event_probe_data { struct trace_event_file *file; unsigned long count; int ref; bool enable; }; static void update_event_probe(struct event_probe_data *data) { if (data->enable) clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); else set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); } static void event_enable_probe(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (!pdata || !*pdata) return; edata = *pdata; update_event_probe(edata); } static void event_enable_count_probe(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (!pdata || !*pdata) return; edata = *pdata; if (!edata->count) return; /* Skip if the event is in a state we want to switch to */ if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) return; if (edata->count != -1) (edata->count)--; update_event_probe(edata); } static int event_enable_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (WARN_ON_ONCE(!pdata || !*pdata)) return 0; edata = *pdata; seq_printf(m, "%ps:", (void *)ip); seq_printf(m, "%s:%s:%s", edata->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, edata->file->event_call->class->system, trace_event_name(edata->file->event_call)); if (edata->count == -1) seq_puts(m, ":unlimited\n"); else seq_printf(m, ":count=%ld\n", edata->count); return 0; } static int event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *init_data, void **data) { struct ftrace_func_mapper *mapper = *data; struct event_probe_data *edata = init_data; int ret; if (!mapper) { mapper = allocate_ftrace_func_mapper(); if (!mapper) return -ENODEV; *data = mapper; } ret = ftrace_func_mapper_add_ip(mapper, ip, edata); if (ret < 0) return ret; edata->ref++; return 0; } static int free_probe_data(void *data) { struct event_probe_data *edata = data; edata->ref--; if (!edata->ref) { /* Remove soft mode */ __ftrace_event_enable_disable(edata->file, 0, 1); trace_event_put_ref(edata->file->event_call); kfree(edata); } return 0; } static void event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; if (!ip) { if (!mapper) return; free_ftrace_func_mapper(mapper, free_probe_data); return; } edata = ftrace_func_mapper_remove_ip(mapper, ip); if (WARN_ON_ONCE(!edata)) return; if (WARN_ON_ONCE(edata->ref <= 0)) return; free_probe_data(edata); } static struct ftrace_probe_ops event_enable_probe_ops = { .func = event_enable_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static struct ftrace_probe_ops event_enable_count_probe_ops = { .func = event_enable_count_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static struct ftrace_probe_ops event_disable_probe_ops = { .func = event_enable_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static struct ftrace_probe_ops event_disable_count_probe_ops = { .func = event_enable_count_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static int event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enabled) { struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; unsigned long count = -1; const char *system; const char *event; char *number; bool enable; int ret; if (!tr) return -ENODEV; /* hash funcs only work with set_ftrace_filter */ if (!enabled || !param) return -EINVAL; system = strsep(&param, ":"); if (!param) return -EINVAL; event = strsep(&param, ":"); guard(mutex)(&event_mutex); file = find_event_file(tr, system, event); if (!file) return -EINVAL; enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; if (enable) ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; else ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; if (glob[0] == '!') return unregister_ftrace_function_probe_func(glob+1, tr, ops); if (param) { number = strsep(&param, ":"); if (!strlen(number)) return -EINVAL; /* * We use the callback data field (which is a pointer) * as our counter. */ ret = kstrtoul(number, 0, &count); if (ret) return ret; } /* Don't let event modules unload while probe registered */ ret = trace_event_try_get_ref(file->event_call); if (!ret) return -EBUSY; ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) goto out_put; ret = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out_put; data->enable = enable; data->count = count; data->file = file; ret = register_ftrace_function_probe(glob, tr, ops, data); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. * Consider no functions a failure too. */ /* Just return zero, not the number of enabled functions */ if (ret > 0) return 0; kfree(data); if (!ret) ret = -ENOENT; __ftrace_event_enable_disable(file, 0, 1); out_put: trace_event_put_ref(file->event_call); return ret; } static struct ftrace_func_command event_enable_cmd = { .name = ENABLE_EVENT_STR, .func = event_enable_func, }; static struct ftrace_func_command event_disable_cmd = { .name = DISABLE_EVENT_STR, .func = event_enable_func, }; static __init int register_event_cmds(void) { int ret; ret = register_ftrace_command(&event_enable_cmd); if (WARN_ON(ret < 0)) return ret; ret = register_ftrace_command(&event_disable_cmd); if (WARN_ON(ret < 0)) unregister_ftrace_command(&event_enable_cmd); return ret; } #else static inline int register_event_cmds(void) { return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ /* * The top level array and trace arrays created by boot-time tracing * have already had its trace_event_file descriptors created in order * to allow for early events to be recorded. * This function is called after the tracefs has been initialized, * and we now have to create the files associated to the events. */ static void __trace_early_add_event_dirs(struct trace_array *tr) { struct trace_event_file *file; int ret; list_for_each_entry(file, &tr->events, list) { ret = event_create_dir(tr->event_dir, file); if (ret < 0) pr_warn("Could not create directory for event %s\n", trace_event_name(file->event_call)); } } /* * For early boot up, the top trace array and the trace arrays created * by boot-time tracing require to have a list of events that can be * enabled. This must be done before the filesystem is set up in order * to allow events to be traced early. */ void __trace_early_add_events(struct trace_array *tr) { struct trace_event_call *call; int ret; list_for_each_entry(call, &ftrace_events, list) { /* Early boot up should not have any modules loaded */ if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) && WARN_ON_ONCE(call->module)) continue; ret = __trace_early_add_new_event(call, tr); if (ret < 0) pr_warn("Could not create early event %s\n", trace_event_name(call)); } } /* Remove the event directory structure for a trace directory. */ static void __trace_remove_event_dirs(struct trace_array *tr) { struct trace_event_file *file, *next; list_for_each_entry_safe(file, next, &tr->events, list) remove_event_file_dir(file); } static void __add_event_to_tracers(struct trace_event_call *call) { struct trace_array *tr; list_for_each_entry(tr, &ftrace_trace_arrays, list) __trace_add_new_event(call, tr); } extern struct trace_event_call *__start_ftrace_events[]; extern struct trace_event_call *__stop_ftrace_events[]; static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE); trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event tracing"); return 1; } __setup("trace_event=", setup_trace_event); static int events_callback(const char *name, umode_t *mode, void **data, const struct file_operations **fops) { if (strcmp(name, "enable") == 0) { *mode = TRACE_MODE_WRITE; *fops = &ftrace_tr_enable_fops; return 1; } if (strcmp(name, "header_page") == 0) { *mode = TRACE_MODE_READ; *fops = &ftrace_show_header_page_fops; } else if (strcmp(name, "header_event") == 0) { *mode = TRACE_MODE_READ; *fops = &ftrace_show_header_event_fops; } else return 0; return 1; } /* Expects to have event_mutex held when called */ static int create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) { struct eventfs_inode *e_events; struct dentry *entry; int nr_entries; static struct eventfs_entry events_entries[] = { { .name = "enable", .callback = events_callback, }, { .name = "header_page", .callback = events_callback, }, { .name = "header_event", .callback = events_callback, }, }; entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_fops); if (!entry) return -ENOMEM; nr_entries = ARRAY_SIZE(events_entries); e_events = eventfs_create_events_dir("events", parent, events_entries, nr_entries, tr); if (IS_ERR(e_events)) { pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } /* There are not as crucial, just warn if they are not created */ trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_pid_fops); trace_create_file("set_event_notrace_pid", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_notrace_pid_fops); tr->event_dir = e_events; return 0; } /** * event_trace_add_tracer - add a instance of a trace_array to events * @parent: The parent dentry to place the files/directories for events in * @tr: The trace array associated with these events * * When a new instance is created, it needs to set up its events * directory, as well as other files associated with events. It also * creates the event hierarchy in the @parent/events directory. * * Returns 0 on success. * * Must be called with event_mutex held. */ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; lockdep_assert_held(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) goto out; down_write(&trace_event_sem); /* If tr already has the event list, it is initialized in early boot. */ if (unlikely(!list_empty(&tr->events))) __trace_early_add_event_dirs(tr); else __trace_add_event_dirs(tr); up_write(&trace_event_sem); out: return ret; } /* * The top trace array already had its file descriptors created. * Now the files themselves need to be created. */ static __init int early_event_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; guard(mutex)(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) return ret; down_write(&trace_event_sem); __trace_early_add_event_dirs(tr); up_write(&trace_event_sem); return 0; } /* Must be called with event_mutex held */ int event_trace_del_tracer(struct trace_array *tr) { lockdep_assert_held(&event_mutex); /* Disable any event triggers and associated soft-disabled events */ clear_event_triggers(tr); /* Clear the pid list */ __ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS); /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0, NULL); /* Make sure no more events are being executed */ tracepoint_synchronize_unregister(); down_write(&trace_event_sem); __trace_remove_event_dirs(tr); eventfs_remove_events_dir(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; return 0; } static __init int event_trace_memsetup(void) { field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); file_cachep = KMEM_CACHE(trace_event_file, SLAB_PANIC); return 0; } __init void early_enable_events(struct trace_array *tr, char *buf, bool disable_first) { char *token; int ret; while (true) { token = strsep(&buf, ","); if (!token) break; if (*token) { /* Restarting syscalls requires that we stop them first */ if (disable_first) ftrace_set_clr_event(tr, token, 0); ret = ftrace_set_clr_event(tr, token, 1); if (ret) pr_warn("Failed to enable trace event: %s\n", token); } /* Put back the comma to allow this to be called again */ if (buf) *(buf - 1) = ','; } } static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); struct trace_event_call **iter, *call; int ret; if (!tr) return -ENODEV; for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { call = *iter; ret = event_init(call); if (!ret) list_add(&call->list, &ftrace_events); } register_trigger_cmds(); /* * We need the top trace array to have a working set of trace * points at early init, before the debug files and directories * are created. Create the file entries now, and attach them * to the actual file dentries later. */ __trace_early_add_events(tr); early_enable_events(tr, bootup_event_buf, false); trace_printk_start_comm(); register_event_cmds(); return 0; } /* * event_trace_enable() is called from trace_event_init() first to * initialize events and perhaps start any events that are on the * command line. Unfortunately, there are some events that will not * start this early, like the system call tracepoints that need * to set the %SYSCALL_WORK_SYSCALL_TRACEPOINT flag of pid 1. But * event_trace_enable() is called before pid 1 starts, and this flag * is never set, making the syscall tracepoint never get reached, but * the event is enabled regardless (and not doing anything). */ static __init int event_trace_enable_again(void) { struct trace_array *tr; tr = top_trace_array(); if (!tr) return -ENODEV; early_enable_events(tr, bootup_event_buf, true); return 0; } early_initcall(event_trace_enable_again); /* Init fields which doesn't related to the tracefs */ static __init int event_trace_init_fields(void) { if (trace_define_generic_fields()) pr_warn("tracing: Failed to allocated generic fields"); if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); return 0; } __init int event_trace_init(void) { struct trace_array *tr; int ret; tr = top_trace_array(); if (!tr) return -ENODEV; trace_create_file("available_events", TRACE_MODE_READ, NULL, tr, &ftrace_avail_fops); ret = early_event_add_tracer(NULL, tr); if (ret) return ret; #ifdef CONFIG_MODULES ret = register_module_notifier(&trace_module_nb); if (ret) pr_warn("Failed to register trace events module notifier\n"); #endif eventdir_initialized = true; return 0; } void __init trace_event_init(void) { event_trace_memsetup(); init_ftrace_syscalls(); event_trace_enable(); event_trace_init_fields(); } #ifdef CONFIG_EVENT_TRACE_STARTUP_TEST static DEFINE_SPINLOCK(test_spinlock); static DEFINE_SPINLOCK(test_spinlock_irq); static DEFINE_MUTEX(test_mutex); static __init void test_work(struct work_struct *dummy) { spin_lock(&test_spinlock); spin_lock_irq(&test_spinlock_irq); udelay(1); spin_unlock_irq(&test_spinlock_irq); spin_unlock(&test_spinlock); mutex_lock(&test_mutex); msleep(1); mutex_unlock(&test_mutex); } static __init int event_test_thread(void *unused) { void *test_malloc; test_malloc = kmalloc(1234, GFP_KERNEL); if (!test_malloc) pr_info("failed to kmalloc\n"); schedule_on_each_cpu(test_work); kfree(test_malloc); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { schedule(); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return 0; } /* * Do various things that may trigger events. */ static __init void event_test_stuff(void) { struct task_struct *test_thread; test_thread = kthread_run(event_test_thread, NULL, "test-events"); msleep(1); kthread_stop(test_thread); } /* * For every trace event defined, we will test each trace point separately, * and then by groups, and finally all trace points. */ static __init void event_trace_self_tests(void) { struct trace_subsystem_dir *dir; struct trace_event_file *file; struct trace_event_call *call; struct event_subsystem *system; struct trace_array *tr; int ret; tr = top_trace_array(); if (!tr) return; pr_info("Running tests on trace events:\n"); list_for_each_entry(file, &tr->events, list) { call = file->event_call; /* Only test those that have a probe */ if (!call->class || !call->class->probe) continue; /* * Testing syscall events here is pretty useless, but * we still do it if configured. But this is time consuming. * What we really need is a user thread to perform the * syscalls as we test. */ #ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS if (call->class->system && strcmp(call->class->system, "syscalls") == 0) continue; #endif pr_info("Testing event %s: ", trace_event_name(call)); /* * If an event is already enabled, someone is using * it and the self test should not be on. */ if (file->flags & EVENT_FILE_FL_ENABLED) { pr_warn("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; } ftrace_event_enable_disable(file, 1); event_test_stuff(); ftrace_event_enable_disable(file, 0); pr_cont("OK\n"); } /* Now test at the sub system level */ pr_info("Running tests on trace event systems:\n"); list_for_each_entry(dir, &tr->systems, list) { system = dir->subsystem; /* the ftrace system is special, skip it */ if (strcmp(system->name, "ftrace") == 0) continue; pr_info("Testing event system %s: ", system->name); ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling system %s\n", system->name); continue; } event_test_stuff(); ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling system %s\n", system->name); continue; } pr_cont("OK\n"); } /* Test with all events enabled */ pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling all events\n"); return; } event_test_stuff(); /* reset sysname */ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling all events\n"); return; } pr_cont("OK\n"); } #ifdef CONFIG_FUNCTION_TRACER static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static struct trace_event_file event_trace_file __initdata; static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *regs) { struct trace_buffer *buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; unsigned int trace_ctx; long disabled; int cpu; trace_ctx = tracing_gen_ctx(); preempt_disable_notrace(); cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file, TRACE_FN, sizeof(*entry), trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; event_trigger_unlock_commit(&event_trace_file, buffer, event, entry, trace_ctx); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); preempt_enable_notrace(); } static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, }; static __init void event_trace_self_test_with_function(void) { int ret; event_trace_file.tr = top_trace_array(); if (WARN_ON(!event_trace_file.tr)) return; ret = register_ftrace_function(&trace_ops); if (WARN_ON(ret < 0)) { pr_info("Failed to enable function tracer for event tests\n"); return; } pr_info("Running tests again, along with the function tracer\n"); event_trace_self_tests(); unregister_ftrace_function(&trace_ops); } #else static __init void event_trace_self_test_with_function(void) { } #endif static __init int event_trace_self_tests_init(void) { if (!tracing_selftest_disabled) { event_trace_self_tests(); event_trace_self_test_with_function(); } return 0; } late_initcall(event_trace_self_tests_init); #endif
2556 2583 66 89 89 67 67 67 66 67 67 113 29 29 113 69 44 69 38 38 38 38 38 38 38 38 25 25 38 38 38 38 38 38 89 38 38 38 38 38 38 38 38 38 37 38 38 36 2 2 155 155 89 89 89 155 113 113 113 113 155 155 155 155 155 89 89 89 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/dd.c - The core device/driver interactions. * * This file contains the (sometimes tricky) code that controls the * interactions between devices and drivers, which primarily includes * driver binding and unbinding. * * All of this code used to exist in drivers/base/bus.c, but was * relocated to here in the name of compartmentalization (since it wasn't * strictly code just for the 'struct bus_type'. * * Copyright (c) 2002-5 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2007-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2007-2009 Novell Inc. */ #include <linux/debugfs.h> #include <linux/device.h> #include <linux/delay.h> #include <linux/dma-map-ops.h> #include <linux/init.h> #include <linux/module.h> #include <linux/kthread.h> #include <linux/wait.h> #include <linux/async.h> #include <linux/pm_domain.h> #include <linux/pm_runtime.h> #include <linux/pinctrl/devinfo.h> #include <linux/slab.h> #include "base.h" #include "power/power.h" /* * Deferred Probe infrastructure. * * Sometimes driver probe order matters, but the kernel doesn't always have * dependency information which means some drivers will get probed before a * resource it depends on is available. For example, an SDHCI driver may * first need a GPIO line from an i2c GPIO controller before it can be * initialized. If a required resource is not available yet, a driver can * request probing to be deferred by returning -EPROBE_DEFER from its probe hook * * Deferred probe maintains two lists of devices, a pending list and an active * list. A driver returning -EPROBE_DEFER causes the device to be added to the * pending list. A successful driver probe will trigger moving all devices * from the pending to the active list so that the workqueue will eventually * retry them. * * The deferred_probe_mutex must be held any time the deferred_probe_*_list * of the (struct device*)->p->deferred_probe pointers are manipulated */ static DEFINE_MUTEX(deferred_probe_mutex); static LIST_HEAD(deferred_probe_pending_list); static LIST_HEAD(deferred_probe_active_list); static atomic_t deferred_trigger_count = ATOMIC_INIT(0); static bool initcalls_done; /* Save the async probe drivers' name from kernel cmdline */ #define ASYNC_DRV_NAMES_MAX_LEN 256 static char async_probe_drv_names[ASYNC_DRV_NAMES_MAX_LEN]; static bool async_probe_default; /* * In some cases, like suspend to RAM or hibernation, It might be reasonable * to prohibit probing of devices as it could be unsafe. * Once defer_all_probes is true all drivers probes will be forcibly deferred. */ static bool defer_all_probes; static void __device_set_deferred_probe_reason(const struct device *dev, char *reason) { kfree(dev->p->deferred_probe_reason); dev->p->deferred_probe_reason = reason; } /* * deferred_probe_work_func() - Retry probing devices in the active list. */ static void deferred_probe_work_func(struct work_struct *work) { struct device *dev; struct device_private *private; /* * This block processes every device in the deferred 'active' list. * Each device is removed from the active list and passed to * bus_probe_device() to re-attempt the probe. The loop continues * until every device in the active list is removed and retried. * * Note: Once the device is removed from the list and the mutex is * released, it is possible for the device get freed by another thread * and cause a illegal pointer dereference. This code uses * get/put_device() to ensure the device structure cannot disappear * from under our feet. */ mutex_lock(&deferred_probe_mutex); while (!list_empty(&deferred_probe_active_list)) { private = list_first_entry(&deferred_probe_active_list, typeof(*dev->p), deferred_probe); dev = private->device; list_del_init(&private->deferred_probe); get_device(dev); __device_set_deferred_probe_reason(dev, NULL); /* * Drop the mutex while probing each device; the probe path may * manipulate the deferred list */ mutex_unlock(&deferred_probe_mutex); /* * Force the device to the end of the dpm_list since * the PM code assumes that the order we add things to * the list is a good order for suspend but deferred * probe makes that very unsafe. */ device_pm_move_to_tail(dev); dev_dbg(dev, "Retrying from deferred list\n"); bus_probe_device(dev); mutex_lock(&deferred_probe_mutex); put_device(dev); } mutex_unlock(&deferred_probe_mutex); } static DECLARE_WORK(deferred_probe_work, deferred_probe_work_func); void driver_deferred_probe_add(struct device *dev) { if (!dev->can_match) return; mutex_lock(&deferred_probe_mutex); if (list_empty(&dev->p->deferred_probe)) { dev_dbg(dev, "Added to deferred list\n"); list_add_tail(&dev->p->deferred_probe, &deferred_probe_pending_list); } mutex_unlock(&deferred_probe_mutex); } void driver_deferred_probe_del(struct device *dev) { mutex_lock(&deferred_probe_mutex); if (!list_empty(&dev->p->deferred_probe)) { dev_dbg(dev, "Removed from deferred list\n"); list_del_init(&dev->p->deferred_probe); __device_set_deferred_probe_reason(dev, NULL); } mutex_unlock(&deferred_probe_mutex); } static bool driver_deferred_probe_enable; /** * driver_deferred_probe_trigger() - Kick off re-probing deferred devices * * This functions moves all devices from the pending list to the active * list and schedules the deferred probe workqueue to process them. It * should be called anytime a driver is successfully bound to a device. * * Note, there is a race condition in multi-threaded probe. In the case where * more than one device is probing at the same time, it is possible for one * probe to complete successfully while another is about to defer. If the second * depends on the first, then it will get put on the pending list after the * trigger event has already occurred and will be stuck there. * * The atomic 'deferred_trigger_count' is used to determine if a successful * trigger has occurred in the midst of probing a driver. If the trigger count * changes in the midst of a probe, then deferred processing should be triggered * again. */ void driver_deferred_probe_trigger(void) { if (!driver_deferred_probe_enable) return; /* * A successful probe means that all the devices in the pending list * should be triggered to be reprobed. Move all the deferred devices * into the active list so they can be retried by the workqueue */ mutex_lock(&deferred_probe_mutex); atomic_inc(&deferred_trigger_count); list_splice_tail_init(&deferred_probe_pending_list, &deferred_probe_active_list); mutex_unlock(&deferred_probe_mutex); /* * Kick the re-probe thread. It may already be scheduled, but it is * safe to kick it again. */ queue_work(system_dfl_wq, &deferred_probe_work); } /** * device_block_probing() - Block/defer device's probes * * It will disable probing of devices and defer their probes instead. */ void device_block_probing(void) { defer_all_probes = true; /* sync with probes to avoid races. */ wait_for_device_probe(); } /** * device_unblock_probing() - Unblock/enable device's probes * * It will restore normal behavior and trigger re-probing of deferred * devices. */ void device_unblock_probing(void) { defer_all_probes = false; driver_deferred_probe_trigger(); } /** * device_set_deferred_probe_reason() - Set defer probe reason message for device * @dev: the pointer to the struct device * @vaf: the pointer to va_format structure with message */ void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf) { const char *drv = dev_driver_string(dev); char *reason; mutex_lock(&deferred_probe_mutex); reason = kasprintf(GFP_KERNEL, "%s: %pV", drv, vaf); __device_set_deferred_probe_reason(dev, reason); mutex_unlock(&deferred_probe_mutex); } /* * deferred_devs_show() - Show the devices in the deferred probe pending list. */ static int deferred_devs_show(struct seq_file *s, void *data) { struct device_private *curr; mutex_lock(&deferred_probe_mutex); list_for_each_entry(curr, &deferred_probe_pending_list, deferred_probe) seq_printf(s, "%s\t%s", dev_name(curr->device), curr->deferred_probe_reason ?: "\n"); mutex_unlock(&deferred_probe_mutex); return 0; } DEFINE_SHOW_ATTRIBUTE(deferred_devs); #ifdef CONFIG_MODULES static int driver_deferred_probe_timeout = 10; #else static int driver_deferred_probe_timeout; #endif static int __init deferred_probe_timeout_setup(char *str) { int timeout; if (!kstrtoint(str, 10, &timeout)) driver_deferred_probe_timeout = timeout; return 1; } __setup("deferred_probe_timeout=", deferred_probe_timeout_setup); /** * driver_deferred_probe_check_state() - Check deferred probe state * @dev: device to check * * Return: * * -ENODEV if initcalls have completed and modules are disabled. * * -ETIMEDOUT if the deferred probe timeout was set and has expired * and modules are enabled. * * -EPROBE_DEFER in other cases. * * Drivers or subsystems can opt-in to calling this function instead of directly * returning -EPROBE_DEFER. */ int driver_deferred_probe_check_state(struct device *dev) { if (!IS_ENABLED(CONFIG_MODULES) && initcalls_done) { dev_warn(dev, "ignoring dependency for device, assuming no driver\n"); return -ENODEV; } if (!driver_deferred_probe_timeout && initcalls_done) { dev_warn(dev, "deferred probe timeout, ignoring dependency\n"); return -ETIMEDOUT; } return -EPROBE_DEFER; } EXPORT_SYMBOL_GPL(driver_deferred_probe_check_state); static void deferred_probe_timeout_work_func(struct work_struct *work) { struct device_private *p; fw_devlink_drivers_done(); driver_deferred_probe_timeout = 0; driver_deferred_probe_trigger(); flush_work(&deferred_probe_work); mutex_lock(&deferred_probe_mutex); list_for_each_entry(p, &deferred_probe_pending_list, deferred_probe) dev_warn(p->device, "deferred probe pending: %s", p->deferred_probe_reason ?: "(reason unknown)\n"); mutex_unlock(&deferred_probe_mutex); fw_devlink_probing_done(); } static DECLARE_DELAYED_WORK(deferred_probe_timeout_work, deferred_probe_timeout_work_func); void deferred_probe_extend_timeout(void) { /* * If the work hasn't been queued yet or if the work expired, don't * start a new one. */ if (cancel_delayed_work(&deferred_probe_timeout_work)) { schedule_delayed_work(&deferred_probe_timeout_work, driver_deferred_probe_timeout * HZ); pr_debug("Extended deferred probe timeout by %d secs\n", driver_deferred_probe_timeout); } } /** * deferred_probe_initcall() - Enable probing of deferred devices * * We don't want to get in the way when the bulk of drivers are getting probed. * Instead, this initcall makes sure that deferred probing is delayed until * late_initcall time. */ static int deferred_probe_initcall(void) { debugfs_create_file("devices_deferred", 0444, NULL, NULL, &deferred_devs_fops); driver_deferred_probe_enable = true; driver_deferred_probe_trigger(); /* Sort as many dependencies as possible before exiting initcalls */ flush_work(&deferred_probe_work); initcalls_done = true; if (!IS_ENABLED(CONFIG_MODULES)) fw_devlink_drivers_done(); /* * Trigger deferred probe again, this time we won't defer anything * that is optional */ driver_deferred_probe_trigger(); flush_work(&deferred_probe_work); if (driver_deferred_probe_timeout > 0) { schedule_delayed_work(&deferred_probe_timeout_work, driver_deferred_probe_timeout * HZ); } if (!IS_ENABLED(CONFIG_MODULES)) fw_devlink_probing_done(); return 0; } late_initcall(deferred_probe_initcall); static void __exit deferred_probe_exit(void) { debugfs_lookup_and_remove("devices_deferred", NULL); } __exitcall(deferred_probe_exit); /** * device_is_bound() - Check if device is bound to a driver * @dev: device to check * * Returns true if passed device has already finished probing successfully * against a driver. * * This function must be called with the device lock held. */ bool device_is_bound(struct device *dev) { return dev->p && klist_node_attached(&dev->p->knode_driver); } EXPORT_SYMBOL_GPL(device_is_bound); static void driver_bound(struct device *dev) { if (device_is_bound(dev)) { dev_warn(dev, "%s: device already bound\n", __func__); return; } dev_dbg(dev, "driver: '%s': %s: bound to device\n", dev->driver->name, __func__); klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices); device_links_driver_bound(dev); device_pm_check_callbacks(dev); /* * Make sure the device is no longer in one of the deferred lists and * kick off retrying all pending devices */ driver_deferred_probe_del(dev); driver_deferred_probe_trigger(); bus_notify(dev, BUS_NOTIFY_BOUND_DRIVER); kobject_uevent(&dev->kobj, KOBJ_BIND); } static ssize_t coredump_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { device_lock(dev); dev->driver->coredump(dev); device_unlock(dev); return count; } static DEVICE_ATTR_WO(coredump); static int driver_sysfs_add(struct device *dev) { int ret; bus_notify(dev, BUS_NOTIFY_BIND_DRIVER); ret = sysfs_create_link(&dev->driver->p->kobj, &dev->kobj, kobject_name(&dev->kobj)); if (ret) goto fail; ret = sysfs_create_link(&dev->kobj, &dev->driver->p->kobj, "driver"); if (ret) goto rm_dev; if (!IS_ENABLED(CONFIG_DEV_COREDUMP) || !dev->driver->coredump) return 0; ret = device_create_file(dev, &dev_attr_coredump); if (!ret) return 0; sysfs_remove_link(&dev->kobj, "driver"); rm_dev: sysfs_remove_link(&dev->driver->p->kobj, kobject_name(&dev->kobj)); fail: return ret; } static void driver_sysfs_remove(struct device *dev) { struct device_driver *drv = dev->driver; if (drv) { if (drv->coredump) device_remove_file(dev, &dev_attr_coredump); sysfs_remove_link(&drv->p->kobj, kobject_name(&dev->kobj)); sysfs_remove_link(&dev->kobj, "driver"); } } /** * device_bind_driver - bind a driver to one device. * @dev: device. * * Allow manual attachment of a driver to a device. * Caller must have already set @dev->driver. * * Note that this does not modify the bus reference count. * Please verify that is accounted for before calling this. * (It is ok to call with no other effort from a driver's probe() method.) * * This function must be called with the device lock held. * * Callers should prefer to use device_driver_attach() instead. */ int device_bind_driver(struct device *dev) { int ret; ret = driver_sysfs_add(dev); if (!ret) { device_links_force_bind(dev); driver_bound(dev); } else bus_notify(dev, BUS_NOTIFY_DRIVER_NOT_BOUND); return ret; } EXPORT_SYMBOL_GPL(device_bind_driver); static atomic_t probe_count = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(probe_waitqueue); static ssize_t state_synced_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ret = 0; if (strcmp("1", buf)) return -EINVAL; device_lock(dev); if (!dev->state_synced) { dev->state_synced = true; dev_sync_state(dev); } else { ret = -EINVAL; } device_unlock(dev); return ret ? ret : count; } static ssize_t state_synced_show(struct device *dev, struct device_attribute *attr, char *buf) { bool val; device_lock(dev); val = dev->state_synced; device_unlock(dev); return sysfs_emit(buf, "%u\n", val); } static DEVICE_ATTR_RW(state_synced); static void device_unbind_cleanup(struct device *dev) { devres_release_all(dev); arch_teardown_dma_ops(dev); kfree(dev->dma_range_map); dev->dma_range_map = NULL; device_set_driver(dev, NULL); dev_set_drvdata(dev, NULL); dev_pm_domain_detach(dev, dev->power.detach_power_off); if (dev->pm_domain && dev->pm_domain->dismiss) dev->pm_domain->dismiss(dev); pm_runtime_reinit(dev); dev_pm_set_driver_flags(dev, 0); } static void device_remove(struct device *dev) { device_remove_file(dev, &dev_attr_state_synced); device_remove_groups(dev, dev->driver->dev_groups); if (dev->bus && dev->bus->remove) dev->bus->remove(dev); else if (dev->driver->remove) dev->driver->remove(dev); } static int call_driver_probe(struct device *dev, const struct device_driver *drv) { int ret = 0; if (dev->bus->probe) ret = dev->bus->probe(dev); else if (drv->probe) ret = drv->probe(dev); switch (ret) { case 0: break; case -EPROBE_DEFER: /* Driver requested deferred probing */ dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name); break; case -ENODEV: case -ENXIO: dev_dbg(dev, "probe with driver %s rejects match %d\n", drv->name, ret); break; default: /* driver matched but the probe failed */ dev_err(dev, "probe with driver %s failed with error %d\n", drv->name, ret); break; } return ret; } static int really_probe(struct device *dev, const struct device_driver *drv) { bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) && !drv->suppress_bind_attrs; int ret, link_ret; if (defer_all_probes) { /* * Value of defer_all_probes can be set only by * device_block_probing() which, in turn, will call * wait_for_device_probe() right after that to avoid any races. */ dev_dbg(dev, "Driver %s force probe deferral\n", drv->name); return -EPROBE_DEFER; } link_ret = device_links_check_suppliers(dev); if (link_ret == -EPROBE_DEFER) return link_ret; dev_dbg(dev, "bus: '%s': %s: probing driver %s with device\n", drv->bus->name, __func__, drv->name); if (!list_empty(&dev->devres_head)) { dev_crit(dev, "Resources present before probing\n"); ret = -EBUSY; goto done; } re_probe: device_set_driver(dev, drv); /* If using pinctrl, bind pins now before probing */ ret = pinctrl_bind_pins(dev); if (ret) goto pinctrl_bind_failed; if (dev->bus->dma_configure) { ret = dev->bus->dma_configure(dev); if (ret) goto pinctrl_bind_failed; } ret = driver_sysfs_add(dev); if (ret) { dev_err(dev, "%s: driver_sysfs_add failed\n", __func__); goto sysfs_failed; } if (dev->pm_domain && dev->pm_domain->activate) { ret = dev->pm_domain->activate(dev); if (ret) goto probe_failed; } ret = call_driver_probe(dev, drv); if (ret) { /* * If fw_devlink_best_effort is active (denoted by -EAGAIN), the * device might actually probe properly once some of its missing * suppliers have probed. So, treat this as if the driver * returned -EPROBE_DEFER. */ if (link_ret == -EAGAIN) ret = -EPROBE_DEFER; /* * Return probe errors as positive values so that the callers * can distinguish them from other errors. */ ret = -ret; goto probe_failed; } ret = device_add_groups(dev, drv->dev_groups); if (ret) { dev_err(dev, "device_add_groups() failed\n"); goto dev_groups_failed; } if (dev_has_sync_state(dev)) { ret = device_create_file(dev, &dev_attr_state_synced); if (ret) { dev_err(dev, "state_synced sysfs add failed\n"); goto dev_sysfs_state_synced_failed; } } if (test_remove) { test_remove = false; device_remove(dev); driver_sysfs_remove(dev); if (dev->bus && dev->bus->dma_cleanup) dev->bus->dma_cleanup(dev); device_unbind_cleanup(dev); goto re_probe; } pinctrl_init_done(dev); if (dev->pm_domain && dev->pm_domain->sync) dev->pm_domain->sync(dev); driver_bound(dev); dev_dbg(dev, "bus: '%s': %s: bound device to driver %s\n", drv->bus->name, __func__, drv->name); goto done; dev_sysfs_state_synced_failed: dev_groups_failed: device_remove(dev); probe_failed: driver_sysfs_remove(dev); sysfs_failed: bus_notify(dev, BUS_NOTIFY_DRIVER_NOT_BOUND); if (dev->bus && dev->bus->dma_cleanup) dev->bus->dma_cleanup(dev); pinctrl_bind_failed: device_links_no_driver(dev); device_unbind_cleanup(dev); done: return ret; } /* * For initcall_debug, show the driver probe time. */ static int really_probe_debug(struct device *dev, const struct device_driver *drv) { ktime_t calltime, rettime; int ret; calltime = ktime_get(); ret = really_probe(dev, drv); rettime = ktime_get(); /* * Don't change this to pr_debug() because that requires * CONFIG_DYNAMIC_DEBUG and we want a simple 'initcall_debug' on the * kernel commandline to print this all the time at the debug level. */ printk(KERN_DEBUG "probe of %s returned %d after %lld usecs\n", dev_name(dev), ret, ktime_us_delta(rettime, calltime)); return ret; } /** * driver_probe_done * Determine if the probe sequence is finished or not. * * Should somehow figure out how to use a semaphore, not an atomic variable... */ bool __init driver_probe_done(void) { int local_probe_count = atomic_read(&probe_count); pr_debug("%s: probe_count = %d\n", __func__, local_probe_count); return !local_probe_count; } /** * wait_for_device_probe * Wait for device probing to be completed. */ void wait_for_device_probe(void) { /* wait for the deferred probe workqueue to finish */ flush_work(&deferred_probe_work); /* wait for the known devices to complete their probing */ wait_event(probe_waitqueue, atomic_read(&probe_count) == 0); async_synchronize_full(); } EXPORT_SYMBOL_GPL(wait_for_device_probe); static int __driver_probe_device(const struct device_driver *drv, struct device *dev) { int ret = 0; if (dev->p->dead || !device_is_registered(dev)) return -ENODEV; if (dev->driver) return -EBUSY; dev->can_match = true; dev_dbg(dev, "bus: '%s': %s: matched device with driver %s\n", drv->bus->name, __func__, drv->name); pm_runtime_get_suppliers(dev); if (dev->parent) pm_runtime_get_sync(dev->parent); pm_runtime_barrier(dev); if (initcall_debug) ret = really_probe_debug(dev, drv); else ret = really_probe(dev, drv); pm_request_idle(dev); if (dev->parent) pm_runtime_put(dev->parent); pm_runtime_put_suppliers(dev); return ret; } /** * driver_probe_device - attempt to bind device & driver together * @drv: driver to bind a device to * @dev: device to try to bind to the driver * * This function returns -ENODEV if the device is not registered, -EBUSY if it * already has a driver, 0 if the device is bound successfully and a positive * (inverted) error code for failures from the ->probe method. * * This function must be called with @dev lock held. When called for a * USB interface, @dev->parent lock must be held as well. * * If the device has a parent, runtime-resume the parent before driver probing. */ static int driver_probe_device(const struct device_driver *drv, struct device *dev) { int trigger_count = atomic_read(&deferred_trigger_count); int ret; atomic_inc(&probe_count); ret = __driver_probe_device(drv, dev); if (ret == -EPROBE_DEFER || ret == EPROBE_DEFER) { driver_deferred_probe_add(dev); /* * Did a trigger occur while probing? Need to re-trigger if yes */ if (trigger_count != atomic_read(&deferred_trigger_count) && !defer_all_probes) driver_deferred_probe_trigger(); } atomic_dec(&probe_count); wake_up_all(&probe_waitqueue); return ret; } static inline bool cmdline_requested_async_probing(const char *drv_name) { bool async_drv; async_drv = parse_option_str(async_probe_drv_names, drv_name); return (async_probe_default != async_drv); } /* The option format is "driver_async_probe=drv_name1,drv_name2,..." */ static int __init save_async_options(char *buf) { if (strlen(buf) >= ASYNC_DRV_NAMES_MAX_LEN) pr_warn("Too long list of driver names for 'driver_async_probe'!\n"); strscpy(async_probe_drv_names, buf, ASYNC_DRV_NAMES_MAX_LEN); async_probe_default = parse_option_str(async_probe_drv_names, "*"); return 1; } __setup("driver_async_probe=", save_async_options); static bool driver_allows_async_probing(const struct device_driver *drv) { switch (drv->probe_type) { case PROBE_PREFER_ASYNCHRONOUS: return true; case PROBE_FORCE_SYNCHRONOUS: return false; default: if (cmdline_requested_async_probing(drv->name)) return true; if (module_requested_async_probing(drv->owner)) return true; return false; } } struct device_attach_data { struct device *dev; /* * Indicates whether we are considering asynchronous probing or * not. Only initial binding after device or driver registration * (including deferral processing) may be done asynchronously, the * rest is always synchronous, as we expect it is being done by * request from userspace. */ bool check_async; /* * Indicates if we are binding synchronous or asynchronous drivers. * When asynchronous probing is enabled we'll execute 2 passes * over drivers: first pass doing synchronous probing and second * doing asynchronous probing (if synchronous did not succeed - * most likely because there was no driver requiring synchronous * probing - and we found asynchronous driver during first pass). * The 2 passes are done because we can't shoot asynchronous * probe for given device and driver from bus_for_each_drv() since * driver pointer is not guaranteed to stay valid once * bus_for_each_drv() iterates to the next driver on the bus. */ bool want_async; /* * We'll set have_async to 'true' if, while scanning for matching * driver, we'll encounter one that requests asynchronous probing. */ bool have_async; }; static int __device_attach_driver(struct device_driver *drv, void *_data) { struct device_attach_data *data = _data; struct device *dev = data->dev; bool async_allowed; int ret; ret = driver_match_device(drv, dev); if (ret == 0) { /* no match */ return 0; } else if (ret == -EPROBE_DEFER) { dev_dbg(dev, "Device match requests probe deferral\n"); dev->can_match = true; driver_deferred_probe_add(dev); /* * Device can't match with a driver right now, so don't attempt * to match or bind with other drivers on the bus. */ return ret; } else if (ret < 0) { dev_dbg(dev, "Bus failed to match device: %d\n", ret); return ret; } /* ret > 0 means positive match */ async_allowed = driver_allows_async_probing(drv); if (async_allowed) data->have_async = true; if (data->check_async && async_allowed != data->want_async) return 0; /* * Ignore errors returned by ->probe so that the next driver can try * its luck. */ ret = driver_probe_device(drv, dev); if (ret < 0) return ret; return ret == 0; } static void __device_attach_async_helper(void *_dev, async_cookie_t cookie) { struct device *dev = _dev; struct device_attach_data data = { .dev = dev, .check_async = true, .want_async = true, }; device_lock(dev); /* * Check if device has already been removed or claimed. This may * happen with driver loading, device discovery/registration, * and deferred probe processing happens all at once with * multiple threads. */ if (dev->p->dead || dev->driver) goto out_unlock; if (dev->parent) pm_runtime_get_sync(dev->parent); bus_for_each_drv(dev->bus, NULL, &data, __device_attach_driver); dev_dbg(dev, "async probe completed\n"); pm_request_idle(dev); if (dev->parent) pm_runtime_put(dev->parent); out_unlock: device_unlock(dev); put_device(dev); } static int __device_attach(struct device *dev, bool allow_async) { int ret = 0; bool async = false; device_lock(dev); if (dev->p->dead) { goto out_unlock; } else if (dev->driver) { if (device_is_bound(dev)) { ret = 1; goto out_unlock; } ret = device_bind_driver(dev); if (ret == 0) ret = 1; else { device_set_driver(dev, NULL); ret = 0; } } else { struct device_attach_data data = { .dev = dev, .check_async = allow_async, .want_async = false, }; if (dev->parent) pm_runtime_get_sync(dev->parent); ret = bus_for_each_drv(dev->bus, NULL, &data, __device_attach_driver); if (!ret && allow_async && data.have_async) { /* * If we could not find appropriate driver * synchronously and we are allowed to do * async probes and there are drivers that * want to probe asynchronously, we'll * try them. */ dev_dbg(dev, "scheduling asynchronous probe\n"); get_device(dev); async = true; } else { pm_request_idle(dev); } if (dev->parent) pm_runtime_put(dev->parent); } out_unlock: device_unlock(dev); if (async) async_schedule_dev(__device_attach_async_helper, dev); return ret; } /** * device_attach - try to attach device to a driver. * @dev: device. * * Walk the list of drivers that the bus has and call * driver_probe_device() for each pair. If a compatible * pair is found, break out and return. * * Returns 1 if the device was bound to a driver; * 0 if no matching driver was found; * -ENODEV if the device is not registered. * * When called for a USB interface, @dev->parent lock must be held. */ int device_attach(struct device *dev) { return __device_attach(dev, false); } EXPORT_SYMBOL_GPL(device_attach); void device_initial_probe(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); if (!sp) return; if (sp->drivers_autoprobe) __device_attach(dev, true); subsys_put(sp); } /* * __device_driver_lock - acquire locks needed to manipulate dev->drv * @dev: Device we will update driver info for * @parent: Parent device. Needed if the bus requires parent lock * * This function will take the required locks for manipulating dev->drv. * Normally this will just be the @dev lock, but when called for a USB * interface, @parent lock will be held as well. */ static void __device_driver_lock(struct device *dev, struct device *parent) { if (parent && dev->bus->need_parent_lock) device_lock(parent); device_lock(dev); } /* * __device_driver_unlock - release locks needed to manipulate dev->drv * @dev: Device we will update driver info for * @parent: Parent device. Needed if the bus requires parent lock * * This function will release the required locks for manipulating dev->drv. * Normally this will just be the @dev lock, but when called for a * USB interface, @parent lock will be released as well. */ static void __device_driver_unlock(struct device *dev, struct device *parent) { device_unlock(dev); if (parent && dev->bus->need_parent_lock) device_unlock(parent); } /** * device_driver_attach - attach a specific driver to a specific device * @drv: Driver to attach * @dev: Device to attach it to * * Manually attach driver to a device. Will acquire both @dev lock and * @dev->parent lock if needed. Returns 0 on success, -ERR on failure. */ int device_driver_attach(const struct device_driver *drv, struct device *dev) { int ret; __device_driver_lock(dev, dev->parent); ret = __driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); /* also return probe errors as normal negative errnos */ if (ret > 0) ret = -ret; if (ret == -EPROBE_DEFER) return -EAGAIN; return ret; } EXPORT_SYMBOL_GPL(device_driver_attach); static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie) { struct device *dev = _dev; const struct device_driver *drv; int ret; __device_driver_lock(dev, dev->parent); drv = dev->p->async_driver; dev->p->async_driver = NULL; ret = driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); dev_dbg(dev, "driver %s async attach completed: %d\n", drv->name, ret); put_device(dev); } static int __driver_attach(struct device *dev, void *data) { const struct device_driver *drv = data; bool async = false; int ret; /* * Lock device and try to bind to it. We drop the error * here and always return 0, because we need to keep trying * to bind to devices and some drivers will return an error * simply if it didn't support the device. * * driver_probe_device() will spit a warning if there * is an error. */ ret = driver_match_device(drv, dev); if (ret == 0) { /* no match */ return 0; } else if (ret == -EPROBE_DEFER) { dev_dbg(dev, "Device match requests probe deferral\n"); dev->can_match = true; driver_deferred_probe_add(dev); /* * Driver could not match with device, but may match with * another device on the bus. */ return 0; } else if (ret < 0) { dev_dbg(dev, "Bus failed to match device: %d\n", ret); /* * Driver could not match with device, but may match with * another device on the bus. */ return 0; } /* ret > 0 means positive match */ if (driver_allows_async_probing(drv)) { /* * Instead of probing the device synchronously we will * probe it asynchronously to allow for more parallelism. * * We only take the device lock here in order to guarantee * that the dev->driver and async_driver fields are protected */ dev_dbg(dev, "probing driver %s asynchronously\n", drv->name); device_lock(dev); if (!dev->driver && !dev->p->async_driver) { get_device(dev); dev->p->async_driver = drv; async = true; } device_unlock(dev); if (async) async_schedule_dev(__driver_attach_async_helper, dev); return 0; } __device_driver_lock(dev, dev->parent); driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); return 0; } /** * driver_attach - try to bind driver to devices. * @drv: driver. * * Walk the list of devices that the bus has on it and try to * match the driver with each one. If driver_probe_device() * returns 0 and the @dev->driver is set, we've found a * compatible pair. */ int driver_attach(const struct device_driver *drv) { /* The (void *) will be put back to const * in __driver_attach() */ return bus_for_each_dev(drv->bus, NULL, (void *)drv, __driver_attach); } EXPORT_SYMBOL_GPL(driver_attach); /* * __device_release_driver() must be called with @dev lock held. * When called for a USB interface, @dev->parent lock must be held as well. */ static void __device_release_driver(struct device *dev, struct device *parent) { struct device_driver *drv; drv = dev->driver; if (drv) { pm_runtime_get_sync(dev); while (device_links_busy(dev)) { __device_driver_unlock(dev, parent); device_links_unbind_consumers(dev); __device_driver_lock(dev, parent); /* * A concurrent invocation of the same function might * have released the driver successfully while this one * was waiting, so check for that. */ if (dev->driver != drv) { pm_runtime_put(dev); return; } } driver_sysfs_remove(dev); bus_notify(dev, BUS_NOTIFY_UNBIND_DRIVER); pm_runtime_put_sync(dev); device_remove(dev); if (dev->bus && dev->bus->dma_cleanup) dev->bus->dma_cleanup(dev); device_unbind_cleanup(dev); device_links_driver_cleanup(dev); klist_remove(&dev->p->knode_driver); device_pm_check_callbacks(dev); bus_notify(dev, BUS_NOTIFY_UNBOUND_DRIVER); kobject_uevent(&dev->kobj, KOBJ_UNBIND); } } void device_release_driver_internal(struct device *dev, const struct device_driver *drv, struct device *parent) { __device_driver_lock(dev, parent); if (!drv || drv == dev->driver) __device_release_driver(dev, parent); __device_driver_unlock(dev, parent); } /** * device_release_driver - manually detach device from driver. * @dev: device. * * Manually detach device from driver. * When called for a USB interface, @dev->parent lock must be held. * * If this function is to be called with @dev->parent lock held, ensure that * the device's consumers are unbound in advance or that their locks can be * acquired under the @dev->parent lock. */ void device_release_driver(struct device *dev) { /* * If anyone calls device_release_driver() recursively from * within their ->remove callback for the same device, they * will deadlock right here. */ device_release_driver_internal(dev, NULL, NULL); } EXPORT_SYMBOL_GPL(device_release_driver); /** * device_driver_detach - detach driver from a specific device * @dev: device to detach driver from * * Detach driver from device. Will acquire both @dev lock and @dev->parent * lock if needed. */ void device_driver_detach(struct device *dev) { device_release_driver_internal(dev, NULL, dev->parent); } /** * driver_detach - detach driver from all devices it controls. * @drv: driver. */ void driver_detach(const struct device_driver *drv) { struct device_private *dev_prv; struct device *dev; if (driver_allows_async_probing(drv)) async_synchronize_full(); for (;;) { spin_lock(&drv->p->klist_devices.k_lock); if (list_empty(&drv->p->klist_devices.k_list)) { spin_unlock(&drv->p->klist_devices.k_lock); break; } dev_prv = list_last_entry(&drv->p->klist_devices.k_list, struct device_private, knode_driver.n_node); dev = dev_prv->device; get_device(dev); spin_unlock(&drv->p->klist_devices.k_lock); device_release_driver_internal(dev, drv, dev->parent); put_device(dev); } }
2127 711 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM mmap #if !defined(_TRACE_MMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MMAP_H #include <linux/tracepoint.h> TRACE_EVENT(vm_unmapped_area, TP_PROTO(unsigned long addr, struct vm_unmapped_area_info *info), TP_ARGS(addr, info), TP_STRUCT__entry( __field(unsigned long, addr) __field(unsigned long, total_vm) __field(unsigned long, flags) __field(unsigned long, length) __field(unsigned long, low_limit) __field(unsigned long, high_limit) __field(unsigned long, align_mask) __field(unsigned long, align_offset) ), TP_fast_assign( __entry->addr = addr; __entry->total_vm = current->mm->total_vm; __entry->flags = info->flags; __entry->length = info->length; __entry->low_limit = info->low_limit; __entry->high_limit = info->high_limit; __entry->align_mask = info->align_mask; __entry->align_offset = info->align_offset; ), TP_printk("addr=0x%lx err=%ld total_vm=0x%lx flags=0x%lx len=0x%lx lo=0x%lx hi=0x%lx mask=0x%lx ofs=0x%lx", IS_ERR_VALUE(__entry->addr) ? 0 : __entry->addr, IS_ERR_VALUE(__entry->addr) ? __entry->addr : 0, __entry->total_vm, __entry->flags, __entry->length, __entry->low_limit, __entry->high_limit, __entry->align_mask, __entry->align_offset) ); TRACE_EVENT(exit_mmap, TP_PROTO(struct mm_struct *mm), TP_ARGS(mm), TP_STRUCT__entry( __field(struct mm_struct *, mm) __field(struct maple_tree *, mt) ), TP_fast_assign( __entry->mm = mm; __entry->mt = &mm->mm_mt; ), TP_printk("mt_mod %p, DESTROY", __entry->mt ) ); #endif /* This part must be outside protection */ #include <trace/define_trace.h>
3956 3958 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 // SPDX-License-Identifier: GPL-2.0 /* * Device physical location support * * Author: Won Chung <wonchung@google.com> */ #include <linux/acpi.h> #include <linux/sysfs.h> #include <linux/string_choices.h> #include "physical_location.h" bool dev_add_physical_location(struct device *dev) { struct acpi_pld_info *pld; if (!has_acpi_companion(dev)) return false; if (!acpi_get_physical_device_location(ACPI_HANDLE(dev), &pld)) return false; dev->physical_location = kzalloc(sizeof(*dev->physical_location), GFP_KERNEL); if (!dev->physical_location) { ACPI_FREE(pld); return false; } dev->physical_location->panel = pld->panel; dev->physical_location->vertical_position = pld->vertical_position; dev->physical_location->horizontal_position = pld->horizontal_position; dev->physical_location->dock = pld->dock; dev->physical_location->lid = pld->lid; ACPI_FREE(pld); return true; } static ssize_t panel_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *panel; switch (dev->physical_location->panel) { case DEVICE_PANEL_TOP: panel = "top"; break; case DEVICE_PANEL_BOTTOM: panel = "bottom"; break; case DEVICE_PANEL_LEFT: panel = "left"; break; case DEVICE_PANEL_RIGHT: panel = "right"; break; case DEVICE_PANEL_FRONT: panel = "front"; break; case DEVICE_PANEL_BACK: panel = "back"; break; default: panel = "unknown"; } return sysfs_emit(buf, "%s\n", panel); } static DEVICE_ATTR_RO(panel); static ssize_t vertical_position_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *vertical_position; switch (dev->physical_location->vertical_position) { case DEVICE_VERT_POS_UPPER: vertical_position = "upper"; break; case DEVICE_VERT_POS_CENTER: vertical_position = "center"; break; case DEVICE_VERT_POS_LOWER: vertical_position = "lower"; break; default: vertical_position = "unknown"; } return sysfs_emit(buf, "%s\n", vertical_position); } static DEVICE_ATTR_RO(vertical_position); static ssize_t horizontal_position_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *horizontal_position; switch (dev->physical_location->horizontal_position) { case DEVICE_HORI_POS_LEFT: horizontal_position = "left"; break; case DEVICE_HORI_POS_CENTER: horizontal_position = "center"; break; case DEVICE_HORI_POS_RIGHT: horizontal_position = "right"; break; default: horizontal_position = "unknown"; } return sysfs_emit(buf, "%s\n", horizontal_position); } static DEVICE_ATTR_RO(horizontal_position); static ssize_t dock_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", str_yes_no(dev->physical_location->dock)); } static DEVICE_ATTR_RO(dock); static ssize_t lid_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", str_yes_no(dev->physical_location->lid)); } static DEVICE_ATTR_RO(lid); static struct attribute *dev_attr_physical_location[] = { &dev_attr_panel.attr, &dev_attr_vertical_position.attr, &dev_attr_horizontal_position.attr, &dev_attr_dock.attr, &dev_attr_lid.attr, NULL, }; const struct attribute_group dev_attr_physical_location_group = { .name = "physical_location", .attrs = dev_attr_physical_location, };
69 71 69 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 // SPDX-License-Identifier: GPL-2.0-only /* * llc_output.c - LLC minimal output path * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/llc.h> #include <net/llc_pdu.h> /** * llc_mac_hdr_init - fills MAC header fields * @skb: Address of the frame to initialize its MAC header * @sa: The MAC source address * @da: The MAC destination address * * Fills MAC header fields, depending on MAC type. Returns 0, If MAC type * is a valid type and initialization completes correctly 1, otherwise. */ int llc_mac_hdr_init(struct sk_buff *skb, const unsigned char *sa, const unsigned char *da) { int rc = -EINVAL; switch (skb->dev->type) { case ARPHRD_ETHER: case ARPHRD_LOOPBACK: rc = dev_hard_header(skb, skb->dev, ETH_P_802_2, da, sa, skb->len); if (rc > 0) rc = 0; break; default: break; } return rc; } /** * llc_build_and_send_ui_pkt - unitdata request interface for upper layers * @sap: sap to use * @skb: packet to send * @dmac: destination mac address * @dsap: destination sap * * Upper layers calls this function when upper layer wants to send data * using connection-less mode communication (UI pdu). * * Accept data frame from network layer to be sent using connection- * less mode communication; timeout/retries handled by network layer; * package primitive as an event and send to SAP event handler */ int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, const unsigned char *dmac, unsigned char dsap) { int rc; llc_pdu_header_init(skb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap, LLC_PDU_CMD); llc_pdu_init_as_ui_cmd(skb); rc = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac); if (likely(!rc)) rc = dev_queue_xmit(skb); else kfree_skb(skb); return rc; } EXPORT_SYMBOL(llc_mac_hdr_init); EXPORT_SYMBOL(llc_build_and_send_ui_pkt);
22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 /* inflate.c -- zlib decompression * Copyright (C) 1995-2005 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * * Based on zlib 1.2.3 but modified for the Linux Kernel by * Richard Purdie <richard@openedhand.com> * * Changes mainly for static instead of dynamic memory allocation * */ #include <linux/zutil.h> #include "inftrees.h" #include "inflate.h" #include "inffast.h" #include "infutil.h" /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC # include "../zlib_dfltcc/dfltcc_inflate.h" #else #define INFLATE_RESET_HOOK(strm) do {} while (0) #define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0) #define INFLATE_NEED_UPDATEWINDOW(strm) 1 #define INFLATE_NEED_CHECKSUM(strm) 1 #endif int zlib_inflate_workspacesize(void) { return sizeof(struct inflate_workspace); } int zlib_inflateReset(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; strm->total_in = strm->total_out = state->total = 0; strm->msg = NULL; strm->adler = 1; /* to support ill-conceived Java test suite */ state->mode = HEAD; state->last = 0; state->havedict = 0; state->dmax = 32768U; state->hold = 0; state->bits = 0; state->lencode = state->distcode = state->next = state->codes; /* Initialise Window */ state->wsize = 1U << state->wbits; state->write = 0; state->whave = 0; INFLATE_RESET_HOOK(strm); return Z_OK; } int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; /* in case we return an error */ state = &WS(strm)->inflate_state; strm->state = (struct internal_state *)state; if (windowBits < 0) { state->wrap = 0; windowBits = -windowBits; } else { state->wrap = (windowBits >> 4) + 1; } if (windowBits < 8 || windowBits > 15) { return Z_STREAM_ERROR; } state->wbits = (unsigned)windowBits; #ifdef CONFIG_ZLIB_DFLTCC /* * DFLTCC requires the window to be page aligned. * Thus, we overallocate and take the aligned portion of the buffer. */ state->window = PTR_ALIGN(&WS(strm)->working_window[0], PAGE_SIZE); #else state->window = &WS(strm)->working_window[0]; #endif return zlib_inflateReset(strm); } /* Return state with length and distance decoding tables and index sizes set to fixed code decoding. This returns fixed tables from inffixed.h. */ static void zlib_fixedtables(struct inflate_state *state) { # include "inffixed.h" state->lencode = lenfix; state->lenbits = 9; state->distcode = distfix; state->distbits = 5; } /* Update the window with the last wsize (normally 32K) bytes written before returning. This is only called when a window is already in use, or when output has been written during this inflate call, but the end of the deflate stream has not been reached yet. It is also called to window dictionary data when a dictionary is loaded. Providing output buffers larger than 32K to inflate() should provide a speed advantage, since only the last 32K of output is copied to the sliding window upon return from inflate(), and since all distances after the first 32K of output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. */ static void zlib_updatewindow(z_streamp strm, unsigned out) { struct inflate_state *state; unsigned copy, dist; state = (struct inflate_state *)strm->state; /* copy state->wsize or less output bytes into the circular window */ copy = out - strm->avail_out; if (copy >= state->wsize) { memcpy(state->window, strm->next_out - state->wsize, state->wsize); state->write = 0; state->whave = state->wsize; } else { dist = state->wsize - state->write; if (dist > copy) dist = copy; memcpy(state->window + state->write, strm->next_out - copy, dist); copy -= dist; if (copy) { memcpy(state->window, strm->next_out - copy, copy); state->write = copy; state->whave = state->wsize; } else { state->write += dist; if (state->write == state->wsize) state->write = 0; if (state->whave < state->wsize) state->whave += dist; } } } /* * At the end of a Deflate-compressed PPP packet, we expect to have seen * a `stored' block type value but not the (zero) length bytes. */ /* Returns true if inflate is currently at the end of a block generated by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored block. When decompressing, PPP checks that at the end of input packet, inflate is waiting for these length bytes. */ static int zlib_inflateSyncPacket(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == STORED && state->bits == 0) { state->mode = TYPE; return Z_OK; } return Z_DATA_ERROR; } /* Macros for inflate(): */ /* check function to use adler32() for zlib or crc32() for gzip */ #define UPDATE(check, buf, len) zlib_adler32(check, buf, len) /* Load registers with state in inflate() for speed */ #define LOAD() \ do { \ put = strm->next_out; \ left = strm->avail_out; \ next = strm->next_in; \ have = strm->avail_in; \ hold = state->hold; \ bits = state->bits; \ } while (0) /* Restore state from registers in inflate() */ #define RESTORE() \ do { \ strm->next_out = put; \ strm->avail_out = left; \ strm->next_in = next; \ strm->avail_in = have; \ state->hold = hold; \ state->bits = bits; \ } while (0) /* Clear the input bit accumulator */ #define INITBITS() \ do { \ hold = 0; \ bits = 0; \ } while (0) /* Get a byte of input into the bit accumulator, or return from inflate() if there is no input available. */ #define PULLBYTE() \ do { \ if (have == 0) goto inf_leave; \ have--; \ hold += (unsigned long)(*next++) << bits; \ bits += 8; \ } while (0) /* Assure that there are at least n bits in the bit accumulator. If there is not enough available input to do that, then return from inflate(). */ #define NEEDBITS(n) \ do { \ while (bits < (unsigned)(n)) \ PULLBYTE(); \ } while (0) /* Return the low n bits of the bit accumulator (n < 16) */ #define BITS(n) \ ((unsigned)hold & ((1U << (n)) - 1)) /* Remove n bits from the bit accumulator */ #define DROPBITS(n) \ do { \ hold >>= (n); \ bits -= (unsigned)(n); \ } while (0) /* Remove zero to seven bits as needed to go to a byte boundary */ #define BYTEBITS() \ do { \ hold >>= bits & 7; \ bits -= bits & 7; \ } while (0) /* inflate() uses a state machine to process as much input data and generate as much output data as possible before returning. The state machine is structured roughly as follows: for (;;) switch (state) { ... case STATEn: if (not enough input data or output space to make progress) return; ... make progress ... state = STATEm; break; ... } so when inflate() is called again, the same case is attempted again, and if the appropriate resources are provided, the machine proceeds to the next state. The NEEDBITS() macro is usually the way the state evaluates whether it can proceed or should return. NEEDBITS() does the return if the requested bits are not available. The typical use of the BITS macros is: NEEDBITS(n); ... do something with BITS(n) ... DROPBITS(n); where NEEDBITS(n) either returns from inflate() if there isn't enough input left to load n bits into the accumulator, or it continues. BITS(n) gives the low n bits in the accumulator. When done, DROPBITS(n) drops the low n bits off the accumulator. INITBITS() clears the accumulator and sets the number of available bits to zero. BYTEBITS() discards just enough bits to put the accumulator on a byte boundary. After BYTEBITS() and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return if there is no input available. The decoding of variable length codes uses PULLBYTE() directly in order to pull just enough bytes to decode the next code, and no more. Some states loop until they get enough input, making sure that enough state information is maintained to continue the loop where it left off if NEEDBITS() returns in the loop. For example, want, need, and keep would all have to actually be part of the saved state in case NEEDBITS() returns: case STATEw: while (want < need) { NEEDBITS(n); keep[want++] = BITS(n); DROPBITS(n); } state = STATEx; case STATEx: As shown above, if the next state is also the next case, then the break is omitted. A state may also return if there is not enough output space available to complete that state. Those states are copying stored data, writing a literal byte, and copying a matching string. When returning, a "goto inf_leave" is used to update the total counters, update the check value, and determine whether any progress has been made during that inflate() call in order to return the proper return code. Progress is defined as a change in either strm->avail_in or strm->avail_out. When there is a window, goto inf_leave will update the window with the last output written. If a goto inf_leave occurs in the middle of decompression and there is no window currently, goto inf_leave will create one and copy output to the window for the next call of inflate(). In this implementation, the flush parameter of inflate() only affects the return code (per zlib.h). inflate() always writes as much as possible to strm->next_out, given the space available and the provided input--the effect documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers the allocation of and copying into a sliding window until necessary, which provides the effect documented in zlib.h for Z_FINISH when the entire input stream available. So the only thing the flush parameter actually does is: when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it will return Z_BUF_ERROR if it has not reached the end of the stream. */ int zlib_inflate(z_streamp strm, int flush) { struct inflate_state *state; const unsigned char *next; /* next input */ unsigned char *put; /* next output */ unsigned have, left; /* available input and output */ unsigned long hold; /* bit buffer */ unsigned bits; /* bits in bit buffer */ unsigned in, out; /* save starting available input and output */ unsigned copy; /* number of stored or match bytes to copy */ unsigned char *from; /* where to copy match bytes from */ code this; /* current decoding table entry */ code last; /* parent table entry */ unsigned len; /* length to copy for repeats, bits to drop */ int ret; /* return code */ static const unsigned short order[19] = /* permutation of code lengths */ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; /* Do not check for strm->next_out == NULL here as ppc zImage inflates to strm->next_out = 0 */ if (strm == NULL || strm->state == NULL || (strm->next_in == NULL && strm->avail_in != 0)) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ LOAD(); in = have; out = left; ret = Z_OK; for (;;) switch (state->mode) { case HEAD: if (state->wrap == 0) { state->mode = TYPEDO; break; } NEEDBITS(16); if ( ((BITS(8) << 8) + (hold >> 8)) % 31) { strm->msg = (char *)"incorrect header check"; state->mode = BAD; break; } if (BITS(4) != Z_DEFLATED) { strm->msg = (char *)"unknown compression method"; state->mode = BAD; break; } DROPBITS(4); len = BITS(4) + 8; if (len > state->wbits) { strm->msg = (char *)"invalid window size"; state->mode = BAD; break; } state->dmax = 1U << len; strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = hold & 0x200 ? DICTID : TYPE; INITBITS(); break; case DICTID: NEEDBITS(32); strm->adler = state->check = REVERSE(hold); INITBITS(); state->mode = DICT; fallthrough; case DICT: if (state->havedict == 0) { RESTORE(); return Z_NEED_DICT; } strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = TYPE; fallthrough; case TYPE: if (flush == Z_BLOCK) goto inf_leave; fallthrough; case TYPEDO: INFLATE_TYPEDO_HOOK(strm, flush); if (state->last) { BYTEBITS(); state->mode = CHECK; break; } NEEDBITS(3); state->last = BITS(1); DROPBITS(1); switch (BITS(2)) { case 0: /* stored block */ state->mode = STORED; break; case 1: /* fixed block */ zlib_fixedtables(state); state->mode = LEN; /* decode codes */ break; case 2: /* dynamic block */ state->mode = TABLE; break; case 3: strm->msg = (char *)"invalid block type"; state->mode = BAD; } DROPBITS(2); break; case STORED: BYTEBITS(); /* go to byte boundary */ NEEDBITS(32); if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { strm->msg = (char *)"invalid stored block lengths"; state->mode = BAD; break; } state->length = (unsigned)hold & 0xffff; INITBITS(); state->mode = COPY; fallthrough; case COPY: copy = state->length; if (copy) { if (copy > have) copy = have; if (copy > left) copy = left; if (copy == 0) goto inf_leave; memcpy(put, next, copy); have -= copy; next += copy; left -= copy; put += copy; state->length -= copy; break; } state->mode = TYPE; break; case TABLE: NEEDBITS(14); state->nlen = BITS(5) + 257; DROPBITS(5); state->ndist = BITS(5) + 1; DROPBITS(5); state->ncode = BITS(4) + 4; DROPBITS(4); #ifndef PKZIP_BUG_WORKAROUND if (state->nlen > 286 || state->ndist > 30) { strm->msg = (char *)"too many length or distance symbols"; state->mode = BAD; break; } #endif state->have = 0; state->mode = LENLENS; fallthrough; case LENLENS: while (state->have < state->ncode) { NEEDBITS(3); state->lens[order[state->have++]] = (unsigned short)BITS(3); DROPBITS(3); } while (state->have < 19) state->lens[order[state->have++]] = 0; state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 7; ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid code lengths set"; state->mode = BAD; break; } state->have = 0; state->mode = CODELENS; fallthrough; case CODELENS: while (state->have < state->nlen + state->ndist) { for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.val < 16) { NEEDBITS(this.bits); DROPBITS(this.bits); state->lens[state->have++] = this.val; } else { if (this.val == 16) { NEEDBITS(this.bits + 2); DROPBITS(this.bits); if (state->have == 0) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } len = state->lens[state->have - 1]; copy = 3 + BITS(2); DROPBITS(2); } else if (this.val == 17) { NEEDBITS(this.bits + 3); DROPBITS(this.bits); len = 0; copy = 3 + BITS(3); DROPBITS(3); } else { NEEDBITS(this.bits + 7); DROPBITS(this.bits); len = 0; copy = 11 + BITS(7); DROPBITS(7); } if (state->have + copy > state->nlen + state->ndist) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } while (copy--) state->lens[state->have++] = (unsigned short)len; } } /* handle error breaks in while */ if (state->mode == BAD) break; /* build code tables */ state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 9; ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid literal/lengths set"; state->mode = BAD; break; } state->distcode = (code const *)(state->next); state->distbits = 6; ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist, &(state->next), &(state->distbits), state->work); if (ret) { strm->msg = (char *)"invalid distances set"; state->mode = BAD; break; } state->mode = LEN; fallthrough; case LEN: if (have >= 6 && left >= 258) { RESTORE(); inflate_fast(strm, out); LOAD(); break; } for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.op && (this.op & 0xf0) == 0) { last = this; for (;;) { this = state->lencode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); state->length = (unsigned)this.val; if ((int)(this.op) == 0) { state->mode = LIT; break; } if (this.op & 32) { state->mode = TYPE; break; } if (this.op & 64) { strm->msg = (char *)"invalid literal/length code"; state->mode = BAD; break; } state->extra = (unsigned)(this.op) & 15; state->mode = LENEXT; fallthrough; case LENEXT: if (state->extra) { NEEDBITS(state->extra); state->length += BITS(state->extra); DROPBITS(state->extra); } state->mode = DIST; fallthrough; case DIST: for (;;) { this = state->distcode[BITS(state->distbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if ((this.op & 0xf0) == 0) { last = this; for (;;) { this = state->distcode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); if (this.op & 64) { strm->msg = (char *)"invalid distance code"; state->mode = BAD; break; } state->offset = (unsigned)this.val; state->extra = (unsigned)(this.op) & 15; state->mode = DISTEXT; fallthrough; case DISTEXT: if (state->extra) { NEEDBITS(state->extra); state->offset += BITS(state->extra); DROPBITS(state->extra); } #ifdef INFLATE_STRICT if (state->offset > state->dmax) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } #endif if (state->offset > state->whave + out - left) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } state->mode = MATCH; fallthrough; case MATCH: if (left == 0) goto inf_leave; copy = out - left; if (state->offset > copy) { /* copy from window */ copy = state->offset - copy; if (copy > state->write) { copy -= state->write; from = state->window + (state->wsize - copy); } else from = state->window + (state->write - copy); if (copy > state->length) copy = state->length; } else { /* copy from output */ from = put - state->offset; copy = state->length; } if (copy > left) copy = left; left -= copy; state->length -= copy; do { *put++ = *from++; } while (--copy); if (state->length == 0) state->mode = LEN; break; case LIT: if (left == 0) goto inf_leave; *put++ = (unsigned char)(state->length); left--; state->mode = LEN; break; case CHECK: if (state->wrap) { NEEDBITS(32); out -= left; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && out) strm->adler = state->check = UPDATE(state->check, put - out, out); out = left; if (( REVERSE(hold)) != state->check) { strm->msg = (char *)"incorrect data check"; state->mode = BAD; break; } INITBITS(); } state->mode = DONE; fallthrough; case DONE: ret = Z_STREAM_END; goto inf_leave; case BAD: ret = Z_DATA_ERROR; goto inf_leave; case MEM: return Z_MEM_ERROR; case SYNC: default: return Z_STREAM_ERROR; } /* Return from inflate(), updating the total counts and the check value. If there was no progress during the inflate() call, return a buffer error. Call zlib_updatewindow() to create and/or update the window state. */ inf_leave: RESTORE(); if (INFLATE_NEED_UPDATEWINDOW(strm) && (state->wsize || (state->mode < CHECK && out != strm->avail_out))) zlib_updatewindow(strm, out); in -= strm->avail_in; out -= strm->avail_out; strm->total_in += in; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && state->wrap && out) strm->adler = state->check = UPDATE(state->check, strm->next_out - out, out); strm->data_type = state->bits + (state->last ? 64 : 0) + (state->mode == TYPE ? 128 : 0); if (flush == Z_PACKET_FLUSH && ret == Z_OK && strm->avail_out != 0 && strm->avail_in == 0) return zlib_inflateSyncPacket(strm); if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) ret = Z_BUF_ERROR; return ret; } int zlib_inflateEnd(z_streamp strm) { if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; return Z_OK; } /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; * i.e. no pending output but this should always be the case. The state must * be waiting on the start of a block (i.e. mode == TYPE or HEAD). On exit, * the output will also be caught up, and the checksum will have been updated * if need be. */ int zlib_inflateIncomp(z_stream *z) { struct inflate_state *state = (struct inflate_state *)z->state; Byte *saved_no = z->next_out; uInt saved_ao = z->avail_out; if (state->mode != TYPE && state->mode != HEAD) return Z_DATA_ERROR; /* Setup some variables to allow misuse of updateWindow */ z->avail_out = 0; z->next_out = (unsigned char*)z->next_in + z->avail_in; zlib_updatewindow(z, z->avail_in); /* Restore saved variables */ z->avail_out = saved_ao; z->next_out = saved_no; z->adler = state->check = UPDATE(state->check, z->next_in, z->avail_in); z->total_out += z->avail_in; z->total_in += z->avail_in; z->next_in += z->avail_in; state->total += z->avail_in; z->avail_in = 0; return Z_OK; }
758 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/signalfd.h * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * */ #ifndef _LINUX_SIGNALFD_H #define _LINUX_SIGNALFD_H #include <uapi/linux/signalfd.h> #include <linux/sched/signal.h> #ifdef CONFIG_SIGNALFD /* * Deliver the signal to listening signalfd. */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { if (unlikely(waitqueue_active(&tsk->sighand->signalfd_wqh))) wake_up(&tsk->sighand->signalfd_wqh); } extern void signalfd_cleanup(struct sighand_struct *sighand); #else /* CONFIG_SIGNALFD */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { } static inline void signalfd_cleanup(struct sighand_struct *sighand) { } #endif /* CONFIG_SIGNALFD */ #endif /* _LINUX_SIGNALFD_H */
1 2 3 5 16 16 52 48 45 5 3 45 1 34 10 9 2 2 3 4 1 3 5 3 2 3 1 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 // SPDX-License-Identifier: GPL-2.0 /* * linux/drivers/char/mem.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Added devfs support. * Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu> * Shared /dev/zero mmapping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com> */ #include <linux/mm.h> #include <linux/miscdevice.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mman.h> #include <linux/random.h> #include <linux/init.h> #include <linux/tty.h> #include <linux/capability.h> #include <linux/ptrace.h> #include <linux/device.h> #include <linux/highmem.h> #include <linux/backing-dev.h> #include <linux/shmem_fs.h> #include <linux/splice.h> #include <linux/pfn.h> #include <linux/export.h> #include <linux/io.h> #include <linux/uio.h> #include <linux/uaccess.h> #include <linux/security.h> #define DEVMEM_MINOR 1 #define DEVPORT_MINOR 4 static inline unsigned long size_inside_page(unsigned long start, unsigned long size) { unsigned long sz; sz = PAGE_SIZE - (start & (PAGE_SIZE - 1)); return min(sz, size); } #ifndef ARCH_HAS_VALID_PHYS_ADDR_RANGE static inline int valid_phys_addr_range(phys_addr_t addr, size_t count) { return addr + count <= __pa(high_memory); } static inline int valid_mmap_phys_addr_range(unsigned long pfn, size_t size) { return 1; } #endif #ifdef CONFIG_STRICT_DEVMEM static inline int page_is_allowed(unsigned long pfn) { return devmem_is_allowed(pfn); } #else static inline int page_is_allowed(unsigned long pfn) { return 1; } #endif static inline bool should_stop_iteration(void) { if (need_resched()) cond_resched(); return signal_pending(current); } /* * This funcion reads the *physical* memory. The f_pos points directly to the * memory location. */ static ssize_t read_mem(struct file *file, char __user *buf, size_t count, loff_t *ppos) { phys_addr_t p = *ppos; ssize_t read, sz; void *ptr; char *bounce; int err; if (p != *ppos) return 0; if (!valid_phys_addr_range(p, count)) return -EFAULT; read = 0; #ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED /* we don't have page 0 mapped on sparc and m68k.. */ if (p < PAGE_SIZE) { sz = size_inside_page(p, count); if (sz > 0) { if (clear_user(buf, sz)) return -EFAULT; buf += sz; p += sz; count -= sz; read += sz; } } #endif bounce = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!bounce) return -ENOMEM; while (count > 0) { unsigned long remaining; int allowed, probe; sz = size_inside_page(p, count); err = -EPERM; allowed = page_is_allowed(p >> PAGE_SHIFT); if (!allowed) goto failed; err = -EFAULT; if (allowed == 2) { /* Show zeros for restricted memory. */ remaining = clear_user(buf, sz); } else { /* * On ia64 if a page has been mapped somewhere as * uncached, then it must also be accessed uncached * by the kernel or data corruption may occur. */ ptr = xlate_dev_mem_ptr(p); if (!ptr) goto failed; probe = copy_from_kernel_nofault(bounce, ptr, sz); unxlate_dev_mem_ptr(p, ptr); if (probe) goto failed; remaining = copy_to_user(buf, bounce, sz); } if (remaining) goto failed; buf += sz; p += sz; count -= sz; read += sz; if (should_stop_iteration()) break; } kfree(bounce); *ppos += read; return read; failed: kfree(bounce); return err; } static ssize_t write_mem(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { phys_addr_t p = *ppos; ssize_t written, sz; unsigned long copied; void *ptr; if (p != *ppos) return -EFBIG; if (!valid_phys_addr_range(p, count)) return -EFAULT; written = 0; #ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED /* we don't have page 0 mapped on sparc and m68k.. */ if (p < PAGE_SIZE) { sz = size_inside_page(p, count); /* Hmm. Do something? */ buf += sz; p += sz; count -= sz; written += sz; } #endif while (count > 0) { int allowed; sz = size_inside_page(p, count); allowed = page_is_allowed(p >> PAGE_SHIFT); if (!allowed) return -EPERM; /* Skip actual writing when a page is marked as restricted. */ if (allowed == 1) { /* * On ia64 if a page has been mapped somewhere as * uncached, then it must also be accessed uncached * by the kernel or data corruption may occur. */ ptr = xlate_dev_mem_ptr(p); if (!ptr) { if (written) break; return -EFAULT; } copied = copy_from_user(ptr, buf, sz); unxlate_dev_mem_ptr(p, ptr); if (copied) { written += sz - copied; if (written) break; return -EFAULT; } } buf += sz; p += sz; count -= sz; written += sz; if (should_stop_iteration()) break; } *ppos += written; return written; } int __weak phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { return 1; } #ifndef __HAVE_PHYS_MEM_ACCESS_PROT /* * Architectures vary in how they handle caching for addresses * outside of main memory. * */ #ifdef pgprot_noncached static int uncached_access(struct file *file, phys_addr_t addr) { /* * Accessing memory above the top the kernel knows about or through a * file pointer * that was marked O_DSYNC will be done non-cached. */ if (file->f_flags & O_DSYNC) return 1; return addr >= __pa(high_memory); } #endif static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { #ifdef pgprot_noncached phys_addr_t offset = pfn << PAGE_SHIFT; if (uncached_access(file, offset)) return pgprot_noncached(vma_prot); #endif return vma_prot; } #endif #ifndef CONFIG_MMU static unsigned long get_unmapped_area_mem(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { if (!valid_mmap_phys_addr_range(pgoff, len)) return (unsigned long) -EINVAL; return pgoff << PAGE_SHIFT; } /* permit direct mmap, for read, write or exec */ static unsigned memory_mmap_capabilities(struct file *file) { return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC; } static unsigned zero_mmap_capabilities(struct file *file) { return NOMMU_MAP_COPY; } /* can't do an in-place private mapping if there's no MMU */ static inline int private_mapping_ok(struct vm_area_desc *desc) { return is_nommu_shared_mapping(desc->vm_flags); } #else static inline int private_mapping_ok(struct vm_area_desc *desc) { return 1; } #endif static const struct vm_operations_struct mmap_mem_ops = { #ifdef CONFIG_HAVE_IOREMAP_PROT .access = generic_access_phys #endif }; static int mmap_filter_error(int err) { return -EAGAIN; } static int mmap_mem_prepare(struct vm_area_desc *desc) { struct file *file = desc->file; const size_t size = vma_desc_size(desc); const phys_addr_t offset = (phys_addr_t)desc->pgoff << PAGE_SHIFT; /* Does it even fit in phys_addr_t? */ if (offset >> PAGE_SHIFT != desc->pgoff) return -EINVAL; /* It's illegal to wrap around the end of the physical address space. */ if (offset + (phys_addr_t)size - 1 < offset) return -EINVAL; if (!valid_mmap_phys_addr_range(desc->pgoff, size)) return -EINVAL; if (!private_mapping_ok(desc)) return -ENOSYS; if (!range_is_allowed(desc->pgoff, size)) return -EPERM; if (!phys_mem_access_prot_allowed(file, desc->pgoff, size, &desc->page_prot)) return -EINVAL; desc->page_prot = phys_mem_access_prot(file, desc->pgoff, size, desc->page_prot); desc->vm_ops = &mmap_mem_ops; /* Remap-pfn-range will mark the range VM_IO. */ mmap_action_remap_full(desc, desc->pgoff); /* We filter remap errors to -EAGAIN. */ desc->action.error_hook = mmap_filter_error; return 0; } #ifdef CONFIG_DEVPORT static ssize_t read_port(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned long i = *ppos; char __user *tmp = buf; if (!access_ok(buf, count)) return -EFAULT; while (count-- > 0 && i < 65536) { if (__put_user(inb(i), tmp) < 0) return -EFAULT; i++; tmp++; } *ppos = i; return tmp-buf; } static ssize_t write_port(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { unsigned long i = *ppos; const char __user *tmp = buf; if (!access_ok(buf, count)) return -EFAULT; while (count-- > 0 && i < 65536) { char c; if (__get_user(c, tmp)) { if (tmp > buf) break; return -EFAULT; } outb(c, i); i++; tmp++; } *ppos = i; return tmp-buf; } #endif static ssize_t read_null(struct file *file, char __user *buf, size_t count, loff_t *ppos) { return 0; } static ssize_t write_null(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { return count; } static ssize_t read_iter_null(struct kiocb *iocb, struct iov_iter *to) { return 0; } static ssize_t write_iter_null(struct kiocb *iocb, struct iov_iter *from) { size_t count = iov_iter_count(from); iov_iter_advance(from, count); return count; } static int pipe_to_null(struct pipe_inode_info *info, struct pipe_buffer *buf, struct splice_desc *sd) { return sd->len; } static ssize_t splice_write_null(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_null); } static int uring_cmd_null(struct io_uring_cmd *ioucmd, unsigned int issue_flags) { return 0; } static ssize_t read_iter_zero(struct kiocb *iocb, struct iov_iter *iter) { size_t written = 0; while (iov_iter_count(iter)) { size_t chunk = iov_iter_count(iter), n; if (chunk > PAGE_SIZE) chunk = PAGE_SIZE; /* Just for latency reasons */ n = iov_iter_zero(chunk, iter); if (!n && iov_iter_count(iter)) return written ? written : -EFAULT; written += n; if (signal_pending(current)) return written ? written : -ERESTARTSYS; if (!need_resched()) continue; if (iocb->ki_flags & IOCB_NOWAIT) return written ? written : -EAGAIN; cond_resched(); } return written; } static ssize_t read_zero(struct file *file, char __user *buf, size_t count, loff_t *ppos) { size_t cleared = 0; while (count) { size_t chunk = min_t(size_t, count, PAGE_SIZE); size_t left; left = clear_user(buf + cleared, chunk); if (unlikely(left)) { cleared += (chunk - left); if (!cleared) return -EFAULT; break; } cleared += chunk; count -= chunk; if (signal_pending(current)) break; cond_resched(); } return cleared; } static int mmap_zero_private_success(const struct vm_area_struct *vma) { /* * This is a highly unique situation where we mark a MAP_PRIVATE mapping * of /dev/zero anonymous, despite it not being. */ vma_set_anonymous((struct vm_area_struct *)vma); return 0; } static int mmap_zero_prepare(struct vm_area_desc *desc) { #ifndef CONFIG_MMU return -ENOSYS; #endif if (desc->vm_flags & VM_SHARED) return shmem_zero_setup_desc(desc); desc->action.success_hook = mmap_zero_private_success; return 0; } #ifndef CONFIG_MMU static unsigned long get_unmapped_area_zero(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return -ENOSYS; } #else static unsigned long get_unmapped_area_zero(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { if (flags & MAP_SHARED) { /* * mmap_zero_prepare() will call shmem_zero_setup() to create a * file, so use shmem's get_unmapped_area in case it can be * huge; and pass NULL for file as in mmap.c's * get_unmapped_area(), so as not to confuse shmem with our * handle on "/dev/zero". */ return shmem_get_unmapped_area(NULL, addr, len, pgoff, flags); } /* * Otherwise flags & MAP_PRIVATE: with no shmem object beneath it, * attempt to map aligned to huge page size if possible, otherwise we * fall back to system page size mappings. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE return thp_get_unmapped_area(file, addr, len, pgoff, flags); #else return mm_get_unmapped_area(file, addr, len, pgoff, flags); #endif } #endif /* CONFIG_MMU */ static ssize_t write_full(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { return -ENOSPC; } /* * Special lseek() function for /dev/null and /dev/zero. Most notably, you * can fopen() both devices with "a" now. This was previously impossible. * -- SRB. */ static loff_t null_lseek(struct file *file, loff_t offset, int orig) { return file->f_pos = 0; } /* * The memory devices use the full 32/64 bits of the offset, and so we cannot * check against negative addresses: they are ok. The return value is weird, * though, in that case (0). * * also note that seeking relative to the "end of file" isn't supported: * it has no meaning, so it returns -EINVAL. */ static loff_t memory_lseek(struct file *file, loff_t offset, int orig) { loff_t ret; inode_lock(file_inode(file)); switch (orig) { case SEEK_CUR: offset += file->f_pos; fallthrough; case SEEK_SET: /* to avoid userland mistaking f_pos=-9 as -EBADF=-9 */ if ((unsigned long long)offset >= -MAX_ERRNO) { ret = -EOVERFLOW; break; } file->f_pos = offset; ret = file->f_pos; force_successful_syscall_return(); break; default: ret = -EINVAL; } inode_unlock(file_inode(file)); return ret; } static int open_port(struct inode *inode, struct file *filp) { int rc; if (!capable(CAP_SYS_RAWIO)) return -EPERM; rc = security_locked_down(LOCKDOWN_DEV_MEM); if (rc) return rc; if (iminor(inode) != DEVMEM_MINOR) return 0; /* * Use a unified address space to have a single point to manage * revocations when drivers want to take over a /dev/mem mapped * range. */ filp->f_mapping = iomem_get_mapping(); return 0; } #define zero_lseek null_lseek #define full_lseek null_lseek #define write_zero write_null #define write_iter_zero write_iter_null #define splice_write_zero splice_write_null #define open_mem open_port static const struct file_operations __maybe_unused mem_fops = { .llseek = memory_lseek, .read = read_mem, .write = write_mem, .mmap_prepare = mmap_mem_prepare, .open = open_mem, #ifndef CONFIG_MMU .get_unmapped_area = get_unmapped_area_mem, .mmap_capabilities = memory_mmap_capabilities, #endif .fop_flags = FOP_UNSIGNED_OFFSET, }; static const struct file_operations null_fops = { .llseek = null_lseek, .read = read_null, .write = write_null, .read_iter = read_iter_null, .write_iter = write_iter_null, .splice_write = splice_write_null, .uring_cmd = uring_cmd_null, }; #ifdef CONFIG_DEVPORT static const struct file_operations port_fops = { .llseek = memory_lseek, .read = read_port, .write = write_port, .open = open_port, }; #endif static const struct file_operations zero_fops = { .llseek = zero_lseek, .write = write_zero, .read_iter = read_iter_zero, .read = read_zero, .write_iter = write_iter_zero, .splice_read = copy_splice_read, .splice_write = splice_write_zero, .mmap_prepare = mmap_zero_prepare, .get_unmapped_area = get_unmapped_area_zero, #ifndef CONFIG_MMU .mmap_capabilities = zero_mmap_capabilities, #endif }; static const struct file_operations full_fops = { .llseek = full_lseek, .read_iter = read_iter_zero, .write = write_full, .splice_read = copy_splice_read, }; static const struct memdev { const char *name; const struct file_operations *fops; fmode_t fmode; umode_t mode; } devlist[] = { #ifdef CONFIG_DEVMEM [DEVMEM_MINOR] = { "mem", &mem_fops, 0, 0 }, #endif [3] = { "null", &null_fops, FMODE_NOWAIT, 0666 }, #ifdef CONFIG_DEVPORT [4] = { "port", &port_fops, 0, 0 }, #endif [5] = { "zero", &zero_fops, FMODE_NOWAIT, 0666 }, [7] = { "full", &full_fops, 0, 0666 }, [8] = { "random", &random_fops, FMODE_NOWAIT, 0666 }, [9] = { "urandom", &urandom_fops, FMODE_NOWAIT, 0666 }, #ifdef CONFIG_PRINTK [11] = { "kmsg", &kmsg_fops, 0, 0644 }, #endif }; static int memory_open(struct inode *inode, struct file *filp) { int minor; const struct memdev *dev; minor = iminor(inode); if (minor >= ARRAY_SIZE(devlist)) return -ENXIO; dev = &devlist[minor]; if (!dev->fops) return -ENXIO; filp->f_op = dev->fops; filp->f_mode |= dev->fmode; if (dev->fops->open) return dev->fops->open(inode, filp); return 0; } static const struct file_operations memory_fops = { .open = memory_open, .llseek = noop_llseek, }; static char *mem_devnode(const struct device *dev, umode_t *mode) { if (mode && devlist[MINOR(dev->devt)].mode) *mode = devlist[MINOR(dev->devt)].mode; return NULL; } static const struct class mem_class = { .name = "mem", .devnode = mem_devnode, }; static int __init chr_dev_init(void) { int retval; int minor; if (register_chrdev(MEM_MAJOR, "mem", &memory_fops)) printk("unable to get major %d for memory devs\n", MEM_MAJOR); retval = class_register(&mem_class); if (retval) return retval; for (minor = 1; minor < ARRAY_SIZE(devlist); minor++) { if (!devlist[minor].name) continue; /* * Create /dev/port? */ if ((minor == DEVPORT_MINOR) && !arch_has_dev_port()) continue; device_create(&mem_class, NULL, MKDEV(MEM_MAJOR, minor), NULL, devlist[minor].name); } return tty_init(); } fs_initcall(chr_dev_init);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 // SPDX-License-Identifier: GPL-2.0-only /* * crash.c - kernel crash support code. * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> */ #include <linux/buildid.h> #include <linux/init.h> #include <linux/utsname.h> #include <linux/vmalloc.h> #include <linux/sizes.h> #include <linux/kexec.h> #include <linux/memory.h> #include <linux/cpuhotplug.h> #include <linux/memblock.h> #include <linux/kmemleak.h> #include <asm/page.h> #include <asm/sections.h> #include <crypto/sha1.h> #include "kallsyms_internal.h" #include "kexec_internal.h" /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ static unsigned char *vmcoreinfo_data_safecopy; struct hwerr_info { atomic_t count; time64_t timestamp; }; static struct hwerr_info hwerr_data[HWERR_RECOV_MAX]; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { struct elf_note *note = (struct elf_note *)buf; note->n_namesz = strlen(name) + 1; note->n_descsz = data_len; note->n_type = type; buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); memcpy(buf, name, note->n_namesz); buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); memcpy(buf, data, data_len); buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); return buf; } void final_note(Elf_Word *buf) { memset(buf, 0, sizeof(struct elf_note)); } static void update_vmcoreinfo_note(void) { u32 *buf = vmcoreinfo_note; if (!vmcoreinfo_size) return; buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, vmcoreinfo_size); final_note(buf); } void crash_update_vmcoreinfo_safecopy(void *ptr) { if (ptr) memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); vmcoreinfo_data_safecopy = ptr; } void crash_save_vmcoreinfo(void) { if (!vmcoreinfo_note) return; /* Use the safe copy to generate vmcoreinfo note if have */ if (vmcoreinfo_data_safecopy) vmcoreinfo_data = vmcoreinfo_data_safecopy; vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); update_vmcoreinfo_note(); } void vmcoreinfo_append_str(const char *fmt, ...) { va_list args; char buf[0x50]; size_t r; va_start(args, fmt); r = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); vmcoreinfo_size += r; WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES, "vmcoreinfo data exceeds allocated size, truncating"); } /* * provide an empty default implementation here -- architecture * code may override this */ void __weak arch_crash_save_vmcoreinfo(void) {} phys_addr_t __weak paddr_vmcoreinfo_note(void) { return __pa(vmcoreinfo_note); } EXPORT_SYMBOL(paddr_vmcoreinfo_note); void hwerr_log_error_type(enum hwerr_error_type src) { if (src < 0 || src >= HWERR_RECOV_MAX) return; atomic_inc(&hwerr_data[src].count); WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds()); } EXPORT_SYMBOL_GPL(hwerr_log_error_type); static int __init crash_save_vmcoreinfo_init(void) { vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); if (!vmcoreinfo_data) { pr_warn("Memory allocation for vmcoreinfo_data failed\n"); return -ENOMEM; } vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, GFP_KERNEL | __GFP_ZERO); if (!vmcoreinfo_note) { free_page((unsigned long)vmcoreinfo_data); vmcoreinfo_data = NULL; pr_warn("Memory allocation for vmcoreinfo_note failed\n"); return -ENOMEM; } VMCOREINFO_OSRELEASE(init_uts_ns.name.release); VMCOREINFO_BUILD_ID(); VMCOREINFO_PAGESIZE(PAGE_SIZE); VMCOREINFO_SYMBOL(init_uts_ns); VMCOREINFO_OFFSET(uts_namespace, name); VMCOREINFO_SYMBOL(node_online_map); #ifdef CONFIG_MMU VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START); #ifndef CONFIG_NUMA VMCOREINFO_SYMBOL(mem_map); VMCOREINFO_SYMBOL(contig_page_data); #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP VMCOREINFO_SYMBOL_ARRAY(vmemmap); #endif #ifdef CONFIG_SPARSEMEM VMCOREINFO_SYMBOL_ARRAY(mem_section); VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); VMCOREINFO_NUMBER(SECTION_SIZE_BITS); VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); #endif VMCOREINFO_STRUCT_SIZE(page); VMCOREINFO_STRUCT_SIZE(pglist_data); VMCOREINFO_STRUCT_SIZE(zone); VMCOREINFO_STRUCT_SIZE(free_area); VMCOREINFO_STRUCT_SIZE(list_head); VMCOREINFO_SIZE(nodemask_t); VMCOREINFO_OFFSET(page, flags); VMCOREINFO_OFFSET(page, _refcount); VMCOREINFO_OFFSET(page, mapping); VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); #ifdef CONFIG_FLATMEM VMCOREINFO_OFFSET(pglist_data, node_mem_map); #endif VMCOREINFO_OFFSET(pglist_data, node_start_pfn); VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); VMCOREINFO_OFFSET(pglist_data, node_id); VMCOREINFO_OFFSET(zone, free_area); VMCOREINFO_OFFSET(zone, vm_stat); VMCOREINFO_OFFSET(zone, spanned_pages); VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); log_buf_vmcoreinfo_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); VMCOREINFO_NUMBER(PG_lru); VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); VMCOREINFO_NUMBER(PG_swapbacked); #define PAGE_SLAB_MAPCOUNT_VALUE (PGTY_slab << 24) VMCOREINFO_NUMBER(PAGE_SLAB_MAPCOUNT_VALUE); #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); #endif VMCOREINFO_NUMBER(PG_head_mask); #define PAGE_BUDDY_MAPCOUNT_VALUE (PGTY_buddy << 24) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #define PAGE_HUGETLB_MAPCOUNT_VALUE (PGTY_hugetlb << 24) VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE); #define PAGE_OFFLINE_MAPCOUNT_VALUE (PGTY_offline << 24) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #ifdef CONFIG_UNACCEPTED_MEMORY #define PAGE_UNACCEPTED_MAPCOUNT_VALUE (PGTY_unaccepted << 24) VMCOREINFO_NUMBER(PAGE_UNACCEPTED_MAPCOUNT_VALUE); #endif #ifdef CONFIG_KALLSYMS VMCOREINFO_SYMBOL(kallsyms_names); VMCOREINFO_SYMBOL(kallsyms_num_syms); VMCOREINFO_SYMBOL(kallsyms_token_table); VMCOREINFO_SYMBOL(kallsyms_token_index); VMCOREINFO_SYMBOL(kallsyms_offsets); VMCOREINFO_SYMBOL(kallsyms_relative_base); #endif /* CONFIG_KALLSYMS */ arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); return 0; } subsys_initcall(crash_save_vmcoreinfo_init);
5421 5417 2914 2914 137 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/dir.c - sysfs core and dir operation implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #define pr_fmt(fmt) "sysfs: " fmt #include <linux/fs.h> #include <linux/kobject.h> #include <linux/slab.h> #include "sysfs.h" DEFINE_SPINLOCK(sysfs_symlink_target_lock); void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { char *buf; buf = kzalloc(PATH_MAX, GFP_KERNEL); if (buf) kernfs_path(parent, buf, PATH_MAX); pr_warn("cannot create duplicate filename '%s/%s'\n", buf, name); dump_stack(); kfree(buf); } /** * sysfs_create_dir_ns - create a directory for an object with a namespace tag * @kobj: object we're creating directory for * @ns: the namespace tag to use */ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { struct kernfs_node *parent, *kn; kuid_t uid; kgid_t gid; if (WARN_ON(!kobj)) return -EINVAL; if (kobj->parent) parent = kobj->parent->sd; else parent = sysfs_root_kn; if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); kn = kernfs_create_dir_ns(parent, kobject_name(kobj), 0755, uid, gid, kobj, ns); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, kobject_name(kobj)); return PTR_ERR(kn); } kobj->sd = kn; return 0; } /** * sysfs_remove_dir - remove an object's directory. * @kobj: object. * * The only thing special about this is that we remove any files in * the directory before we remove the directory, and we've inlined * what used to be sysfs_rmdir() below, instead of calling separately. */ void sysfs_remove_dir(struct kobject *kobj) { struct kernfs_node *kn = kobj->sd; /* * In general, kobject owner is responsible for ensuring removal * doesn't race with other operations and sysfs doesn't provide any * protection; however, when @kobj is used as a symlink target, the * symlinking entity usually doesn't own @kobj and thus has no * control over removal. @kobj->sd may be removed anytime * and symlink code may end up dereferencing an already freed node. * * sysfs_symlink_target_lock synchronizes @kobj->sd * disassociation against symlink operations so that symlink code * can safely dereference @kobj->sd. */ spin_lock(&sysfs_symlink_target_lock); kobj->sd = NULL; spin_unlock(&sysfs_symlink_target_lock); if (kn) { WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); kernfs_remove(kn); } } int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { struct kernfs_node *parent; int ret; parent = kernfs_get_parent(kobj->sd); ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); kernfs_put(parent); return ret; } int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { struct kernfs_node *kn = kobj->sd; struct kernfs_node *new_parent; new_parent = new_parent_kobj && new_parent_kobj->sd ? new_parent_kobj->sd : sysfs_root_kn; return kernfs_rename_ns(kn, new_parent, NULL, new_ns); } /** * sysfs_create_mount_point - create an always empty directory * @parent_kobj: kobject that will contain this always empty directory * @name: The name of the always empty directory to add */ int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { struct kernfs_node *kn, *parent = parent_kobj->sd; kn = kernfs_create_empty_dir(parent, name); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, name); return PTR_ERR(kn); } return 0; } EXPORT_SYMBOL_GPL(sysfs_create_mount_point); /** * sysfs_remove_mount_point - remove an always empty directory. * @parent_kobj: kobject that will contain this always empty directory * @name: The name of the always empty directory to remove * */ void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { struct kernfs_node *parent = parent_kobj->sd; kernfs_remove_by_name_ns(parent, name, NULL); } EXPORT_SYMBOL_GPL(sysfs_remove_mount_point);
6 6 9 16 13 7 1 3 2 13 9 37 15 10 10 6 6 6 6 6 6 17 9 9 9 10 9 9 9 9 9 9 10 3 8 4 6 10 10 5 5 1 4 5 16 16 16 2 4 9 9 16 15 5 5 9 6 96 34 35 112 112 1 1 1 1 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright(c) 1999 - 2004 Intel Corporation. All rights reserved. */ #include <linux/skbuff.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <linux/ethtool.h> #include <linux/etherdevice.h> #include <linux/if_bonding.h> #include <linux/pkt_sched.h> #include <net/net_namespace.h> #include <net/bonding.h> #include <net/bond_3ad.h> #include <net/netlink.h> /* General definitions */ #define AD_SHORT_TIMEOUT 1 #define AD_LONG_TIMEOUT 0 #define AD_STANDBY 0x2 #define AD_MAX_TX_IN_SECOND 3 #define AD_COLLECTOR_MAX_DELAY 0 /* Timer definitions (43.4.4 in the 802.3ad standard) */ #define AD_FAST_PERIODIC_TIME 1 #define AD_SLOW_PERIODIC_TIME 30 #define AD_SHORT_TIMEOUT_TIME (3*AD_FAST_PERIODIC_TIME) #define AD_LONG_TIMEOUT_TIME (3*AD_SLOW_PERIODIC_TIME) #define AD_CHURN_DETECTION_TIME 60 #define AD_AGGREGATE_WAIT_TIME 2 /* Port Variables definitions used by the State Machines (43.4.7 in the * 802.3ad standard) */ #define AD_PORT_BEGIN 0x1 #define AD_PORT_LACP_ENABLED 0x2 #define AD_PORT_ACTOR_CHURN 0x4 #define AD_PORT_PARTNER_CHURN 0x8 #define AD_PORT_READY 0x10 #define AD_PORT_READY_N 0x20 #define AD_PORT_MATCHED 0x40 #define AD_PORT_STANDBY 0x80 #define AD_PORT_SELECTED 0x100 #define AD_PORT_MOVED 0x200 #define AD_PORT_CHURNED (AD_PORT_ACTOR_CHURN | AD_PORT_PARTNER_CHURN) /* Port Key definitions * key is determined according to the link speed, duplex and * user key (which is yet not supported) * -------------------------------------------------------------- * Port key | User key (10 bits) | Speed (5 bits) | Duplex| * -------------------------------------------------------------- * |15 6|5 1|0 */ #define AD_DUPLEX_KEY_MASKS 0x1 #define AD_SPEED_KEY_MASKS 0x3E #define AD_USER_KEY_MASKS 0xFFC0 enum ad_link_speed_type { AD_LINK_SPEED_1MBPS = 1, AD_LINK_SPEED_10MBPS, AD_LINK_SPEED_100MBPS, AD_LINK_SPEED_1000MBPS, AD_LINK_SPEED_2500MBPS, AD_LINK_SPEED_5000MBPS, AD_LINK_SPEED_10000MBPS, AD_LINK_SPEED_14000MBPS, AD_LINK_SPEED_20000MBPS, AD_LINK_SPEED_25000MBPS, AD_LINK_SPEED_40000MBPS, AD_LINK_SPEED_50000MBPS, AD_LINK_SPEED_56000MBPS, AD_LINK_SPEED_100000MBPS, AD_LINK_SPEED_200000MBPS, AD_LINK_SPEED_400000MBPS, AD_LINK_SPEED_800000MBPS, AD_LINK_SPEED_1600000MBPS, }; /* compare MAC addresses */ #define MAC_ADDRESS_EQUAL(A, B) \ ether_addr_equal_64bits((const u8 *)A, (const u8 *)B) static const u16 ad_ticks_per_sec = 1000 / AD_TIMER_INTERVAL; static const int ad_delta_in_ticks = (AD_TIMER_INTERVAL * HZ) / 1000; const u8 lacpdu_mcast_addr[ETH_ALEN + 2] __long_aligned = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x02 }; /* ================= main 802.3ad protocol functions ================== */ static int ad_lacpdu_send(struct port *port); static int ad_marker_send(struct port *port, struct bond_marker *marker); static void ad_mux_machine(struct port *port, bool *update_slave_arr); static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port); static void ad_tx_machine(struct port *port); static void ad_periodic_machine(struct port *port); static void ad_port_selection_logic(struct port *port, bool *update_slave_arr); static void ad_agg_selection_logic(struct aggregator *aggregator, bool *update_slave_arr); static void ad_clear_agg(struct aggregator *aggregator); static void ad_initialize_agg(struct aggregator *aggregator); static void ad_initialize_port(struct port *port, const struct bond_params *bond_params); static void ad_enable_collecting(struct port *port); static void ad_disable_distributing(struct port *port, bool *update_slave_arr); static void ad_enable_collecting_distributing(struct port *port, bool *update_slave_arr); static void ad_disable_collecting_distributing(struct port *port, bool *update_slave_arr); static void ad_marker_info_received(struct bond_marker *marker_info, struct port *port); static void ad_marker_response_received(struct bond_marker *marker, struct port *port); static void ad_update_actor_keys(struct port *port, bool reset); /* ================= api to bonding and kernel code ================== */ /** * __get_bond_by_port - get the port's bonding struct * @port: the port we're looking at * * Return @port's bonding struct, or %NULL if it can't be found. */ static inline struct bonding *__get_bond_by_port(struct port *port) { if (port->slave == NULL) return NULL; return bond_get_bond_by_slave(port->slave); } /** * __get_first_agg - get the first aggregator in the bond * @port: the port we're looking at * * Return the aggregator of the first slave in @bond, or %NULL if it can't be * found. * The caller must hold RCU or RTNL lock. */ static inline struct aggregator *__get_first_agg(struct port *port) { struct bonding *bond = __get_bond_by_port(port); struct slave *first_slave; struct aggregator *agg; /* If there's no bond for this port, or bond has no slaves */ if (bond == NULL) return NULL; rcu_read_lock(); first_slave = bond_first_slave_rcu(bond); agg = first_slave ? &(SLAVE_AD_INFO(first_slave)->aggregator) : NULL; rcu_read_unlock(); return agg; } /** * __agg_has_partner - see if we have a partner * @agg: the agregator we're looking at * * Return nonzero if aggregator has a partner (denoted by a non-zero ether * address for the partner). Return 0 if not. */ static inline int __agg_has_partner(struct aggregator *agg) { return !is_zero_ether_addr(agg->partner_system.mac_addr_value); } /** * __disable_distributing_port - disable the port's slave for distributing. * Port will still be able to collect. * @port: the port we're looking at * * This will disable only distributing on the port's slave. */ static void __disable_distributing_port(struct port *port) { bond_set_slave_tx_disabled_flags(port->slave, BOND_SLAVE_NOTIFY_LATER); } /** * __enable_collecting_port - enable the port's slave for collecting, * if it's up * @port: the port we're looking at * * This will enable only collecting on the port's slave. */ static void __enable_collecting_port(struct port *port) { struct slave *slave = port->slave; if (slave->link == BOND_LINK_UP && bond_slave_is_up(slave)) bond_set_slave_rx_enabled_flags(slave, BOND_SLAVE_NOTIFY_LATER); } /** * __disable_port - disable the port's slave * @port: the port we're looking at * * This will disable both collecting and distributing on the port's slave. */ static inline void __disable_port(struct port *port) { bond_set_slave_inactive_flags(port->slave, BOND_SLAVE_NOTIFY_LATER); } /** * __enable_port - enable the port's slave, if it's up * @port: the port we're looking at * * This will enable both collecting and distributing on the port's slave. */ static inline void __enable_port(struct port *port) { struct slave *slave = port->slave; if ((slave->link == BOND_LINK_UP) && bond_slave_is_up(slave)) bond_set_slave_active_flags(slave, BOND_SLAVE_NOTIFY_LATER); } /** * __port_move_to_attached_state - check if port should transition back to attached * state. * @port: the port we're looking at */ static bool __port_move_to_attached_state(struct port *port) { if (!(port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY) || !(port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) || !(port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) port->sm_mux_state = AD_MUX_ATTACHED; return port->sm_mux_state == AD_MUX_ATTACHED; } /** * __port_is_collecting_distributing - check if the port's slave is in the * combined collecting/distributing state * @port: the port we're looking at */ static int __port_is_collecting_distributing(struct port *port) { return bond_is_active_slave(port->slave); } /** * __get_agg_selection_mode - get the aggregator selection mode * @port: the port we're looking at * * Get the aggregator selection mode. Can be %STABLE, %BANDWIDTH or %COUNT. */ static inline u32 __get_agg_selection_mode(struct port *port) { struct bonding *bond = __get_bond_by_port(port); if (bond == NULL) return BOND_AD_STABLE; return bond->params.ad_select; } /** * __check_agg_selection_timer - check if the selection timer has expired * @port: the port we're looking at */ static inline int __check_agg_selection_timer(struct port *port) { struct bonding *bond = __get_bond_by_port(port); if (bond == NULL) return 0; return atomic_read(&BOND_AD_INFO(bond).agg_select_timer) ? 1 : 0; } /** * __get_link_speed - get a port's speed * @port: the port we're looking at * * Return @port's speed in 802.3ad enum format. i.e. one of: * 0, * %AD_LINK_SPEED_10MBPS, * %AD_LINK_SPEED_100MBPS, * %AD_LINK_SPEED_1000MBPS, * %AD_LINK_SPEED_2500MBPS, * %AD_LINK_SPEED_5000MBPS, * %AD_LINK_SPEED_10000MBPS * %AD_LINK_SPEED_14000MBPS, * %AD_LINK_SPEED_20000MBPS * %AD_LINK_SPEED_25000MBPS * %AD_LINK_SPEED_40000MBPS * %AD_LINK_SPEED_50000MBPS * %AD_LINK_SPEED_56000MBPS * %AD_LINK_SPEED_100000MBPS * %AD_LINK_SPEED_200000MBPS * %AD_LINK_SPEED_400000MBPS * %AD_LINK_SPEED_800000MBPS * %AD_LINK_SPEED_1600000MBPS */ static u16 __get_link_speed(struct port *port) { struct slave *slave = port->slave; u16 speed; /* this if covers only a special case: when the configuration starts * with link down, it sets the speed to 0. * This is done in spite of the fact that the e100 driver reports 0 * to be compatible with MVT in the future. */ if (slave->link != BOND_LINK_UP) speed = 0; else { switch (slave->speed) { case SPEED_10: speed = AD_LINK_SPEED_10MBPS; break; case SPEED_100: speed = AD_LINK_SPEED_100MBPS; break; case SPEED_1000: speed = AD_LINK_SPEED_1000MBPS; break; case SPEED_2500: speed = AD_LINK_SPEED_2500MBPS; break; case SPEED_5000: speed = AD_LINK_SPEED_5000MBPS; break; case SPEED_10000: speed = AD_LINK_SPEED_10000MBPS; break; case SPEED_14000: speed = AD_LINK_SPEED_14000MBPS; break; case SPEED_20000: speed = AD_LINK_SPEED_20000MBPS; break; case SPEED_25000: speed = AD_LINK_SPEED_25000MBPS; break; case SPEED_40000: speed = AD_LINK_SPEED_40000MBPS; break; case SPEED_50000: speed = AD_LINK_SPEED_50000MBPS; break; case SPEED_56000: speed = AD_LINK_SPEED_56000MBPS; break; case SPEED_100000: speed = AD_LINK_SPEED_100000MBPS; break; case SPEED_200000: speed = AD_LINK_SPEED_200000MBPS; break; case SPEED_400000: speed = AD_LINK_SPEED_400000MBPS; break; case SPEED_800000: speed = AD_LINK_SPEED_800000MBPS; break; case SPEED_1600000: speed = AD_LINK_SPEED_1600000MBPS; break; default: /* unknown speed value from ethtool. shouldn't happen */ if (slave->speed != SPEED_UNKNOWN) pr_err_once("%s: (slave %s): unknown ethtool speed (%d) for port %d (set it to 0)\n", slave->bond->dev->name, slave->dev->name, slave->speed, port->actor_port_number); speed = 0; break; } } slave_dbg(slave->bond->dev, slave->dev, "Port %d Received link speed %d update from adapter\n", port->actor_port_number, speed); return speed; } /** * __get_duplex - get a port's duplex * @port: the port we're looking at * * Return @port's duplex in 802.3ad bitmask format. i.e.: * 0x01 if in full duplex * 0x00 otherwise */ static u8 __get_duplex(struct port *port) { struct slave *slave = port->slave; u8 retval = 0x0; /* handling a special case: when the configuration starts with * link down, it sets the duplex to 0. */ if (slave->link == BOND_LINK_UP) { switch (slave->duplex) { case DUPLEX_FULL: retval = 0x1; slave_dbg(slave->bond->dev, slave->dev, "Port %d Received status full duplex update from adapter\n", port->actor_port_number); break; case DUPLEX_HALF: default: retval = 0x0; slave_dbg(slave->bond->dev, slave->dev, "Port %d Received status NOT full duplex update from adapter\n", port->actor_port_number); break; } } return retval; } static void __ad_actor_update_port(struct port *port) { const struct bonding *bond = bond_get_bond_by_slave(port->slave); port->actor_system = BOND_AD_INFO(bond).system.sys_mac_addr; port->actor_system_priority = BOND_AD_INFO(bond).system.sys_priority; port->actor_port_priority = SLAVE_AD_INFO(port->slave)->port_priority; } /* Conversions */ /** * __ad_timer_to_ticks - convert a given timer type to AD module ticks * @timer_type: which timer to operate * @par: timer parameter. see below * * If @timer_type is %current_while_timer, @par indicates long/short timer. * If @timer_type is %periodic_timer, @par is one of %FAST_PERIODIC_TIME, * %SLOW_PERIODIC_TIME. */ static u16 __ad_timer_to_ticks(u16 timer_type, u16 par) { u16 retval = 0; /* to silence the compiler */ switch (timer_type) { case AD_CURRENT_WHILE_TIMER: /* for rx machine usage */ if (par) retval = (AD_SHORT_TIMEOUT_TIME*ad_ticks_per_sec); else retval = (AD_LONG_TIMEOUT_TIME*ad_ticks_per_sec); break; case AD_ACTOR_CHURN_TIMER: /* for local churn machine */ retval = (AD_CHURN_DETECTION_TIME*ad_ticks_per_sec); break; case AD_PERIODIC_TIMER: /* for periodic machine */ retval = (par*ad_ticks_per_sec); /* long timeout */ break; case AD_PARTNER_CHURN_TIMER: /* for remote churn machine */ retval = (AD_CHURN_DETECTION_TIME*ad_ticks_per_sec); break; case AD_WAIT_WHILE_TIMER: /* for selection machine */ retval = (AD_AGGREGATE_WAIT_TIME*ad_ticks_per_sec); break; } return retval; } /* ================= ad_rx_machine helper functions ================== */ /** * __choose_matched - update a port's matched variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Update the value of the matched variable, using parameter values from a * newly received lacpdu. Parameter values for the partner carried in the * received PDU are compared with the corresponding operational parameter * values for the actor. Matched is set to TRUE if all of these parameters * match and the PDU parameter partner_state.aggregation has the same value as * actor_oper_port_state.aggregation and lacp will actively maintain the link * in the aggregation. Matched is also set to TRUE if the value of * actor_state.aggregation in the received PDU is set to FALSE, i.e., indicates * an individual link and lacp will actively maintain the link. Otherwise, * matched is set to FALSE. LACP is considered to be actively maintaining the * link if either the PDU's actor_state.lacp_activity variable is TRUE or both * the actor's actor_oper_port_state.lacp_activity and the PDU's * partner_state.lacp_activity variables are TRUE. * * Note: the AD_PORT_MATCHED "variable" is not specified by 802.3ad; it is * used here to implement the language from 802.3ad 43.4.9 that requires * recordPDU to "match" the LACPDU parameters to the stored values. */ static void __choose_matched(struct lacpdu *lacpdu, struct port *port) { /* check if all parameters are alike * or this is individual link(aggregation == FALSE) * then update the state machine Matched variable. */ if (((ntohs(lacpdu->partner_port) == port->actor_port_number) && (ntohs(lacpdu->partner_port_priority) == port->actor_port_priority) && MAC_ADDRESS_EQUAL(&(lacpdu->partner_system), &(port->actor_system)) && (ntohs(lacpdu->partner_system_priority) == port->actor_system_priority) && (ntohs(lacpdu->partner_key) == port->actor_oper_port_key) && ((lacpdu->partner_state & LACP_STATE_AGGREGATION) == (port->actor_oper_port_state & LACP_STATE_AGGREGATION))) || ((lacpdu->actor_state & LACP_STATE_AGGREGATION) == 0) ) { port->sm_vars |= AD_PORT_MATCHED; } else { port->sm_vars &= ~AD_PORT_MATCHED; } } /** * __record_pdu - record parameters from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Record the parameter values for the Actor carried in a received lacpdu as * the current partner operational parameter values and sets * actor_oper_port_state.defaulted to FALSE. */ static void __record_pdu(struct lacpdu *lacpdu, struct port *port) { if (lacpdu && port) { struct port_params *partner = &port->partner_oper; __choose_matched(lacpdu, port); /* record the new parameter values for the partner * operational */ partner->port_number = ntohs(lacpdu->actor_port); partner->port_priority = ntohs(lacpdu->actor_port_priority); partner->system = lacpdu->actor_system; partner->system_priority = ntohs(lacpdu->actor_system_priority); partner->key = ntohs(lacpdu->actor_key); partner->port_state = lacpdu->actor_state; /* set actor_oper_port_state.defaulted to FALSE */ port->actor_oper_port_state &= ~LACP_STATE_DEFAULTED; /* set the partner sync. to on if the partner is sync, * and the port is matched */ if ((port->sm_vars & AD_PORT_MATCHED) && (lacpdu->actor_state & LACP_STATE_SYNCHRONIZATION)) { partner->port_state |= LACP_STATE_SYNCHRONIZATION; slave_dbg(port->slave->bond->dev, port->slave->dev, "partner sync=1\n"); } else { partner->port_state &= ~LACP_STATE_SYNCHRONIZATION; slave_dbg(port->slave->bond->dev, port->slave->dev, "partner sync=0\n"); } } } /** * __record_default - record default parameters * @port: the port we're looking at * * This function records the default parameter values for the partner carried * in the Partner Admin parameters as the current partner operational parameter * values and sets actor_oper_port_state.defaulted to TRUE. */ static void __record_default(struct port *port) { if (port) { /* record the partner admin parameters */ memcpy(&port->partner_oper, &port->partner_admin, sizeof(struct port_params)); /* set actor_oper_port_state.defaulted to true */ port->actor_oper_port_state |= LACP_STATE_DEFAULTED; } } /** * __update_selected - update a port's Selected variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Update the value of the selected variable, using parameter values from a * newly received lacpdu. The parameter values for the Actor carried in the * received PDU are compared with the corresponding operational parameter * values for the ports partner. If one or more of the comparisons shows that * the value(s) received in the PDU differ from the current operational values, * then selected is set to FALSE and actor_oper_port_state.synchronization is * set to out_of_sync. Otherwise, selected remains unchanged. */ static void __update_selected(struct lacpdu *lacpdu, struct port *port) { if (lacpdu && port) { const struct port_params *partner = &port->partner_oper; /* check if any parameter is different then * update the state machine selected variable. */ if (ntohs(lacpdu->actor_port) != partner->port_number || ntohs(lacpdu->actor_port_priority) != partner->port_priority || !MAC_ADDRESS_EQUAL(&lacpdu->actor_system, &partner->system) || ntohs(lacpdu->actor_system_priority) != partner->system_priority || ntohs(lacpdu->actor_key) != partner->key || (lacpdu->actor_state & LACP_STATE_AGGREGATION) != (partner->port_state & LACP_STATE_AGGREGATION)) { port->sm_vars &= ~AD_PORT_SELECTED; } } } /** * __update_default_selected - update a port's Selected variable from Partner * @port: the port we're looking at * * This function updates the value of the selected variable, using the partner * administrative parameter values. The administrative values are compared with * the corresponding operational parameter values for the partner. If one or * more of the comparisons shows that the administrative value(s) differ from * the current operational values, then Selected is set to FALSE and * actor_oper_port_state.synchronization is set to OUT_OF_SYNC. Otherwise, * Selected remains unchanged. */ static void __update_default_selected(struct port *port) { if (port) { const struct port_params *admin = &port->partner_admin; const struct port_params *oper = &port->partner_oper; /* check if any parameter is different then * update the state machine selected variable. */ if (admin->port_number != oper->port_number || admin->port_priority != oper->port_priority || !MAC_ADDRESS_EQUAL(&admin->system, &oper->system) || admin->system_priority != oper->system_priority || admin->key != oper->key || (admin->port_state & LACP_STATE_AGGREGATION) != (oper->port_state & LACP_STATE_AGGREGATION)) { port->sm_vars &= ~AD_PORT_SELECTED; } } } /** * __update_ntt - update a port's ntt variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Updates the value of the ntt variable, using parameter values from a newly * received lacpdu. The parameter values for the partner carried in the * received PDU are compared with the corresponding operational parameter * values for the Actor. If one or more of the comparisons shows that the * value(s) received in the PDU differ from the current operational values, * then ntt is set to TRUE. Otherwise, ntt remains unchanged. */ static void __update_ntt(struct lacpdu *lacpdu, struct port *port) { /* validate lacpdu and port */ if (lacpdu && port) { /* check if any parameter is different then * update the port->ntt. */ if ((ntohs(lacpdu->partner_port) != port->actor_port_number) || (ntohs(lacpdu->partner_port_priority) != port->actor_port_priority) || !MAC_ADDRESS_EQUAL(&(lacpdu->partner_system), &(port->actor_system)) || (ntohs(lacpdu->partner_system_priority) != port->actor_system_priority) || (ntohs(lacpdu->partner_key) != port->actor_oper_port_key) || ((lacpdu->partner_state & LACP_STATE_LACP_ACTIVITY) != (port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY)) || ((lacpdu->partner_state & LACP_STATE_LACP_TIMEOUT) != (port->actor_oper_port_state & LACP_STATE_LACP_TIMEOUT)) || ((lacpdu->partner_state & LACP_STATE_SYNCHRONIZATION) != (port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) || ((lacpdu->partner_state & LACP_STATE_AGGREGATION) != (port->actor_oper_port_state & LACP_STATE_AGGREGATION)) ) { port->ntt = true; } } } /** * __agg_ports_are_ready - check if all ports in an aggregator are ready * @aggregator: the aggregator we're looking at * */ static int __agg_ports_are_ready(struct aggregator *aggregator) { struct port *port; int retval = 1; if (aggregator) { /* scan all ports in this aggregator to verfy if they are * all ready. */ for (port = aggregator->lag_ports; port; port = port->next_port_in_aggregator) { if (!(port->sm_vars & AD_PORT_READY_N)) { retval = 0; break; } } } return retval; } /** * __set_agg_ports_ready - set value of Ready bit in all ports of an aggregator * @aggregator: the aggregator we're looking at * @val: Should the ports' ready bit be set on or off * */ static void __set_agg_ports_ready(struct aggregator *aggregator, int val) { struct port *port; for (port = aggregator->lag_ports; port; port = port->next_port_in_aggregator) { if (val) port->sm_vars |= AD_PORT_READY; else port->sm_vars &= ~AD_PORT_READY; } } static int __agg_active_ports(struct aggregator *agg) { struct port *port; int active = 0; for (port = agg->lag_ports; port; port = port->next_port_in_aggregator) { if (port->is_enabled) active++; } return active; } static unsigned int __agg_ports_priority(const struct aggregator *agg) { struct port *port = agg->lag_ports; unsigned int prio = 0; for (; port; port = port->next_port_in_aggregator) if (port->is_enabled) prio += port->actor_port_priority; return prio; } /** * __get_agg_bandwidth - get the total bandwidth of an aggregator * @aggregator: the aggregator we're looking at * */ static u32 __get_agg_bandwidth(struct aggregator *aggregator) { int nports = __agg_active_ports(aggregator); u32 bandwidth = 0; if (nports) { switch (__get_link_speed(aggregator->lag_ports)) { case AD_LINK_SPEED_1MBPS: bandwidth = nports; break; case AD_LINK_SPEED_10MBPS: bandwidth = nports * 10; break; case AD_LINK_SPEED_100MBPS: bandwidth = nports * 100; break; case AD_LINK_SPEED_1000MBPS: bandwidth = nports * 1000; break; case AD_LINK_SPEED_2500MBPS: bandwidth = nports * 2500; break; case AD_LINK_SPEED_5000MBPS: bandwidth = nports * 5000; break; case AD_LINK_SPEED_10000MBPS: bandwidth = nports * 10000; break; case AD_LINK_SPEED_14000MBPS: bandwidth = nports * 14000; break; case AD_LINK_SPEED_20000MBPS: bandwidth = nports * 20000; break; case AD_LINK_SPEED_25000MBPS: bandwidth = nports * 25000; break; case AD_LINK_SPEED_40000MBPS: bandwidth = nports * 40000; break; case AD_LINK_SPEED_50000MBPS: bandwidth = nports * 50000; break; case AD_LINK_SPEED_56000MBPS: bandwidth = nports * 56000; break; case AD_LINK_SPEED_100000MBPS: bandwidth = nports * 100000; break; case AD_LINK_SPEED_200000MBPS: bandwidth = nports * 200000; break; case AD_LINK_SPEED_400000MBPS: bandwidth = nports * 400000; break; case AD_LINK_SPEED_800000MBPS: bandwidth = nports * 800000; break; case AD_LINK_SPEED_1600000MBPS: bandwidth = nports * 1600000; break; default: bandwidth = 0; /* to silence the compiler */ } } return bandwidth; } /** * __get_active_agg - get the current active aggregator * @aggregator: the aggregator we're looking at * * Caller must hold RCU lock. */ static struct aggregator *__get_active_agg(struct aggregator *aggregator) { struct bonding *bond = aggregator->slave->bond; struct list_head *iter; struct slave *slave; bond_for_each_slave_rcu(bond, slave, iter) if (SLAVE_AD_INFO(slave)->aggregator.is_active) return &(SLAVE_AD_INFO(slave)->aggregator); return NULL; } /** * __update_lacpdu_from_port - update a port's lacpdu fields * @port: the port we're looking at */ static inline void __update_lacpdu_from_port(struct port *port) { struct lacpdu *lacpdu = &port->lacpdu; const struct port_params *partner = &port->partner_oper; /* update current actual Actor parameters * lacpdu->subtype initialized * lacpdu->version_number initialized * lacpdu->tlv_type_actor_info initialized * lacpdu->actor_information_length initialized */ lacpdu->actor_system_priority = htons(port->actor_system_priority); lacpdu->actor_system = port->actor_system; lacpdu->actor_key = htons(port->actor_oper_port_key); lacpdu->actor_port_priority = htons(port->actor_port_priority); lacpdu->actor_port = htons(port->actor_port_number); lacpdu->actor_state = port->actor_oper_port_state; slave_dbg(port->slave->bond->dev, port->slave->dev, "update lacpdu: actor port state %x\n", port->actor_oper_port_state); /* lacpdu->reserved_3_1 initialized * lacpdu->tlv_type_partner_info initialized * lacpdu->partner_information_length initialized */ lacpdu->partner_system_priority = htons(partner->system_priority); lacpdu->partner_system = partner->system; lacpdu->partner_key = htons(partner->key); lacpdu->partner_port_priority = htons(partner->port_priority); lacpdu->partner_port = htons(partner->port_number); lacpdu->partner_state = partner->port_state; /* lacpdu->reserved_3_2 initialized * lacpdu->tlv_type_collector_info initialized * lacpdu->collector_information_length initialized * collector_max_delay initialized * reserved_12[12] initialized * tlv_type_terminator initialized * terminator_length initialized * reserved_50[50] initialized */ } /* ================= main 802.3ad protocol code ========================= */ /** * ad_lacpdu_send - send out a lacpdu packet on a given port * @port: the port we're looking at * * Returns: 0 on success * < 0 on error */ static int ad_lacpdu_send(struct port *port) { struct slave *slave = port->slave; struct sk_buff *skb; struct lacpdu_header *lacpdu_header; int length = sizeof(struct lacpdu_header); skb = dev_alloc_skb(length); if (!skb) return -ENOMEM; atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.lacpdu_tx); skb->dev = slave->dev; skb_reset_mac_header(skb); skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = PKT_TYPE_LACPDU; skb->priority = TC_PRIO_CONTROL; lacpdu_header = skb_put(skb, length); ether_addr_copy(lacpdu_header->hdr.h_dest, lacpdu_mcast_addr); /* Note: source address is set to be the member's PERMANENT address, * because we use it to identify loopback lacpdus in receive. */ ether_addr_copy(lacpdu_header->hdr.h_source, slave->perm_hwaddr); lacpdu_header->hdr.h_proto = PKT_TYPE_LACPDU; lacpdu_header->lacpdu = port->lacpdu; dev_queue_xmit(skb); return 0; } /** * ad_marker_send - send marker information/response on a given port * @port: the port we're looking at * @marker: marker data to send * * Returns: 0 on success * < 0 on error */ static int ad_marker_send(struct port *port, struct bond_marker *marker) { struct slave *slave = port->slave; struct sk_buff *skb; struct bond_marker_header *marker_header; int length = sizeof(struct bond_marker_header); skb = dev_alloc_skb(length + 16); if (!skb) return -ENOMEM; switch (marker->tlv_type) { case AD_MARKER_INFORMATION_SUBTYPE: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.marker_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.marker_tx); break; case AD_MARKER_RESPONSE_SUBTYPE: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.marker_resp_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.marker_resp_tx); break; } skb_reserve(skb, 16); skb->dev = slave->dev; skb_reset_mac_header(skb); skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = PKT_TYPE_LACPDU; marker_header = skb_put(skb, length); ether_addr_copy(marker_header->hdr.h_dest, lacpdu_mcast_addr); /* Note: source address is set to be the member's PERMANENT address, * because we use it to identify loopback MARKERs in receive. */ ether_addr_copy(marker_header->hdr.h_source, slave->perm_hwaddr); marker_header->hdr.h_proto = PKT_TYPE_LACPDU; marker_header->marker = *marker; dev_queue_xmit(skb); return 0; } static void ad_cond_set_peer_notif(struct port *port) { struct bonding *bond = port->slave->bond; if (bond->params.broadcast_neighbor && rtnl_trylock()) { bond->send_peer_notif = bond->params.num_peer_notif * max(1, bond->params.peer_notif_delay); rtnl_unlock(); } } /** * ad_mux_machine - handle a port's mux state machine * @port: the port we're looking at * @update_slave_arr: Does slave array need update? */ static void ad_mux_machine(struct port *port, bool *update_slave_arr) { struct bonding *bond = __get_bond_by_port(port); mux_states_t last_state; /* keep current State Machine state to compare later if it was * changed */ last_state = port->sm_mux_state; if (port->sm_vars & AD_PORT_BEGIN) { port->sm_mux_state = AD_MUX_DETACHED; } else { switch (port->sm_mux_state) { case AD_MUX_DETACHED: if ((port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY)) /* if SELECTED or STANDBY */ port->sm_mux_state = AD_MUX_WAITING; break; case AD_MUX_WAITING: /* if SELECTED == FALSE return to DETACH state */ if (!(port->sm_vars & AD_PORT_SELECTED)) { port->sm_vars &= ~AD_PORT_READY_N; /* in order to withhold the Selection Logic to * check all ports READY_N value every callback * cycle to update ready variable, we check * READY_N and update READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); port->sm_mux_state = AD_MUX_DETACHED; break; } /* check if the wait_while_timer expired */ if (port->sm_mux_timer_counter && !(--port->sm_mux_timer_counter)) port->sm_vars |= AD_PORT_READY_N; /* in order to withhold the selection logic to check * all ports READY_N value every callback cycle to * update ready variable, we check READY_N and update * READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); /* if the wait_while_timer expired, and the port is * in READY state, move to ATTACHED state */ if ((port->sm_vars & AD_PORT_READY) && !port->sm_mux_timer_counter) port->sm_mux_state = AD_MUX_ATTACHED; break; case AD_MUX_ATTACHED: /* check also if agg_select_timer expired (so the * edable port will take place only after this timer) */ if ((port->sm_vars & AD_PORT_SELECTED) && (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) && !__check_agg_selection_timer(port)) { if (port->aggregator->is_active) { int state = AD_MUX_COLLECTING_DISTRIBUTING; if (!bond->params.coupled_control) state = AD_MUX_COLLECTING; port->sm_mux_state = state; } } else if (!(port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY)) { /* if UNSELECTED or STANDBY */ port->sm_vars &= ~AD_PORT_READY_N; /* in order to withhold the selection logic to * check all ports READY_N value every callback * cycle to update ready variable, we check * READY_N and update READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); port->sm_mux_state = AD_MUX_DETACHED; } else if (port->aggregator->is_active) { port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; } break; case AD_MUX_COLLECTING_DISTRIBUTING: if (!__port_move_to_attached_state(port)) { /* if port state hasn't changed make * sure that a collecting distributing * port in an active aggregator is enabled */ if (port->aggregator->is_active && !__port_is_collecting_distributing(port)) { __enable_port(port); *update_slave_arr = true; } } break; case AD_MUX_COLLECTING: if (!__port_move_to_attached_state(port)) { if ((port->sm_vars & AD_PORT_SELECTED) && (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) && (port->partner_oper.port_state & LACP_STATE_COLLECTING)) { port->sm_mux_state = AD_MUX_DISTRIBUTING; } else { /* If port state hasn't changed, make sure that a collecting * port is enabled for an active aggregator. */ struct slave *slave = port->slave; if (port->aggregator->is_active && bond_is_slave_rx_disabled(slave)) { ad_enable_collecting(port); *update_slave_arr = true; } } } break; case AD_MUX_DISTRIBUTING: if (!(port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY) || !(port->partner_oper.port_state & LACP_STATE_COLLECTING) || !(port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) || !(port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) { port->sm_mux_state = AD_MUX_COLLECTING; } else { /* if port state hasn't changed make * sure that a collecting distributing * port in an active aggregator is enabled */ if (port->aggregator && port->aggregator->is_active && !__port_is_collecting_distributing(port)) { __enable_port(port); *update_slave_arr = true; } } break; default: break; } } /* check if the state machine was changed */ if (port->sm_mux_state != last_state) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Mux Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_mux_state); switch (port->sm_mux_state) { case AD_MUX_DETACHED: port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; ad_disable_collecting_distributing(port, update_slave_arr); port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; port->ntt = true; break; case AD_MUX_WAITING: port->sm_mux_timer_counter = __ad_timer_to_ticks(AD_WAIT_WHILE_TIMER, 0); break; case AD_MUX_ATTACHED: if (port->aggregator->is_active) port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; else port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; ad_disable_collecting_distributing(port, update_slave_arr); port->ntt = true; break; case AD_MUX_COLLECTING_DISTRIBUTING: port->actor_oper_port_state |= LACP_STATE_COLLECTING; port->actor_oper_port_state |= LACP_STATE_DISTRIBUTING; port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; ad_enable_collecting_distributing(port, update_slave_arr); port->ntt = true; break; case AD_MUX_COLLECTING: port->actor_oper_port_state |= LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; ad_enable_collecting(port); ad_disable_distributing(port, update_slave_arr); port->ntt = true; break; case AD_MUX_DISTRIBUTING: port->actor_oper_port_state |= LACP_STATE_DISTRIBUTING; port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; ad_enable_collecting_distributing(port, update_slave_arr); break; default: break; } } } /** * ad_rx_machine - handle a port's rx State Machine * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * If lacpdu arrived, stop previous timer (if exists) and set the next state as * CURRENT. If timer expired set the state machine in the proper state. * In other cases, this function checks if we need to switch to other state. */ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) { rx_states_t last_state; /* keep current State Machine state to compare later if it was * changed */ last_state = port->sm_rx_state; if (lacpdu) { atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.lacpdu_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.lacpdu_rx); } /* check if state machine should change state */ /* first, check if port was reinitialized */ if (port->sm_vars & AD_PORT_BEGIN) { port->sm_rx_state = AD_RX_INITIALIZE; port->sm_vars |= AD_PORT_CHURNED; /* check if port is not enabled */ } else if (!(port->sm_vars & AD_PORT_BEGIN) && !port->is_enabled) port->sm_rx_state = AD_RX_PORT_DISABLED; /* check if new lacpdu arrived */ else if (lacpdu && ((port->sm_rx_state == AD_RX_EXPIRED) || (port->sm_rx_state == AD_RX_DEFAULTED) || (port->sm_rx_state == AD_RX_CURRENT))) { if (port->sm_rx_state != AD_RX_CURRENT) port->sm_vars |= AD_PORT_CHURNED; port->sm_rx_timer_counter = 0; port->sm_rx_state = AD_RX_CURRENT; } else { /* if timer is on, and if it is expired */ if (port->sm_rx_timer_counter && !(--port->sm_rx_timer_counter)) { switch (port->sm_rx_state) { case AD_RX_EXPIRED: port->sm_rx_state = AD_RX_DEFAULTED; break; case AD_RX_CURRENT: port->sm_rx_state = AD_RX_EXPIRED; break; default: break; } } else { /* if no lacpdu arrived and no timer is on */ switch (port->sm_rx_state) { case AD_RX_PORT_DISABLED: if (port->is_enabled && (port->sm_vars & AD_PORT_LACP_ENABLED)) port->sm_rx_state = AD_RX_EXPIRED; else if (port->is_enabled && ((port->sm_vars & AD_PORT_LACP_ENABLED) == 0)) port->sm_rx_state = AD_RX_LACP_DISABLED; break; default: break; } } } /* check if the State machine was changed or new lacpdu arrived */ if ((port->sm_rx_state != last_state) || (lacpdu)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Rx Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_rx_state); switch (port->sm_rx_state) { case AD_RX_INITIALIZE: if (!(port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS)) port->sm_vars &= ~AD_PORT_LACP_ENABLED; else port->sm_vars |= AD_PORT_LACP_ENABLED; port->sm_vars &= ~AD_PORT_SELECTED; __record_default(port); port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; port->sm_rx_state = AD_RX_PORT_DISABLED; fallthrough; case AD_RX_PORT_DISABLED: port->sm_vars &= ~AD_PORT_MATCHED; break; case AD_RX_LACP_DISABLED: port->sm_vars &= ~AD_PORT_SELECTED; __record_default(port); port->partner_oper.port_state &= ~LACP_STATE_AGGREGATION; port->sm_vars |= AD_PORT_MATCHED; port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; case AD_RX_EXPIRED: /* Reset of the Synchronization flag (Standard 43.4.12) * This reset cause to disable this port in the * COLLECTING_DISTRIBUTING state of the mux machine in * case of EXPIRED even if LINK_DOWN didn't arrive for * the port. */ port->sm_vars &= ~AD_PORT_MATCHED; /* Based on IEEE 8021AX-2014, Figure 6-18 - Receive * machine state diagram, the statue should be * Partner_Oper_Port_State.Synchronization = FALSE; * Partner_Oper_Port_State.LACP_Timeout = Short Timeout; * start current_while_timer(Short Timeout); * Actor_Oper_Port_State.Expired = TRUE; */ port->partner_oper.port_state &= ~LACP_STATE_SYNCHRONIZATION; port->partner_oper.port_state |= LACP_STATE_LACP_TIMEOUT; port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(AD_SHORT_TIMEOUT)); port->actor_oper_port_state |= LACP_STATE_EXPIRED; port->sm_vars |= AD_PORT_CHURNED; break; case AD_RX_DEFAULTED: __update_default_selected(port); __record_default(port); port->sm_vars |= AD_PORT_MATCHED; port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; case AD_RX_CURRENT: /* detect loopback situation */ if (MAC_ADDRESS_EQUAL(&(lacpdu->actor_system), &(port->actor_system))) { slave_err(port->slave->bond->dev, port->slave->dev, "An illegal loopback occurred on slave\n" "Check the configuration to verify that all adapters are connected to 802.3ad compliant switch ports\n"); return; } __update_selected(lacpdu, port); __update_ntt(lacpdu, port); __record_pdu(lacpdu, port); port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(port->actor_oper_port_state & LACP_STATE_LACP_TIMEOUT)); port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; default: break; } } } /** * ad_churn_machine - handle port churn's state machine * @port: the port we're looking at * */ static void ad_churn_machine(struct port *port) { if (port->sm_vars & AD_PORT_CHURNED) { port->sm_vars &= ~AD_PORT_CHURNED; port->sm_churn_actor_state = AD_CHURN_MONITOR; port->sm_churn_partner_state = AD_CHURN_MONITOR; port->sm_churn_actor_timer_counter = __ad_timer_to_ticks(AD_ACTOR_CHURN_TIMER, 0); port->sm_churn_partner_timer_counter = __ad_timer_to_ticks(AD_PARTNER_CHURN_TIMER, 0); return; } if (port->sm_churn_actor_timer_counter && !(--port->sm_churn_actor_timer_counter) && port->sm_churn_actor_state == AD_CHURN_MONITOR) { if (port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION) { port->sm_churn_actor_state = AD_NO_CHURN; } else { port->churn_actor_count++; port->sm_churn_actor_state = AD_CHURN; } } if (port->sm_churn_partner_timer_counter && !(--port->sm_churn_partner_timer_counter) && port->sm_churn_partner_state == AD_CHURN_MONITOR) { if (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) { port->sm_churn_partner_state = AD_NO_CHURN; } else { port->churn_partner_count++; port->sm_churn_partner_state = AD_CHURN; } } } /** * ad_tx_machine - handle a port's tx state machine * @port: the port we're looking at */ static void ad_tx_machine(struct port *port) { /* check if tx timer expired, to verify that we do not send more than * 3 packets per second */ if (!port->sm_tx_timer_counter || !(--port->sm_tx_timer_counter)) { /* check if there is something to send */ if (port->ntt && (port->sm_vars & AD_PORT_LACP_ENABLED)) { __update_lacpdu_from_port(port); if (ad_lacpdu_send(port) >= 0) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Sent LACPDU on port %d\n", port->actor_port_number); /* mark ntt as false, so it will not be sent * again until demanded */ port->ntt = false; /* restart tx timer(to verify that we will not * exceed AD_MAX_TX_IN_SECOND */ port->sm_tx_timer_counter = ad_ticks_per_sec / AD_MAX_TX_IN_SECOND; } } } } /** * ad_periodic_machine - handle a port's periodic state machine * @port: the port we're looking at * * Turn ntt flag on priodically to perform periodic transmission of lacpdu's. */ static void ad_periodic_machine(struct port *port) { periodic_states_t last_state; /* keep current state machine state to compare later if it was changed */ last_state = port->sm_periodic_state; /* check if port was reinitialized */ if (((port->sm_vars & AD_PORT_BEGIN) || !(port->sm_vars & AD_PORT_LACP_ENABLED) || !port->is_enabled) || (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY))) { port->sm_periodic_state = AD_NO_PERIODIC; } /* check if state machine should change state */ else if (port->sm_periodic_timer_counter) { /* check if periodic state machine expired */ if (!(--port->sm_periodic_timer_counter)) { /* if expired then do tx */ port->sm_periodic_state = AD_PERIODIC_TX; } else { /* If not expired, check if there is some new timeout * parameter from the partner state */ switch (port->sm_periodic_state) { case AD_FAST_PERIODIC: if (!(port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) port->sm_periodic_state = AD_SLOW_PERIODIC; break; case AD_SLOW_PERIODIC: if ((port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) { port->sm_periodic_timer_counter = 0; port->sm_periodic_state = AD_PERIODIC_TX; } break; default: break; } } } else { switch (port->sm_periodic_state) { case AD_NO_PERIODIC: port->sm_periodic_state = AD_FAST_PERIODIC; break; case AD_PERIODIC_TX: if (!(port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) port->sm_periodic_state = AD_SLOW_PERIODIC; else port->sm_periodic_state = AD_FAST_PERIODIC; break; default: break; } } /* check if the state machine was changed */ if (port->sm_periodic_state != last_state) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Periodic Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_periodic_state); switch (port->sm_periodic_state) { case AD_NO_PERIODIC: port->sm_periodic_timer_counter = 0; break; case AD_FAST_PERIODIC: /* decrement 1 tick we lost in the PERIODIC_TX cycle */ port->sm_periodic_timer_counter = __ad_timer_to_ticks(AD_PERIODIC_TIMER, (u16)(AD_FAST_PERIODIC_TIME))-1; break; case AD_SLOW_PERIODIC: /* decrement 1 tick we lost in the PERIODIC_TX cycle */ port->sm_periodic_timer_counter = __ad_timer_to_ticks(AD_PERIODIC_TIMER, (u16)(AD_SLOW_PERIODIC_TIME))-1; break; case AD_PERIODIC_TX: port->ntt = true; break; default: break; } } } /** * ad_port_selection_logic - select aggregation groups * @port: the port we're looking at * @update_slave_arr: Does slave array need update? * * Select aggregation groups, and assign each port for it's aggregetor. The * selection logic is called in the inititalization (after all the handshkes), * and after every lacpdu receive (if selected is off). */ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) { struct aggregator *aggregator, *free_aggregator = NULL, *temp_aggregator; struct port *last_port = NULL, *curr_port; struct list_head *iter; struct bonding *bond; struct slave *slave; int found = 0; /* if the port is already Selected, do nothing */ if (port->sm_vars & AD_PORT_SELECTED) return; bond = __get_bond_by_port(port); /* if the port is connected to other aggregator, detach it */ if (port->aggregator) { /* detach the port from its former aggregator */ temp_aggregator = port->aggregator; for (curr_port = temp_aggregator->lag_ports; curr_port; last_port = curr_port, curr_port = curr_port->next_port_in_aggregator) { if (curr_port == port) { temp_aggregator->num_of_ports--; /* if it is the first port attached to the * aggregator */ if (!last_port) { temp_aggregator->lag_ports = port->next_port_in_aggregator; } else { /* not the first port attached to the * aggregator */ last_port->next_port_in_aggregator = port->next_port_in_aggregator; } /* clear the port's relations to this * aggregator */ port->aggregator = NULL; port->next_port_in_aggregator = NULL; port->actor_port_aggregator_identifier = 0; slave_dbg(bond->dev, port->slave->dev, "Port %d left LAG %d\n", port->actor_port_number, temp_aggregator->aggregator_identifier); /* if the aggregator is empty, clear its * parameters, and set it ready to be attached */ if (!temp_aggregator->lag_ports) ad_clear_agg(temp_aggregator); break; } } if (!curr_port) { /* meaning: the port was related to an aggregator * but was not on the aggregator port list */ net_warn_ratelimited("%s: (slave %s): Warning: Port %d was related to aggregator %d but was not on its port list\n", port->slave->bond->dev->name, port->slave->dev->name, port->actor_port_number, port->aggregator->aggregator_identifier); } } /* search on all aggregators for a suitable aggregator for this port */ bond_for_each_slave(bond, slave, iter) { aggregator = &(SLAVE_AD_INFO(slave)->aggregator); /* keep a free aggregator for later use(if needed) */ if (!aggregator->lag_ports) { if (!free_aggregator) free_aggregator = aggregator; continue; } /* check if current aggregator suits us */ if (((aggregator->actor_oper_aggregator_key == port->actor_oper_port_key) && /* if all parameters match AND */ MAC_ADDRESS_EQUAL(&(aggregator->partner_system), &(port->partner_oper.system)) && (aggregator->partner_system_priority == port->partner_oper.system_priority) && (aggregator->partner_oper_aggregator_key == port->partner_oper.key) ) && ((__agg_has_partner(aggregator) && /* partner answers */ !aggregator->is_individual) /* but is not individual OR */ ) ) { /* attach to the founded aggregator */ port->aggregator = aggregator; port->actor_port_aggregator_identifier = port->aggregator->aggregator_identifier; port->next_port_in_aggregator = aggregator->lag_ports; port->aggregator->num_of_ports++; aggregator->lag_ports = port; slave_dbg(bond->dev, slave->dev, "Port %d joined LAG %d (existing LAG)\n", port->actor_port_number, port->aggregator->aggregator_identifier); /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; found = 1; break; } } /* the port couldn't find an aggregator - attach it to a new * aggregator */ if (!found) { if (free_aggregator) { /* assign port a new aggregator */ port->aggregator = free_aggregator; port->actor_port_aggregator_identifier = port->aggregator->aggregator_identifier; /* update the new aggregator's parameters * if port was responsed from the end-user */ if (port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS) /* if port is full duplex */ port->aggregator->is_individual = false; else port->aggregator->is_individual = true; port->aggregator->actor_admin_aggregator_key = port->actor_admin_port_key; port->aggregator->actor_oper_aggregator_key = port->actor_oper_port_key; port->aggregator->partner_system = port->partner_oper.system; port->aggregator->partner_system_priority = port->partner_oper.system_priority; port->aggregator->partner_oper_aggregator_key = port->partner_oper.key; port->aggregator->receive_state = 1; port->aggregator->transmit_state = 1; port->aggregator->lag_ports = port; port->aggregator->num_of_ports++; /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; slave_dbg(bond->dev, port->slave->dev, "Port %d joined LAG %d (new LAG)\n", port->actor_port_number, port->aggregator->aggregator_identifier); } else { slave_err(bond->dev, port->slave->dev, "Port %d did not find a suitable aggregator\n", port->actor_port_number); return; } } /* if all aggregator's ports are READY_N == TRUE, set ready=TRUE * in all aggregator's ports, else set ready=FALSE in all * aggregator's ports */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); aggregator = __get_first_agg(port); ad_agg_selection_logic(aggregator, update_slave_arr); if (!port->aggregator->is_active) port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; } /* Decide if "agg" is a better choice for the new active aggregator that * the current best, according to the ad_select policy. */ static struct aggregator *ad_agg_selection_test(struct aggregator *best, struct aggregator *curr) { /* 0. If no best, select current. * * 1. If the current agg is not individual, and the best is * individual, select current. * * 2. If current agg is individual and the best is not, keep best. * * 3. Therefore, current and best are both individual or both not * individual, so: * * 3a. If current agg partner replied, and best agg partner did not, * select current. * * 3b. If current agg partner did not reply and best agg partner * did reply, keep best. * * 4. Therefore, current and best both have partner replies or * both do not, so perform selection policy: * * BOND_AD_PRIO: Select by total priority of ports. If priority * is equal, select by count. * * BOND_AD_COUNT: Select by count of ports. If count is equal, * select by bandwidth. * * BOND_AD_STABLE, BOND_AD_BANDWIDTH: Select by bandwidth. */ if (!best) return curr; if (!curr->is_individual && best->is_individual) return curr; if (curr->is_individual && !best->is_individual) return best; if (__agg_has_partner(curr) && !__agg_has_partner(best)) return curr; if (!__agg_has_partner(curr) && __agg_has_partner(best)) return best; switch (__get_agg_selection_mode(curr->lag_ports)) { case BOND_AD_PRIO: if (__agg_ports_priority(curr) > __agg_ports_priority(best)) return curr; if (__agg_ports_priority(curr) < __agg_ports_priority(best)) return best; fallthrough; case BOND_AD_COUNT: if (__agg_active_ports(curr) > __agg_active_ports(best)) return curr; if (__agg_active_ports(curr) < __agg_active_ports(best)) return best; fallthrough; case BOND_AD_STABLE: case BOND_AD_BANDWIDTH: if (__get_agg_bandwidth(curr) > __get_agg_bandwidth(best)) return curr; break; default: net_warn_ratelimited("%s: (slave %s): Impossible agg select mode %d\n", curr->slave->bond->dev->name, curr->slave->dev->name, __get_agg_selection_mode(curr->lag_ports)); break; } return best; } static int agg_device_up(const struct aggregator *agg) { struct port *port = agg->lag_ports; if (!port) return 0; for (port = agg->lag_ports; port; port = port->next_port_in_aggregator) { if (netif_running(port->slave->dev) && netif_carrier_ok(port->slave->dev)) return 1; } return 0; } /** * ad_agg_selection_logic - select an aggregation group for a team * @agg: the aggregator we're looking at * @update_slave_arr: Does slave array need update? * * It is assumed that only one aggregator may be selected for a team. * * The logic of this function is to select the aggregator according to * the ad_select policy: * * BOND_AD_STABLE: select the aggregator with the most ports attached to * it, and to reselect the active aggregator only if the previous * aggregator has no more ports related to it. * * BOND_AD_BANDWIDTH: select the aggregator with the highest total * bandwidth, and reselect whenever a link state change takes place or the * set of slaves in the bond changes. * * BOND_AD_COUNT: select the aggregator with largest number of ports * (slaves), and reselect whenever a link state change takes place or the * set of slaves in the bond changes. * * BOND_AD_PRIO: select the aggregator with highest total priority of ports * (slaves), and reselect whenever a link state change takes place or the * set of slaves in the bond changes. * * FIXME: this function MUST be called with the first agg in the bond, or * __get_active_agg() won't work correctly. This function should be better * called with the bond itself, and retrieve the first agg from it. */ static void ad_agg_selection_logic(struct aggregator *agg, bool *update_slave_arr) { struct aggregator *best, *active, *origin; struct bonding *bond = agg->slave->bond; struct list_head *iter; struct slave *slave; struct port *port; rcu_read_lock(); origin = agg; active = __get_active_agg(agg); best = (active && agg_device_up(active)) ? active : NULL; bond_for_each_slave_rcu(bond, slave, iter) { agg = &(SLAVE_AD_INFO(slave)->aggregator); agg->is_active = 0; if (__agg_active_ports(agg) && agg_device_up(agg)) best = ad_agg_selection_test(best, agg); } if (best && __get_agg_selection_mode(best->lag_ports) == BOND_AD_STABLE) { /* For the STABLE policy, don't replace the old active * aggregator if it's still active (it has an answering * partner) or if both the best and active don't have an * answering partner. */ if (active && active->lag_ports && __agg_active_ports(active) && (__agg_has_partner(active) || (!__agg_has_partner(active) && !__agg_has_partner(best)))) { if (!(!active->actor_oper_aggregator_key && best->actor_oper_aggregator_key)) { best = NULL; active->is_active = 1; } } } if (best && (best == active)) { best = NULL; active->is_active = 1; } /* if there is new best aggregator, activate it */ if (best) { netdev_dbg(bond->dev, "(slave %s): best Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier, best->num_of_ports, best->actor_oper_aggregator_key, best->partner_oper_aggregator_key, best->is_individual, best->is_active); netdev_dbg(bond->dev, "(slave %s): best ports %p slave %p\n", best->slave ? best->slave->dev->name : "NULL", best->lag_ports, best->slave); bond_for_each_slave_rcu(bond, slave, iter) { agg = &(SLAVE_AD_INFO(slave)->aggregator); slave_dbg(bond->dev, slave->dev, "Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", agg->aggregator_identifier, agg->num_of_ports, agg->actor_oper_aggregator_key, agg->partner_oper_aggregator_key, agg->is_individual, agg->is_active); } /* check if any partner replies */ if (best->is_individual) net_warn_ratelimited("%s: Warning: No 802.3ad response from the link partner for any adapters in the bond\n", bond->dev->name); best->is_active = 1; netdev_dbg(bond->dev, "(slave %s): LAG %d chosen as the active LAG\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier); netdev_dbg(bond->dev, "(slave %s): Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier, best->num_of_ports, best->actor_oper_aggregator_key, best->partner_oper_aggregator_key, best->is_individual, best->is_active); /* disable the ports that were related to the former * active_aggregator */ if (active) { for (port = active->lag_ports; port; port = port->next_port_in_aggregator) { __disable_port(port); } } /* Slave array needs update. */ *update_slave_arr = true; } /* if the selected aggregator is of join individuals * (partner_system is NULL), enable their ports */ active = __get_active_agg(origin); if (active) { if (!__agg_has_partner(active)) { for (port = active->lag_ports; port; port = port->next_port_in_aggregator) { __enable_port(port); } *update_slave_arr = true; } } rcu_read_unlock(); bond_3ad_set_carrier(bond); } /** * ad_clear_agg - clear a given aggregator's parameters * @aggregator: the aggregator we're looking at */ static void ad_clear_agg(struct aggregator *aggregator) { if (aggregator) { aggregator->is_individual = false; aggregator->actor_admin_aggregator_key = 0; aggregator->actor_oper_aggregator_key = 0; eth_zero_addr(aggregator->partner_system.mac_addr_value); aggregator->partner_system_priority = 0; aggregator->partner_oper_aggregator_key = 0; aggregator->receive_state = 0; aggregator->transmit_state = 0; aggregator->lag_ports = NULL; aggregator->is_active = 0; aggregator->num_of_ports = 0; pr_debug("%s: LAG %d was cleared\n", aggregator->slave ? aggregator->slave->dev->name : "NULL", aggregator->aggregator_identifier); } } /** * ad_initialize_agg - initialize a given aggregator's parameters * @aggregator: the aggregator we're looking at */ static void ad_initialize_agg(struct aggregator *aggregator) { if (aggregator) { ad_clear_agg(aggregator); eth_zero_addr(aggregator->aggregator_mac_address.mac_addr_value); aggregator->aggregator_identifier = 0; aggregator->slave = NULL; } } /** * ad_initialize_port - initialize a given port's parameters * @port: the port we're looking at * @bond_params: bond parameters we will use */ static void ad_initialize_port(struct port *port, const struct bond_params *bond_params) { static const struct port_params tmpl = { .system_priority = 0xffff, .key = 1, .port_number = 1, .port_priority = 0xff, .port_state = 0, }; static const struct lacpdu lacpdu = { .subtype = 0x01, .version_number = 0x01, .tlv_type_actor_info = 0x01, .actor_information_length = 0x14, .tlv_type_partner_info = 0x02, .partner_information_length = 0x14, .tlv_type_collector_info = 0x03, .collector_information_length = 0x10, .collector_max_delay = htons(AD_COLLECTOR_MAX_DELAY), }; if (port) { port->actor_port_priority = 0xff; port->actor_port_aggregator_identifier = 0; port->ntt = false; port->actor_admin_port_state = LACP_STATE_AGGREGATION; port->actor_oper_port_state = LACP_STATE_AGGREGATION; if (bond_params->lacp_active) { port->actor_admin_port_state |= LACP_STATE_LACP_ACTIVITY; port->actor_oper_port_state |= LACP_STATE_LACP_ACTIVITY; } if (bond_params->lacp_fast) port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT; memcpy(&port->partner_admin, &tmpl, sizeof(tmpl)); memcpy(&port->partner_oper, &tmpl, sizeof(tmpl)); port->is_enabled = true; /* private parameters */ port->sm_vars = AD_PORT_BEGIN | AD_PORT_LACP_ENABLED; port->sm_rx_state = 0; port->sm_rx_timer_counter = 0; port->sm_periodic_state = 0; port->sm_periodic_timer_counter = 0; port->sm_mux_state = 0; port->sm_mux_timer_counter = 0; port->sm_tx_state = 0; port->aggregator = NULL; port->next_port_in_aggregator = NULL; port->transaction_id = 0; port->sm_churn_actor_timer_counter = 0; port->sm_churn_actor_state = 0; port->churn_actor_count = 0; port->sm_churn_partner_timer_counter = 0; port->sm_churn_partner_state = 0; port->churn_partner_count = 0; memcpy(&port->lacpdu, &lacpdu, sizeof(lacpdu)); } } /** * ad_enable_collecting - enable a port's receive * @port: the port we're looking at * * Enable @port if it's in an active aggregator */ static void ad_enable_collecting(struct port *port) { if (port->aggregator->is_active) { struct slave *slave = port->slave; slave_dbg(slave->bond->dev, slave->dev, "Enabling collecting on port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __enable_collecting_port(port); } } /** * ad_disable_distributing - disable a port's transmit * @port: the port we're looking at * @update_slave_arr: Does slave array need update? */ static void ad_disable_distributing(struct port *port, bool *update_slave_arr) { if (port->aggregator && __agg_has_partner(port->aggregator)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling distributing on port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __disable_distributing_port(port); /* Slave array needs an update */ *update_slave_arr = true; } } /** * ad_enable_collecting_distributing - enable a port's transmit/receive * @port: the port we're looking at * @update_slave_arr: Does slave array need update? * * Enable @port if it's in an active aggregator */ static void ad_enable_collecting_distributing(struct port *port, bool *update_slave_arr) { if (port->aggregator->is_active) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Enabling port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __enable_port(port); /* Slave array needs update */ *update_slave_arr = true; /* Should notify peers if possible */ ad_cond_set_peer_notif(port); } } /** * ad_disable_collecting_distributing - disable a port's transmit/receive * @port: the port we're looking at * @update_slave_arr: Does slave array need update? */ static void ad_disable_collecting_distributing(struct port *port, bool *update_slave_arr) { if (port->aggregator && __agg_has_partner(port->aggregator)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __disable_port(port); /* Slave array needs an update */ *update_slave_arr = true; } } /** * ad_marker_info_received - handle receive of a Marker information frame * @marker_info: Marker info received * @port: the port we're looking at */ static void ad_marker_info_received(struct bond_marker *marker_info, struct port *port) { struct bond_marker marker; atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.marker_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.marker_rx); /* copy the received marker data to the response marker */ memcpy(&marker, marker_info, sizeof(struct bond_marker)); /* change the marker subtype to marker response */ marker.tlv_type = AD_MARKER_RESPONSE_SUBTYPE; /* send the marker response */ if (ad_marker_send(port, &marker) >= 0) slave_dbg(port->slave->bond->dev, port->slave->dev, "Sent Marker Response on port %d\n", port->actor_port_number); } /** * ad_marker_response_received - handle receive of a marker response frame * @marker: marker PDU received * @port: the port we're looking at * * This function does nothing since we decided not to implement send and handle * response for marker PDU's, in this stage, but only to respond to marker * information. */ static void ad_marker_response_received(struct bond_marker *marker, struct port *port) { atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.marker_resp_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.marker_resp_rx); /* DO NOTHING, SINCE WE DECIDED NOT TO IMPLEMENT THIS FEATURE FOR NOW */ } /* ========= AD exported functions to the main bonding code ========= */ /* Check aggregators status in team every T seconds */ #define AD_AGGREGATOR_SELECTION_TIMER 8 /** * bond_3ad_initiate_agg_selection - initate aggregator selection * @bond: bonding struct * @timeout: timeout value to set * * Set the aggregation selection timer, to initiate an agg selection in * the very near future. Called during first initialization, and during * any down to up transitions of the bond. */ void bond_3ad_initiate_agg_selection(struct bonding *bond, int timeout) { atomic_set(&BOND_AD_INFO(bond).agg_select_timer, timeout); } /** * bond_3ad_initialize - initialize a bond's 802.3ad parameters and structures * @bond: bonding struct to work on * * Can be called only after the mac address of the bond is set. */ void bond_3ad_initialize(struct bonding *bond) { BOND_AD_INFO(bond).aggregator_identifier = 0; BOND_AD_INFO(bond).system.sys_priority = bond->params.ad_actor_sys_prio; if (is_zero_ether_addr(bond->params.ad_actor_system)) BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->dev->dev_addr); else BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->params.ad_actor_system); bond_3ad_initiate_agg_selection(bond, AD_AGGREGATOR_SELECTION_TIMER * ad_ticks_per_sec); } /** * bond_3ad_bind_slave - initialize a slave's port * @slave: slave struct to work on * * Returns: 0 on success * < 0 on error */ void bond_3ad_bind_slave(struct slave *slave) { struct bonding *bond = bond_get_bond_by_slave(slave); struct port *port; struct aggregator *aggregator; /* check that the slave has not been initialized yet. */ if (SLAVE_AD_INFO(slave)->port.slave != slave) { /* port initialization */ port = &(SLAVE_AD_INFO(slave)->port); ad_initialize_port(port, &bond->params); /* Port priority is initialized. Update it to slave's ad info */ SLAVE_AD_INFO(slave)->port_priority = port->actor_port_priority; port->slave = slave; port->actor_port_number = SLAVE_AD_INFO(slave)->id; /* key is determined according to the link speed, duplex and * user key */ port->actor_admin_port_key = bond->params.ad_user_port_key << 6; ad_update_actor_keys(port, false); /* actor system is the bond's system */ __ad_actor_update_port(port); /* tx timer(to verify that no more than MAX_TX_IN_SECOND * lacpdu's are sent in one second) */ port->sm_tx_timer_counter = ad_ticks_per_sec/AD_MAX_TX_IN_SECOND; __disable_port(port); /* aggregator initialization */ aggregator = &(SLAVE_AD_INFO(slave)->aggregator); ad_initialize_agg(aggregator); aggregator->aggregator_mac_address = *((struct mac_addr *)bond->dev->dev_addr); aggregator->aggregator_identifier = ++BOND_AD_INFO(bond).aggregator_identifier; aggregator->slave = slave; aggregator->is_active = 0; aggregator->num_of_ports = 0; } } /** * bond_3ad_unbind_slave - deinitialize a slave's port * @slave: slave struct to work on * * Search for the aggregator that is related to this port, remove the * aggregator and assign another aggregator for other port related to it * (if any), and remove the port. */ void bond_3ad_unbind_slave(struct slave *slave) { struct port *port, *prev_port, *temp_port; struct aggregator *aggregator, *new_aggregator, *temp_aggregator; int select_new_active_agg = 0; struct bonding *bond = slave->bond; struct slave *slave_iter; struct list_head *iter; bool dummy_slave_update; /* Ignore this value as caller updates array */ /* Sync against bond_3ad_state_machine_handler() */ spin_lock_bh(&bond->mode_lock); aggregator = &(SLAVE_AD_INFO(slave)->aggregator); port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(bond->dev, slave->dev, "Trying to unbind an uninitialized port\n"); goto out; } slave_dbg(bond->dev, slave->dev, "Unbinding Link Aggregation Group %d\n", aggregator->aggregator_identifier); /* Tell the partner that this port is not suitable for aggregation */ port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; port->actor_oper_port_state &= ~LACP_STATE_AGGREGATION; __update_lacpdu_from_port(port); ad_lacpdu_send(port); /* check if this aggregator is occupied */ if (aggregator->lag_ports) { /* check if there are other ports related to this aggregator * except the port related to this slave(thats ensure us that * there is a reason to search for new aggregator, and that we * will find one */ if ((aggregator->lag_ports != port) || (aggregator->lag_ports->next_port_in_aggregator)) { /* find new aggregator for the related port(s) */ bond_for_each_slave(bond, slave_iter, iter) { new_aggregator = &(SLAVE_AD_INFO(slave_iter)->aggregator); /* if the new aggregator is empty, or it is * connected to our port only */ if (!new_aggregator->lag_ports || ((new_aggregator->lag_ports == port) && !new_aggregator->lag_ports->next_port_in_aggregator)) break; } if (!slave_iter) new_aggregator = NULL; /* if new aggregator found, copy the aggregator's * parameters and connect the related lag_ports to the * new aggregator */ if ((new_aggregator) && ((!new_aggregator->lag_ports) || ((new_aggregator->lag_ports == port) && !new_aggregator->lag_ports->next_port_in_aggregator))) { slave_dbg(bond->dev, slave->dev, "Some port(s) related to LAG %d - replacing with LAG %d\n", aggregator->aggregator_identifier, new_aggregator->aggregator_identifier); if ((new_aggregator->lag_ports == port) && new_aggregator->is_active) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); select_new_active_agg = 1; } new_aggregator->is_individual = aggregator->is_individual; new_aggregator->actor_admin_aggregator_key = aggregator->actor_admin_aggregator_key; new_aggregator->actor_oper_aggregator_key = aggregator->actor_oper_aggregator_key; new_aggregator->partner_system = aggregator->partner_system; new_aggregator->partner_system_priority = aggregator->partner_system_priority; new_aggregator->partner_oper_aggregator_key = aggregator->partner_oper_aggregator_key; new_aggregator->receive_state = aggregator->receive_state; new_aggregator->transmit_state = aggregator->transmit_state; new_aggregator->lag_ports = aggregator->lag_ports; new_aggregator->is_active = aggregator->is_active; new_aggregator->num_of_ports = aggregator->num_of_ports; /* update the information that is written on * the ports about the aggregator */ for (temp_port = aggregator->lag_ports; temp_port; temp_port = temp_port->next_port_in_aggregator) { temp_port->aggregator = new_aggregator; temp_port->actor_port_aggregator_identifier = new_aggregator->aggregator_identifier; } ad_clear_agg(aggregator); if (select_new_active_agg) ad_agg_selection_logic(__get_first_agg(port), &dummy_slave_update); } else { slave_warn(bond->dev, slave->dev, "unbinding aggregator, and could not find a new aggregator for its ports\n"); } } else { /* in case that the only port related to this * aggregator is the one we want to remove */ select_new_active_agg = aggregator->is_active; ad_clear_agg(aggregator); if (select_new_active_agg) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); /* select new active aggregator */ temp_aggregator = __get_first_agg(port); if (temp_aggregator) ad_agg_selection_logic(temp_aggregator, &dummy_slave_update); } } } slave_dbg(bond->dev, slave->dev, "Unbinding port %d\n", port->actor_port_number); /* find the aggregator that this port is connected to */ bond_for_each_slave(bond, slave_iter, iter) { temp_aggregator = &(SLAVE_AD_INFO(slave_iter)->aggregator); prev_port = NULL; /* search the port in the aggregator's related ports */ for (temp_port = temp_aggregator->lag_ports; temp_port; prev_port = temp_port, temp_port = temp_port->next_port_in_aggregator) { if (temp_port == port) { /* the aggregator found - detach the port from * this aggregator */ if (prev_port) prev_port->next_port_in_aggregator = temp_port->next_port_in_aggregator; else temp_aggregator->lag_ports = temp_port->next_port_in_aggregator; temp_aggregator->num_of_ports--; if (__agg_active_ports(temp_aggregator) == 0) { select_new_active_agg = temp_aggregator->is_active; if (temp_aggregator->num_of_ports == 0) ad_clear_agg(temp_aggregator); if (select_new_active_agg) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); /* select new active aggregator */ ad_agg_selection_logic(__get_first_agg(port), &dummy_slave_update); } } break; } } } port->slave = NULL; out: spin_unlock_bh(&bond->mode_lock); } /** * bond_3ad_update_ad_actor_settings - reflect change of actor settings to ports * @bond: bonding struct to work on * * If an ad_actor setting gets changed we need to update the individual port * settings so the bond device will use the new values when it gets upped. */ void bond_3ad_update_ad_actor_settings(struct bonding *bond) { struct list_head *iter; struct slave *slave; ASSERT_RTNL(); BOND_AD_INFO(bond).system.sys_priority = bond->params.ad_actor_sys_prio; if (is_zero_ether_addr(bond->params.ad_actor_system)) BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->dev->dev_addr); else BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->params.ad_actor_system); spin_lock_bh(&bond->mode_lock); bond_for_each_slave(bond, slave, iter) { struct port *port = &(SLAVE_AD_INFO(slave))->port; __ad_actor_update_port(port); port->ntt = true; } spin_unlock_bh(&bond->mode_lock); } /** * bond_agg_timer_advance - advance agg_select_timer * @bond: bonding structure * * Return true when agg_select_timer reaches 0. */ static bool bond_agg_timer_advance(struct bonding *bond) { int val, nval; while (1) { val = atomic_read(&BOND_AD_INFO(bond).agg_select_timer); if (!val) return false; nval = val - 1; if (atomic_cmpxchg(&BOND_AD_INFO(bond).agg_select_timer, val, nval) == val) break; } return nval == 0; } /** * bond_3ad_state_machine_handler - handle state machines timeout * @work: work context to fetch bonding struct to work on from * * The state machine handling concept in this module is to check every tick * which state machine should operate any function. The execution order is * round robin, so when we have an interaction between state machines, the * reply of one to each other might be delayed until next tick. * * This function also complete the initialization when the agg_select_timer * times out, and it selects an aggregator for the ports that are yet not * related to any aggregator, and selects the active aggregator for a bond. */ void bond_3ad_state_machine_handler(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, ad_work.work); struct aggregator *aggregator; struct list_head *iter; struct slave *slave; struct port *port; bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER; bool update_slave_arr = false; /* Lock to protect data accessed by all (e.g., port->sm_vars) and * against running with bond_3ad_unbind_slave. ad_rx_machine may run * concurrently due to incoming LACPDU as well. */ spin_lock_bh(&bond->mode_lock); rcu_read_lock(); /* check if there are any slaves */ if (!bond_has_slaves(bond)) goto re_arm; if (bond_agg_timer_advance(bond)) { slave = bond_first_slave_rcu(bond); port = slave ? &(SLAVE_AD_INFO(slave)->port) : NULL; /* select the active aggregator for the bond */ if (port) { if (!port->slave) { net_warn_ratelimited("%s: Warning: bond's first port is uninitialized\n", bond->dev->name); goto re_arm; } aggregator = __get_first_agg(port); ad_agg_selection_logic(aggregator, &update_slave_arr); } bond_3ad_set_carrier(bond); } /* for each port run the state machines */ bond_for_each_slave_rcu(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (!port->slave) { net_warn_ratelimited("%s: Warning: Found an uninitialized port\n", bond->dev->name); goto re_arm; } ad_rx_machine(NULL, port); ad_periodic_machine(port); ad_port_selection_logic(port, &update_slave_arr); ad_mux_machine(port, &update_slave_arr); ad_tx_machine(port); ad_churn_machine(port); /* turn off the BEGIN bit, since we already handled it */ if (port->sm_vars & AD_PORT_BEGIN) port->sm_vars &= ~AD_PORT_BEGIN; } re_arm: bond_for_each_slave_rcu(bond, slave, iter) { if (slave->should_notify) { should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW; break; } } rcu_read_unlock(); spin_unlock_bh(&bond->mode_lock); if (update_slave_arr) bond_slave_arr_work_rearm(bond, 0); if (should_notify_rtnl && rtnl_trylock()) { bond_slave_state_notify(bond); rtnl_unlock(); } queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks); } /** * bond_3ad_rx_indication - handle a received frame * @lacpdu: received lacpdu * @slave: slave struct to work on * * It is assumed that frames that were sent on this NIC don't returned as new * received frames (loopback). Since only the payload is given to this * function, it check for loopback. */ static int bond_3ad_rx_indication(struct lacpdu *lacpdu, struct slave *slave) { struct bonding *bond = slave->bond; int ret = RX_HANDLER_ANOTHER; struct bond_marker *marker; struct port *port; atomic64_t *stat; port = &(SLAVE_AD_INFO(slave)->port); if (!port->slave) { net_warn_ratelimited("%s: Warning: port of slave %s is uninitialized\n", slave->dev->name, slave->bond->dev->name); return ret; } switch (lacpdu->subtype) { case AD_TYPE_LACPDU: ret = RX_HANDLER_CONSUMED; slave_dbg(slave->bond->dev, slave->dev, "Received LACPDU on port %d\n", port->actor_port_number); /* Protect against concurrent state machines */ spin_lock(&slave->bond->mode_lock); ad_rx_machine(lacpdu, port); spin_unlock(&slave->bond->mode_lock); break; case AD_TYPE_MARKER: ret = RX_HANDLER_CONSUMED; /* No need to convert fields to Little Endian since we * don't use the marker's fields. */ marker = (struct bond_marker *)lacpdu; switch (marker->tlv_type) { case AD_MARKER_INFORMATION_SUBTYPE: slave_dbg(slave->bond->dev, slave->dev, "Received Marker Information on port %d\n", port->actor_port_number); ad_marker_info_received(marker, port); break; case AD_MARKER_RESPONSE_SUBTYPE: slave_dbg(slave->bond->dev, slave->dev, "Received Marker Response on port %d\n", port->actor_port_number); ad_marker_response_received(marker, port); break; default: slave_dbg(slave->bond->dev, slave->dev, "Received an unknown Marker subtype on port %d\n", port->actor_port_number); stat = &SLAVE_AD_INFO(slave)->stats.marker_unknown_rx; atomic64_inc(stat); stat = &BOND_AD_INFO(bond).stats.marker_unknown_rx; atomic64_inc(stat); } break; default: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_unknown_rx); atomic64_inc(&BOND_AD_INFO(bond).stats.lacpdu_unknown_rx); } return ret; } /** * ad_update_actor_keys - Update the oper / admin keys for a port based on * its current speed and duplex settings. * * @port: the port we'are looking at * @reset: Boolean to just reset the speed and the duplex part of the key * * The logic to change the oper / admin keys is: * (a) A full duplex port can participate in LACP with partner. * (b) When the speed is changed, LACP need to be reinitiated. */ static void ad_update_actor_keys(struct port *port, bool reset) { u8 duplex = 0; u16 ospeed = 0, speed = 0; u16 old_oper_key = port->actor_oper_port_key; port->actor_admin_port_key &= ~(AD_SPEED_KEY_MASKS|AD_DUPLEX_KEY_MASKS); if (!reset) { speed = __get_link_speed(port); ospeed = (old_oper_key & AD_SPEED_KEY_MASKS) >> 1; duplex = __get_duplex(port); port->actor_admin_port_key |= (speed << 1) | duplex; } port->actor_oper_port_key = port->actor_admin_port_key; if (old_oper_key != port->actor_oper_port_key) { /* Only 'duplex' port participates in LACP */ if (duplex) port->sm_vars |= AD_PORT_LACP_ENABLED; else port->sm_vars &= ~AD_PORT_LACP_ENABLED; if (!reset) { if (!speed) { slave_err(port->slave->bond->dev, port->slave->dev, "speed changed to 0 on port %d\n", port->actor_port_number); } else if (duplex && ospeed != speed) { /* Speed change restarts LACP state-machine */ port->sm_vars |= AD_PORT_BEGIN; } } } } /** * bond_3ad_adapter_speed_duplex_changed - handle a slave's speed / duplex * change indication * * @slave: slave struct to work on * * Handle reselection of aggregator (if needed) for this port. */ void bond_3ad_adapter_speed_duplex_changed(struct slave *slave) { struct port *port; port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(slave->bond->dev, slave->dev, "speed/duplex changed for uninitialized port\n"); return; } spin_lock_bh(&slave->bond->mode_lock); ad_update_actor_keys(port, false); spin_unlock_bh(&slave->bond->mode_lock); slave_dbg(slave->bond->dev, slave->dev, "Port %d changed speed/duplex\n", port->actor_port_number); } /** * bond_3ad_handle_link_change - handle a slave's link status change indication * @slave: slave struct to work on * @link: whether the link is now up or down * * Handle reselection of aggregator (if needed) for this port. */ void bond_3ad_handle_link_change(struct slave *slave, char link) { struct aggregator *agg; struct port *port; bool dummy; port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(slave->bond->dev, slave->dev, "link status changed for uninitialized port\n"); return; } spin_lock_bh(&slave->bond->mode_lock); /* on link down we are zeroing duplex and speed since * some of the adaptors(ce1000.lan) report full duplex/speed * instead of N/A(duplex) / 0(speed). * * on link up we are forcing recheck on the duplex and speed since * some of he adaptors(ce1000.lan) report. */ if (link == BOND_LINK_UP) { port->is_enabled = true; ad_update_actor_keys(port, false); } else { /* link has failed */ port->is_enabled = false; ad_update_actor_keys(port, true); } agg = __get_first_agg(port); ad_agg_selection_logic(agg, &dummy); spin_unlock_bh(&slave->bond->mode_lock); slave_dbg(slave->bond->dev, slave->dev, "Port %d changed link status to %s\n", port->actor_port_number, link == BOND_LINK_UP ? "UP" : "DOWN"); /* RTNL is held and mode_lock is released so it's safe * to update slave_array here. */ bond_update_slave_arr(slave->bond, NULL); } /** * bond_3ad_set_carrier - set link state for bonding master * @bond: bonding structure * * if we have an active aggregator, we're up, if not, we're down. * Presumes that we cannot have an active aggregator if there are * no slaves with link up. * * This behavior complies with IEEE 802.3 section 43.3.9. * * Called by bond_set_carrier(). Return zero if carrier state does not * change, nonzero if it does. */ int bond_3ad_set_carrier(struct bonding *bond) { struct aggregator *active; struct slave *first_slave; int ret = 1; rcu_read_lock(); first_slave = bond_first_slave_rcu(bond); if (!first_slave) { ret = 0; goto out; } active = __get_active_agg(&(SLAVE_AD_INFO(first_slave)->aggregator)); if (active) { /* are enough slaves available to consider link up? */ if (__agg_active_ports(active) < bond->params.min_links) { if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); goto out; } } else if (!netif_carrier_ok(bond->dev)) { netif_carrier_on(bond->dev); goto out; } } else if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); } out: rcu_read_unlock(); return ret; } /** * __bond_3ad_get_active_agg_info - get information of the active aggregator * @bond: bonding struct to work on * @ad_info: ad_info struct to fill with the bond's info * * Returns: 0 on success * < 0 on error */ int __bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info) { struct aggregator *aggregator = NULL; struct list_head *iter; struct slave *slave; struct port *port; bond_for_each_slave_rcu(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (port->aggregator && port->aggregator->is_active) { aggregator = port->aggregator; break; } } if (!aggregator) return -1; ad_info->aggregator_id = aggregator->aggregator_identifier; ad_info->ports = __agg_active_ports(aggregator); ad_info->actor_key = aggregator->actor_oper_aggregator_key; ad_info->partner_key = aggregator->partner_oper_aggregator_key; ether_addr_copy(ad_info->partner_system, aggregator->partner_system.mac_addr_value); return 0; } int bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info) { int ret; rcu_read_lock(); ret = __bond_3ad_get_active_agg_info(bond, ad_info); rcu_read_unlock(); return ret; } int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct lacpdu *lacpdu, _lacpdu; if (skb->protocol != PKT_TYPE_LACPDU) return RX_HANDLER_ANOTHER; if (!MAC_ADDRESS_EQUAL(eth_hdr(skb)->h_dest, lacpdu_mcast_addr)) return RX_HANDLER_ANOTHER; lacpdu = skb_header_pointer(skb, 0, sizeof(_lacpdu), &_lacpdu); if (!lacpdu) { atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_illegal_rx); atomic64_inc(&BOND_AD_INFO(bond).stats.lacpdu_illegal_rx); return RX_HANDLER_ANOTHER; } return bond_3ad_rx_indication(lacpdu, slave); } /** * bond_3ad_update_lacp_rate - change the lacp rate * @bond: bonding struct * * When modify lacp_rate parameter via sysfs, * update actor_oper_port_state of each port. * * Hold bond->mode_lock, * so we can modify port->actor_oper_port_state, * no matter bond is up or down. */ void bond_3ad_update_lacp_rate(struct bonding *bond) { struct port *port = NULL; struct list_head *iter; struct slave *slave; int lacp_fast; lacp_fast = bond->params.lacp_fast; spin_lock_bh(&bond->mode_lock); bond_for_each_slave(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (lacp_fast) port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT; else port->actor_oper_port_state &= ~LACP_STATE_LACP_TIMEOUT; } spin_unlock_bh(&bond->mode_lock); } /** * bond_3ad_update_lacp_active - change the lacp active * @bond: bonding struct * * Update actor_oper_port_state when lacp_active is modified. */ void bond_3ad_update_lacp_active(struct bonding *bond) { struct port *port = NULL; struct list_head *iter; struct slave *slave; int lacp_active; lacp_active = bond->params.lacp_active; spin_lock_bh(&bond->mode_lock); bond_for_each_slave(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (lacp_active) port->actor_oper_port_state |= LACP_STATE_LACP_ACTIVITY; else port->actor_oper_port_state &= ~LACP_STATE_LACP_ACTIVITY; } spin_unlock_bh(&bond->mode_lock); } size_t bond_3ad_stats_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_TX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_UNKNOWN_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_ILLEGAL_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_TX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_TX */ nla_total_size_64bit(sizeof(u64)); /* BOND_3AD_STAT_MARKER_UNKNOWN_RX */ } int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats) { u64 val; val = atomic64_read(&stats->lacpdu_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_unknown_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_UNKNOWN_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_illegal_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_ILLEGAL_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_resp_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_resp_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_unknown_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_UNKNOWN_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; return 0; }
14 14 11 3 40 39 1 1 39 38 37 38 38 24 3 11 4 29 1 30 30 29 1 289 56 247 291 5 5 5 5 3 3 3 3 3 31 1 31 16 14 1 14 23 9 4 2 4 7 31 32 32 146 2 147 145 147 551 536 31 31 31 27 214 11 27 943 904 808 102 808 5 1 808 807 575 102 483 303 303 5 9 346 346 7 36 259 261 259 4 2 2 4 2 8 2 2 2 13 31 13 2 16 31 31 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 International Business Machines, Corp. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel implementation * * These functions handle all input from the IP layer into SCTP. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Xingang Guo <xingang.guo@intel.com> * Jon Grimm <jgrimm@us.ibm.com> * Hui Huang <hui.huang@nokia.com> * Daisy Chang <daisyc@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> */ #include <linux/types.h> #include <linux/list.h> /* For struct list_head */ #include <linux/socket.h> #include <linux/ip.h> #include <linux/time.h> /* For struct timeval */ #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/snmp.h> #include <net/sock.h> #include <net/xfrm.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/checksum.h> #include <net/net_namespace.h> #include <linux/rhashtable.h> #include <net/sock_reuseport.h> /* Forward declarations for internal helpers. */ static int sctp_rcv_ootb(struct sk_buff *); static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif); static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, const union sctp_addr *daddr, int dif, int sdif); static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, struct sctp_transport **pt, int dif, int sdif); static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb); /* Calculate the SCTP checksum of an SCTP packet. */ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb) { struct sctphdr *sh = sctp_hdr(skb); __le32 cmp = sh->checksum; __le32 val = sctp_compute_cksum(skb, 0); if (val != cmp) { /* CRC failure, dump it. */ __SCTP_INC_STATS(net, SCTP_MIB_CHECKSUMERRORS); return -1; } return 0; } /* * This is the routine which IP calls when receiving an SCTP packet. */ int sctp_rcv(struct sk_buff *skb) { struct sock *sk; struct sctp_association *asoc; struct sctp_endpoint *ep = NULL; struct sctp_ep_common *rcvr; struct sctp_transport *transport = NULL; struct sctp_chunk *chunk; union sctp_addr src; union sctp_addr dest; int family; struct sctp_af *af; struct net *net = dev_net(skb->dev); bool is_gso = skb_is_gso(skb) && skb_is_gso_sctp(skb); int dif, sdif; if (skb->pkt_type != PACKET_HOST) goto discard_it; __SCTP_INC_STATS(net, SCTP_MIB_INSCTPPACKS); /* If packet is too small to contain a single chunk, let's not * waste time on it anymore. */ if (skb->len < sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + skb_transport_offset(skb)) goto discard_it; /* If the packet is fragmented and we need to do crc checking, * it's better to just linearize it otherwise crc computing * takes longer. */ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) || !pskb_may_pull(skb, sizeof(struct sctphdr))) goto discard_it; /* Pull up the IP header. */ __skb_pull(skb, skb_transport_offset(skb)); skb->csum_valid = 0; /* Previous value not applicable */ if (skb_csum_unnecessary(skb)) __skb_decr_checksum_unnecessary(skb); else if (!sctp_checksum_disable && !is_gso && sctp_rcv_checksum(net, skb) < 0) goto discard_it; skb->csum_valid = 1; __skb_pull(skb, sizeof(struct sctphdr)); family = ipver2af(ip_hdr(skb)->version); af = sctp_get_af_specific(family); if (unlikely(!af)) goto discard_it; SCTP_INPUT_CB(skb)->af = af; /* Initialize local addresses for lookups. */ af->from_skb(&src, skb, 1); af->from_skb(&dest, skb, 0); dif = af->skb_iif(skb); sdif = af->skb_sdif(skb); /* If the packet is to or from a non-unicast address, * silently discard the packet. * * This is not clearly defined in the RFC except in section * 8.4 - OOTB handling. However, based on the book "Stream Control * Transmission Protocol" 2.1, "It is important to note that the * IP address of an SCTP transport address must be a routable * unicast address. In other words, IP multicast addresses and * IP broadcast addresses cannot be used in an SCTP transport * address." */ if (!af->addr_valid(&src, NULL, skb) || !af->addr_valid(&dest, NULL, skb)) goto discard_it; asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport, dif, sdif); if (!asoc) ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src, dif, sdif); /* Retrieve the common input handling substructure. */ rcvr = asoc ? &asoc->base : &ep->base; sk = rcvr->sk; /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * An SCTP packet is called an "out of the blue" (OOTB) * packet if it is correctly formed, i.e., passed the * receiver's checksum check, but the receiver is not * able to identify the association to which this * packet belongs. */ if (!asoc) { if (sctp_rcv_ootb(skb)) { __SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES); goto discard_release; } } if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) goto discard_release; nf_reset_ct(skb); if (sk_filter(sk, skb) || skb->len < sizeof(struct sctp_chunkhdr)) goto discard_release; /* Create an SCTP packet structure. */ chunk = sctp_chunkify(skb, asoc, sk, GFP_ATOMIC); if (!chunk) goto discard_release; SCTP_INPUT_CB(skb)->chunk = chunk; /* Remember what endpoint is to handle this packet. */ chunk->rcvr = rcvr; /* Remember the SCTP header. */ chunk->sctp_hdr = sctp_hdr(skb); /* Set the source and destination addresses of the incoming chunk. */ sctp_init_addrs(chunk, &src, &dest); /* Remember where we came from. */ chunk->transport = transport; /* Acquire access to the sock lock. Note: We are safe from other * bottom halves on this lock, but a user may be in the lock too, * so check if it is busy. */ bh_lock_sock(sk); if (sk != rcvr->sk) { /* Our cached sk is different from the rcvr->sk. This is * because migrate()/accept() may have moved the association * to a new socket and released all the sockets. So now we * are holding a lock on the old socket while the user may * be doing something with the new socket. Switch our veiw * of the current sk. */ bh_unlock_sock(sk); sk = rcvr->sk; bh_lock_sock(sk); } if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sctp_add_backlog(sk, skb)) { bh_unlock_sock(sk); sctp_chunk_free(chunk); skb = NULL; /* sctp_chunk_free already freed the skb */ goto discard_release; } __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_BACKLOG); } else { __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_SOFTIRQ); sctp_inq_push(&chunk->rcvr->inqueue, chunk); } bh_unlock_sock(sk); /* Release the asoc/ep ref we took in the lookup calls. */ if (transport) sctp_transport_put(transport); else sctp_endpoint_put(ep); return 0; discard_it: __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_DISCARDS); kfree_skb(skb); return 0; discard_release: /* Release the asoc/ep ref we took in the lookup calls. */ if (transport) sctp_transport_put(transport); else sctp_endpoint_put(ep); goto discard_it; } /* Process the backlog queue of the socket. Every skb on * the backlog holds a ref on an association or endpoint. * We hold this ref throughout the state machine to make * sure that the structure we need is still around. */ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; struct sctp_inq *inqueue = &chunk->rcvr->inqueue; struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = NULL; int backloged = 0; rcvr = chunk->rcvr; /* If the rcvr is dead then the association or endpoint * has been deleted and we can safely drop the chunk * and refs that we are holding. */ if (rcvr->dead) { sctp_chunk_free(chunk); goto done; } if (unlikely(rcvr->sk != sk)) { /* In this case, the association moved from one socket to * another. We are currently sitting on the backlog of the * old socket, so we need to move. * However, since we are here in the process context we * need to take make sure that the user doesn't own * the new socket when we process the packet. * If the new socket is user-owned, queue the chunk to the * backlog of the new socket without dropping any refs. * Otherwise, we can safely push the chunk on the inqueue. */ sk = rcvr->sk; local_bh_disable(); bh_lock_sock(sk); if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) sctp_chunk_free(chunk); else backloged = 1; } else sctp_inq_push(inqueue, chunk); bh_unlock_sock(sk); local_bh_enable(); /* If the chunk was backloged again, don't drop refs */ if (backloged) return 0; } else { if (!sctp_newsk_ready(sk)) { if (!sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) return 0; sctp_chunk_free(chunk); } else { sctp_inq_push(inqueue, chunk); } } done: /* Release the refs we took in sctp_add_backlog */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) sctp_transport_put(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_put(sctp_ep(rcvr)); else BUG(); return 0; } static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = chunk->rcvr; int ret; ret = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); if (!ret) { /* Hold the assoc/ep while hanging on the backlog queue. * This way, we know structures we need will not disappear * from us */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) sctp_transport_hold(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_hold(sctp_ep(rcvr)); else BUG(); } return ret; } /* Handle icmp frag needed error. */ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t, __u32 pmtu) { if (!t || (t->pathmtu <= pmtu && t->pl.probe_size + sctp_transport_pl_hlen(t) <= pmtu)) return; if (sock_owned_by_user(sk)) { atomic_set(&t->mtu_info, pmtu); asoc->pmtu_pending = 1; t->pmtu_pending = 1; return; } if (!(t->param_flags & SPP_PMTUD_ENABLE)) /* We can't allow retransmitting in such case, as the * retransmission would be sized just as before, and thus we * would get another icmp, and retransmit again. */ return; /* Update transports view of the MTU. Return if no update was needed. * If an update wasn't needed/possible, it also doesn't make sense to * try to retransmit now. */ if (!sctp_transport_update_pmtu(t, pmtu)) return; /* Update association pmtu. */ sctp_assoc_sync_pmtu(asoc); /* Retransmit with the new pmtu setting. */ sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t, struct sk_buff *skb) { struct dst_entry *dst; if (sock_owned_by_user(sk) || !t) return; dst = sctp_transport_dst_check(t); if (dst) dst->ops->redirect(dst, sk, skb); } /* * SCTP Implementer's Guide, 2.37 ICMP handling procedures * * ICMP8) If the ICMP code is a "Unrecognized next header type encountered" * or a "Protocol Unreachable" treat this message as an abort * with the T bit set. * * This function sends an event to the state machine, which will abort the * association. * */ void sctp_icmp_proto_unreachable(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t) { if (sock_owned_by_user(sk)) { if (timer_pending(&t->proto_unreach_timer)) return; else { if (!mod_timer(&t->proto_unreach_timer, jiffies + (HZ/20))) sctp_transport_hold(t); } } else { struct net *net = sock_net(sk); pr_debug("%s: unrecognized next header type " "encountered!\n", __func__); if (timer_delete(&t->proto_unreach_timer)) sctp_transport_put(t); sctp_do_sm(net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), asoc->state, asoc->ep, asoc, t, GFP_ATOMIC); } } /* Common lookup code for icmp/icmpv6 error handler. */ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, struct sctphdr *sctphdr, struct sctp_association **app, struct sctp_transport **tpp) { struct sctp_init_chunk *chunkhdr, _chunkhdr; union sctp_addr saddr; union sctp_addr daddr; struct sctp_af *af; struct sock *sk = NULL; struct sctp_association *asoc; struct sctp_transport *transport = NULL; __u32 vtag = ntohl(sctphdr->vtag); int sdif = inet_sdif(skb); int dif = inet_iif(skb); *app = NULL; *tpp = NULL; af = sctp_get_af_specific(family); if (unlikely(!af)) { return NULL; } /* Initialize local addresses for lookups. */ af->from_skb(&saddr, skb, 1); af->from_skb(&daddr, skb, 0); /* Look for an association that matches the incoming ICMP error * packet. */ asoc = __sctp_lookup_association(net, &saddr, &daddr, &transport, dif, sdif); if (!asoc) return NULL; sk = asoc->base.sk; /* RFC 4960, Appendix C. ICMP Handling * * ICMP6) An implementation MUST validate that the Verification Tag * contained in the ICMP message matches the Verification Tag of * the peer. If the Verification Tag is not 0 and does NOT * match, discard the ICMP message. If it is 0 and the ICMP * message contains enough bytes to verify that the chunk type is * an INIT chunk and that the Initiate Tag matches the tag of the * peer, continue with ICMP7. If the ICMP message is too short * or the chunk type or the Initiate Tag does not match, silently * discard the packet. */ if (vtag == 0) { /* chunk header + first 4 octects of init header */ chunkhdr = skb_header_pointer(skb, skb_transport_offset(skb) + sizeof(struct sctphdr), sizeof(struct sctp_chunkhdr) + sizeof(__be32), &_chunkhdr); if (!chunkhdr || chunkhdr->chunk_hdr.type != SCTP_CID_INIT || ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) goto out; } else if (vtag != asoc->c.peer_vtag) { goto out; } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. */ if (sock_owned_by_user(sk)) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); *app = asoc; *tpp = transport; return sk; out: sctp_transport_put(transport); return NULL; } /* Common cleanup code for icmp/icmpv6 error handler. */ void sctp_err_finish(struct sock *sk, struct sctp_transport *t) __releases(&((__sk)->sk_lock.slock)) { bh_unlock_sock(sk); sctp_transport_put(t); } static void sctp_v4_err_handle(struct sctp_transport *t, struct sk_buff *skb, __u8 type, __u8 code, __u32 info) { struct sctp_association *asoc = t->asoc; struct sock *sk = asoc->base.sk; int err = 0; switch (type) { case ICMP_PARAMETERPROB: err = EPROTO; break; case ICMP_DEST_UNREACH: if (code > NR_ICMP_UNREACH) return; if (code == ICMP_FRAG_NEEDED) { sctp_icmp_frag_needed(sk, asoc, t, SCTP_TRUNC4(info)); return; } if (code == ICMP_PROT_UNREACH) { sctp_icmp_proto_unreachable(sk, asoc, t); return; } err = icmp_err_convert[code].errno; break; case ICMP_TIME_EXCEEDED: if (code == ICMP_EXC_FRAGTIME) return; err = EHOSTUNREACH; break; case ICMP_REDIRECT: sctp_icmp_redirect(sk, t, skb); return; default: return; } if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { sk->sk_err = err; sk_error_report(sk); } else { /* Only an error on timeout */ WRITE_ONCE(sk->sk_err_soft, err); } } /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. After adjustment * header points to the first 8 bytes of the sctp header. We need * to find the appropriate port. * * The locking strategy used here is very "optimistic". When * someone else accesses the socket the ICMP is just dropped * and for some paths there is no check at all. * A more general error queue to queue errors for later handling * is probably better. * */ int sctp_v4_err(struct sk_buff *skb, __u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct net *net = dev_net(skb->dev); struct sctp_transport *transport; struct sctp_association *asoc; __u16 saveip, savesctp; struct sock *sk; /* Fix up skb to look at the embedded net header. */ saveip = skb->network_header; savesctp = skb->transport_header; skb_reset_network_header(skb); skb_set_transport_header(skb, iph->ihl * 4); sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &transport); /* Put back, the original values. */ skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } sctp_v4_err_handle(transport, skb, type, code, info); sctp_err_finish(sk, transport); return 0; } int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sctp_association *asoc; struct sctp_transport *t; struct icmphdr *hdr; __u32 info = 0; skb->transport_header += sizeof(struct udphdr); sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &t); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } skb->transport_header -= sizeof(struct udphdr); hdr = (struct icmphdr *)(skb_network_header(skb) - sizeof(struct icmphdr)); if (hdr->type == ICMP_REDIRECT) { /* can't be handled without outer iphdr known, leave it to udp_err */ sctp_err_finish(sk, t); return 0; } if (hdr->type == ICMP_DEST_UNREACH && hdr->code == ICMP_FRAG_NEEDED) info = ntohs(hdr->un.frag.mtu); sctp_v4_err_handle(t, skb, hdr->type, hdr->code, info); sctp_err_finish(sk, t); return 1; } /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * * This function scans all the chunks in the OOTB packet to determine if * the packet should be discarded right away. If a response might be needed * for this packet, or, if further processing is possible, the packet will * be queued to a proper inqueue for the next phase of handling. * * Output: * Return 0 - If further processing is needed. * Return 1 - If the packet can be discarded right away. */ static int sctp_rcv_ootb(struct sk_buff *skb) { struct sctp_chunkhdr *ch, _ch; int ch_end, offset = 0; /* Scan through all the chunks in the packet. */ do { /* Make sure we have at least the header there */ if (offset + sizeof(_ch) > skb->len) break; ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch); /* Break out if chunk length is less then minimal. */ if (!ch || ntohs(ch->length) < sizeof(_ch)) break; ch_end = offset + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb->len) break; /* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the * receiver MUST silently discard the OOTB packet and take no * further action. */ if (SCTP_CID_ABORT == ch->type) goto discard; /* RFC 8.4, 6) If the packet contains a SHUTDOWN COMPLETE * chunk, the receiver should silently discard the packet * and take no further action. */ if (SCTP_CID_SHUTDOWN_COMPLETE == ch->type) goto discard; /* RFC 4460, 2.11.2 * This will discard packets with INIT chunk bundled as * subsequent chunks in the packet. When INIT is first, * the normal INIT processing will discard the chunk. */ if (SCTP_CID_INIT == ch->type && (void *)ch != skb->data) goto discard; offset = ch_end; } while (ch_end < skb->len); return 0; discard: return 1; } /* Insert endpoint into the hash table. */ static int __sctp_hash_endpoint(struct sctp_endpoint *ep) { struct sock *sk = ep->base.sk; struct net *net = sock_net(sk); struct sctp_hashbucket *head; int err = 0; ep->hashent = sctp_ep_hashfn(net, ep->base.bind_addr.port); head = &sctp_ep_hashtable[ep->hashent]; write_lock(&head->lock); if (sk->sk_reuseport) { bool any = sctp_is_ep_boundall(sk); struct sctp_endpoint *ep2; struct list_head *list; int cnt = 0; err = 1; list_for_each(list, &ep->base.bind_addr.address_list) cnt++; sctp_for_each_hentry(ep2, &head->chain) { struct sock *sk2 = ep2->base.sk; if (!net_eq(sock_net(sk2), net) || sk2 == sk || !uid_eq(sk_uid(sk2), sk_uid(sk)) || !sk2->sk_reuseport) continue; err = sctp_bind_addrs_check(sctp_sk(sk2), sctp_sk(sk), cnt); if (!err) { err = reuseport_add_sock(sk, sk2, any); if (err) goto out; break; } else if (err < 0) { goto out; } } if (err) { err = reuseport_alloc(sk, any); if (err) goto out; } } hlist_add_head(&ep->node, &head->chain); out: write_unlock(&head->lock); return err; } /* Add an endpoint to the hash. Local BH-safe. */ int sctp_hash_endpoint(struct sctp_endpoint *ep) { int err; local_bh_disable(); err = __sctp_hash_endpoint(ep); local_bh_enable(); return err; } /* Remove endpoint from the hash table. */ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) { struct sock *sk = ep->base.sk; struct sctp_hashbucket *head; ep->hashent = sctp_ep_hashfn(sock_net(sk), ep->base.bind_addr.port); head = &sctp_ep_hashtable[ep->hashent]; write_lock(&head->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); hlist_del_init(&ep->node); write_unlock(&head->lock); } /* Remove endpoint from the hash. Local BH-safe. */ void sctp_unhash_endpoint(struct sctp_endpoint *ep) { local_bh_disable(); __sctp_unhash_endpoint(ep); local_bh_enable(); } static inline __u32 sctp_hashfn(const struct net *net, __be16 lport, const union sctp_addr *paddr, __u32 seed) { __u32 addr; if (paddr->sa.sa_family == AF_INET6) addr = jhash(&paddr->v6.sin6_addr, 16, seed); else addr = (__force __u32)paddr->v4.sin_addr.s_addr; return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | (__force __u32)lport, net_hash_mix(net), seed); } /* Look up an endpoint. */ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct sctp_hashbucket *head; struct sctp_endpoint *ep; struct sock *sk; __be16 lport; int hash; lport = laddr->v4.sin_port; hash = sctp_ep_hashfn(net, ntohs(lport)); head = &sctp_ep_hashtable[hash]; read_lock(&head->lock); sctp_for_each_hentry(ep, &head->chain) { if (sctp_endpoint_is_match(ep, net, laddr, dif, sdif)) goto hit; } ep = sctp_sk(net->sctp.ctl_sock)->ep; hit: sk = ep->base.sk; if (sk->sk_reuseport) { __u32 phash = sctp_hashfn(net, lport, paddr, 0); sk = reuseport_select_sock(sk, phash, skb, sizeof(struct sctphdr)); if (sk) ep = sctp_sk(sk)->ep; } sctp_endpoint_hold(ep); read_unlock(&head->lock); return ep; } /* rhashtable for transport */ struct sctp_hash_cmp_arg { const union sctp_addr *paddr; const struct net *net; __be16 lport; }; static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { struct sctp_transport *t = (struct sctp_transport *)ptr; const struct sctp_hash_cmp_arg *x = arg->key; int err = 1; if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) return err; if (!sctp_transport_hold(t)) return err; if (!net_eq(t->asoc->base.net, x->net)) goto out; if (x->lport != htons(t->asoc->base.bind_addr.port)) goto out; err = 0; out: sctp_transport_put(t); return err; } static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed) { const struct sctp_transport *t = data; return sctp_hashfn(t->asoc->base.net, htons(t->asoc->base.bind_addr.port), &t->ipaddr, seed); } static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed) { const struct sctp_hash_cmp_arg *x = data; return sctp_hashfn(x->net, x->lport, x->paddr, seed); } static const struct rhashtable_params sctp_hash_params = { .head_offset = offsetof(struct sctp_transport, node), .hashfn = sctp_hash_key, .obj_hashfn = sctp_hash_obj, .obj_cmpfn = sctp_hash_cmp, .automatic_shrinking = true, }; int sctp_transport_hashtable_init(void) { return rhltable_init(&sctp_transport_hashtable, &sctp_hash_params); } void sctp_transport_hashtable_destroy(void) { rhltable_destroy(&sctp_transport_hashtable); } int sctp_hash_transport(struct sctp_transport *t) { struct sctp_transport *transport; struct rhlist_head *tmp, *list; struct sctp_hash_cmp_arg arg; int err; if (t->asoc->temp) return 0; arg.net = t->asoc->base.net; arg.paddr = &t->ipaddr; arg.lport = htons(t->asoc->base.bind_addr.port); rcu_read_lock(); list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(transport, tmp, list, node) if (transport->asoc->ep == t->asoc->ep) { rcu_read_unlock(); return -EEXIST; } rcu_read_unlock(); err = rhltable_insert_key(&sctp_transport_hashtable, &arg, &t->node, sctp_hash_params); if (err) pr_err_once("insert transport fail, errno %d\n", err); return err; } void sctp_unhash_transport(struct sctp_transport *t) { if (t->asoc->temp) return; rhltable_remove(&sctp_transport_hashtable, &t->node, sctp_hash_params); } bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { bool l3mdev_accept = true; #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) l3mdev_accept = !!READ_ONCE(net->sctp.l3mdev_accept); #endif return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif); } /* return a transport with holding it */ struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct rhlist_head *tmp, *list; struct sctp_transport *t; int bound_dev_if; struct sctp_hash_cmp_arg arg = { .paddr = paddr, .net = net, .lport = laddr->v4.sin_port, }; list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(t, tmp, list, node) { if (!sctp_transport_hold(t)) continue; bound_dev_if = READ_ONCE(t->asoc->base.sk->sk_bound_dev_if); if (sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) && sctp_bind_addr_match(&t->asoc->base.bind_addr, laddr, sctp_sk(t->asoc->base.sk))) return t; sctp_transport_put(t); } return NULL; } /* return a transport without holding it, as it's only used under sock lock */ struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr) { struct rhlist_head *tmp, *list; struct sctp_transport *t; struct sctp_hash_cmp_arg arg = { .paddr = paddr, .net = ep->base.net, .lport = htons(ep->base.bind_addr.port), }; list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(t, tmp, list, node) if (ep == t->asoc->ep) return t; return NULL; } /* Look up an association. */ static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, struct sctp_transport **pt, int dif, int sdif) { struct sctp_transport *t; struct sctp_association *asoc = NULL; t = sctp_addrs_lookup_transport(net, local, peer, dif, sdif); if (!t) goto out; asoc = t->asoc; *pt = t; out: return asoc; } /* Look up an association. protected by RCU read lock */ static struct sctp_association *sctp_lookup_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; rcu_read_lock(); asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); rcu_read_unlock(); return asoc; } /* Is there an association matching the given local and peer addresses? */ bool sctp_has_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct sctp_transport *transport; if (sctp_lookup_association(net, laddr, paddr, &transport, dif, sdif)) { sctp_transport_put(transport); return true; } return false; } /* * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. * * D) When searching for a matching TCB upon reception of an INIT * or INIT-ACK chunk the receiver SHOULD use not only the * source address of the packet (containing the INIT or * INIT-ACK) but the receiver SHOULD also use all valid * address parameters contained within the chunk. * * 2.18.3 Solution description * * This new text clearly specifies to an implementor the need * to look within the INIT or INIT-ACK. Any implementation that * does not do this, may not be able to establish associations * in certain circumstances. * */ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; union sctp_addr addr; union sctp_addr *paddr = &addr; struct sctphdr *sh = sctp_hdr(skb); union sctp_params params; struct sctp_init_chunk *init; struct sctp_af *af; /* * This code will NOT touch anything inside the chunk--it is * strictly READ-ONLY. * * RFC 2960 3 SCTP packet Format * * Multiple chunks can be bundled into one SCTP packet up to * the MTU size, except for the INIT, INIT ACK, and SHUTDOWN * COMPLETE chunks. These chunks MUST NOT be bundled with any * other chunk in a packet. See Section 6.10 for more details * on chunk bundling. */ /* Find the start of the TLVs and the end of the chunk. This is * the region we search for address parameters. */ init = (struct sctp_init_chunk *)skb->data; /* Walk the parameters looking for embedded addresses. */ sctp_walk_params(params, init) { /* Note: Ignoring hostname addresses. */ af = sctp_get_af_specific(param_type2af(params.p->type)); if (!af) continue; if (!af->from_addr_param(paddr, params.addr, sh->source, 0)) continue; asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) return asoc; } return NULL; } /* ADD-IP, Section 5.2 * When an endpoint receives an ASCONF Chunk from the remote peer * special procedures may be needed to identify the association the * ASCONF Chunk is associated with. To properly find the association * the following procedures SHOULD be followed: * * D2) If the association is not found, use the address found in the * Address Parameter TLV combined with the port number found in the * SCTP common header. If found proceed to rule D4. * * D2-ext) If more than one ASCONF Chunks are packed together, use the * address found in the ASCONF Address Parameter TLV of each of the * subsequent ASCONF Chunks. If found, proceed to rule D4. */ static struct sctp_association *__sctp_rcv_asconf_lookup( struct net *net, struct sctp_chunkhdr *ch, const union sctp_addr *laddr, __be16 peer_port, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_addip_chunk *asconf = (struct sctp_addip_chunk *)ch; struct sctp_af *af; union sctp_addr_param *param; union sctp_addr paddr; if (ntohs(ch->length) < sizeof(*asconf) + sizeof(struct sctp_paramhdr)) return NULL; /* Skip over the ADDIP header and find the Address parameter */ param = (union sctp_addr_param *)(asconf + 1); af = sctp_get_af_specific(param_type2af(param->p.type)); if (unlikely(!af)) return NULL; if (!af->from_addr_param(&paddr, param, peer_port, 0)) return NULL; return __sctp_lookup_association(net, laddr, &paddr, transportp, dif, sdif); } /* SCTP-AUTH, Section 6.3: * If the receiver does not find a STCB for a packet containing an AUTH * chunk as the first chunk and not a COOKIE-ECHO chunk as the second * chunk, it MUST use the chunks after the AUTH chunk to look up an existing * association. * * This means that any chunks that can help us identify the association need * to be looked at to find this association. */ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc = NULL; struct sctp_chunkhdr *ch; int have_auth = 0; unsigned int chunk_num = 1; __u8 *ch_end; /* Walk through the chunks looking for AUTH or ASCONF chunks * to help us find the association. */ ch = (struct sctp_chunkhdr *)skb->data; do { /* Break out if chunk length is less then minimal. */ if (ntohs(ch->length) < sizeof(*ch)) break; ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) break; switch (ch->type) { case SCTP_CID_AUTH: have_auth = chunk_num; break; case SCTP_CID_COOKIE_ECHO: /* If a packet arrives containing an AUTH chunk as * a first chunk, a COOKIE-ECHO chunk as the second * chunk, and possibly more chunks after them, and * the receiver does not have an STCB for that * packet, then authentication is based on * the contents of the COOKIE- ECHO chunk. */ if (have_auth == 1 && chunk_num == 2) return NULL; break; case SCTP_CID_ASCONF: if (have_auth || net->sctp.addip_noauth) asoc = __sctp_rcv_asconf_lookup( net, ch, laddr, sctp_hdr(skb)->source, transportp, dif, sdif); break; default: break; } if (asoc) break; ch = (struct sctp_chunkhdr *)ch_end; chunk_num++; } while (ch_end + sizeof(*ch) < skb_tail_pointer(skb)); return asoc; } /* * There are circumstances when we need to look inside the SCTP packet * for information to help us find the association. Examples * include looking inside of INIT/INIT-ACK chunks or after the AUTH * chunks. */ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_chunkhdr *ch; /* We do not allow GSO frames here as we need to linearize and * then cannot guarantee frame boundaries. This shouldn't be an * issue as packets hitting this are mostly INIT or INIT-ACK and * those cannot be on GSO-style anyway. */ if (skb_is_gso(skb) && skb_is_gso_sctp(skb)) return NULL; ch = (struct sctp_chunkhdr *)skb->data; /* The code below will attempt to walk the chunk and extract * parameter information. Before we do that, we need to verify * that the chunk length doesn't cause overflow. Otherwise, we'll * walk off the end. */ if (SCTP_PAD4(ntohs(ch->length)) > skb->len) return NULL; /* If this is INIT/INIT-ACK look inside the chunk too. */ if (ch->type == SCTP_CID_INIT || ch->type == SCTP_CID_INIT_ACK) return __sctp_rcv_init_lookup(net, skb, laddr, transportp, dif, sdif); return __sctp_rcv_walk_lookup(net, skb, laddr, transportp, dif, sdif); } /* Lookup an association for an inbound skb. */ static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) goto out; /* Further lookup for INIT/INIT-ACK packets. * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. */ asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp, dif, sdif); if (asoc) goto out; if (paddr->sa.sa_family == AF_INET) pr_debug("sctp: asoc not found for src:%pI4:%d dst:%pI4:%d\n", &laddr->v4.sin_addr, ntohs(laddr->v4.sin_port), &paddr->v4.sin_addr, ntohs(paddr->v4.sin_port)); else pr_debug("sctp: asoc not found for src:%pI6:%d dst:%pI6:%d\n", &laddr->v6.sin6_addr, ntohs(laddr->v6.sin6_port), &paddr->v6.sin6_addr, ntohs(paddr->v6.sin6_port)); out: return asoc; }
2 1 1 3 1 1 1 3 2 1 1 14 2 13 3 1 2 3 3 1 2 3 5 84 4 56 24 5 68 16 1 3 2 52 84 52 84 84 1 1 3 1 2 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC key management * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * RxRPC keys should have a description of describing their purpose: * "afs@example.com" */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/skcipher.h> #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/key-type.h> #include <linux/ctype.h> #include <linux/slab.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> #include <keys/user-type.h> #include "ar-internal.h" static int rxrpc_preparse(struct key_preparsed_payload *); static void rxrpc_free_preparse(struct key_preparsed_payload *); static void rxrpc_destroy(struct key *); static void rxrpc_describe(const struct key *, struct seq_file *); static long rxrpc_read(const struct key *, char *, size_t); /* * rxrpc defined keys take an arbitrary string as the description and an * arbitrary blob of data as the payload */ struct key_type key_type_rxrpc = { .name = "rxrpc", .flags = KEY_TYPE_NET_DOMAIN, .preparse = rxrpc_preparse, .free_preparse = rxrpc_free_preparse, .instantiate = generic_key_instantiate, .destroy = rxrpc_destroy, .describe = rxrpc_describe, .read = rxrpc_read, }; EXPORT_SYMBOL(key_type_rxrpc); /* * parse an RxKAD type XDR format token * - the caller guarantees we have at least 4 words */ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, size_t datalen, const __be32 *xdr, unsigned int toklen) { struct rxrpc_key_token *token, **pptoken; time64_t expiry; size_t plen; u32 tktlen; _enter(",{%x,%x,%x,%x},%u", ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), toklen); if (toklen <= 8 * 4) return -EKEYREJECTED; tktlen = ntohl(xdr[7]); _debug("tktlen: %x", tktlen); if (tktlen > AFSTOKEN_RK_TIX_MAX) return -EKEYREJECTED; if (toklen < 8 * 4 + tktlen) return -EKEYREJECTED; plen = sizeof(*token) + sizeof(*token->kad) + tktlen; prep->quotalen = datalen + plen; plen -= sizeof(*token); token = kzalloc(sizeof(*token), GFP_KERNEL); if (!token) return -ENOMEM; token->kad = kzalloc(plen, GFP_KERNEL); if (!token->kad) { kfree(token); return -ENOMEM; } token->security_index = RXRPC_SECURITY_RXKAD; token->kad->ticket_len = tktlen; token->kad->vice_id = ntohl(xdr[0]); token->kad->kvno = ntohl(xdr[1]); token->kad->start = ntohl(xdr[4]); token->kad->expiry = ntohl(xdr[5]); token->kad->primary_flag = ntohl(xdr[6]); memcpy(&token->kad->session_key, &xdr[2], 8); memcpy(&token->kad->ticket, &xdr[8], tktlen); _debug("SCIX: %u", token->security_index); _debug("TLEN: %u", token->kad->ticket_len); _debug("EXPY: %x", token->kad->expiry); _debug("KVNO: %u", token->kad->kvno); _debug("PRIM: %u", token->kad->primary_flag); _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", token->kad->session_key[0], token->kad->session_key[1], token->kad->session_key[2], token->kad->session_key[3], token->kad->session_key[4], token->kad->session_key[5], token->kad->session_key[6], token->kad->session_key[7]); if (token->kad->ticket_len >= 8) _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", token->kad->ticket[0], token->kad->ticket[1], token->kad->ticket[2], token->kad->ticket[3], token->kad->ticket[4], token->kad->ticket[5], token->kad->ticket[6], token->kad->ticket[7]); /* count the number of tokens attached */ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); /* attach the data */ for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; *pptoken; pptoken = &(*pptoken)->next) continue; *pptoken = token; expiry = rxrpc_u32_to_time64(token->kad->expiry); if (expiry < prep->expiry) prep->expiry = expiry; _leave(" = 0"); return 0; } static u64 xdr_dec64(const __be32 *xdr) { return (u64)ntohl(xdr[0]) << 32 | (u64)ntohl(xdr[1]); } static time64_t rxrpc_s64_to_time64(s64 time_in_100ns) { bool neg = false; u64 tmp = time_in_100ns; if (time_in_100ns < 0) { tmp = -time_in_100ns; neg = true; } do_div(tmp, 10000000); return neg ? -tmp : tmp; } /* * Parse a YFS-RxGK type XDR format token * - the caller guarantees we have at least 4 words * * struct token_rxgk { * opr_time begintime; * opr_time endtime; * afs_int64 level; * afs_int64 lifetime; * afs_int64 bytelife; * afs_int64 enctype; * opaque key<>; * opaque ticket<>; * }; */ static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, size_t datalen, const __be32 *xdr, unsigned int toklen) { struct rxrpc_key_token *token, **pptoken; time64_t expiry; size_t plen; const __be32 *ticket, *key; s64 tmp; u32 tktlen, keylen; _enter(",{%x,%x,%x,%x},%x", ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), toklen); if (6 * 2 + 2 > toklen / 4) goto reject; key = xdr + (6 * 2 + 1); keylen = ntohl(key[-1]); _debug("keylen: %x", keylen); keylen = round_up(keylen, 4); if ((6 * 2 + 2) * 4 + keylen > toklen) goto reject; ticket = xdr + (6 * 2 + 1 + (keylen / 4) + 1); tktlen = ntohl(ticket[-1]); _debug("tktlen: %x", tktlen); tktlen = round_up(tktlen, 4); if ((6 * 2 + 2) * 4 + keylen + tktlen != toklen) { kleave(" = -EKEYREJECTED [%x!=%x, %x,%x]", (6 * 2 + 2) * 4 + keylen + tktlen, toklen, keylen, tktlen); goto reject; } plen = sizeof(*token) + sizeof(*token->rxgk) + tktlen + keylen; prep->quotalen = datalen + plen; plen -= sizeof(*token); token = kzalloc(sizeof(*token), GFP_KERNEL); if (!token) goto nomem; token->rxgk = kzalloc(sizeof(*token->rxgk) + keylen, GFP_KERNEL); if (!token->rxgk) goto nomem_token; token->security_index = RXRPC_SECURITY_YFS_RXGK; token->rxgk->begintime = xdr_dec64(xdr + 0 * 2); token->rxgk->endtime = xdr_dec64(xdr + 1 * 2); token->rxgk->level = tmp = xdr_dec64(xdr + 2 * 2); if (tmp < -1LL || tmp > RXRPC_SECURITY_ENCRYPT) goto reject_token; token->rxgk->lifetime = xdr_dec64(xdr + 3 * 2); token->rxgk->bytelife = xdr_dec64(xdr + 4 * 2); token->rxgk->enctype = tmp = xdr_dec64(xdr + 5 * 2); if (tmp < 0 || tmp > UINT_MAX) goto reject_token; token->rxgk->key.len = ntohl(key[-1]); token->rxgk->key.data = token->rxgk->_key; token->rxgk->ticket.len = ntohl(ticket[-1]); if (token->rxgk->endtime != 0) { expiry = rxrpc_s64_to_time64(token->rxgk->endtime); if (expiry < 0) goto expired; if (expiry < prep->expiry) prep->expiry = expiry; } memcpy(token->rxgk->key.data, key, token->rxgk->key.len); /* Pad the ticket so that we can use it directly in XDR */ token->rxgk->ticket.data = kzalloc(round_up(token->rxgk->ticket.len, 4), GFP_KERNEL); if (!token->rxgk->ticket.data) goto nomem_yrxgk; memcpy(token->rxgk->ticket.data, ticket, token->rxgk->ticket.len); _debug("SCIX: %u", token->security_index); _debug("EXPY: %llx", token->rxgk->endtime); _debug("LIFE: %llx", token->rxgk->lifetime); _debug("BYTE: %llx", token->rxgk->bytelife); _debug("ENC : %u", token->rxgk->enctype); _debug("LEVL: %u", token->rxgk->level); _debug("KLEN: %u", token->rxgk->key.len); _debug("TLEN: %u", token->rxgk->ticket.len); _debug("KEY0: %*phN", token->rxgk->key.len, token->rxgk->key.data); _debug("TICK: %*phN", min_t(u32, token->rxgk->ticket.len, 32), token->rxgk->ticket.data); /* count the number of tokens attached */ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); /* attach the data */ for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; *pptoken; pptoken = &(*pptoken)->next) continue; *pptoken = token; _leave(" = 0"); return 0; nomem_yrxgk: kfree(token->rxgk); nomem_token: kfree(token); nomem: return -ENOMEM; reject_token: kfree(token); reject: return -EKEYREJECTED; expired: kfree(token->rxgk); kfree(token); return -EKEYEXPIRED; } /* * attempt to parse the data as the XDR format * - the caller guarantees we have more than 7 words */ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) { const __be32 *xdr = prep->data, *token, *p; const char *cp; unsigned int len, paddedlen, loop, ntoken, toklen, sec_ix; size_t datalen = prep->datalen; int ret, ret2; _enter(",{%x,%x,%x,%x},%zu", ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), prep->datalen); if (datalen > AFSTOKEN_LENGTH_MAX) goto not_xdr; /* XDR is an array of __be32's */ if (datalen & 3) goto not_xdr; /* the flags should be 0 (the setpag bit must be handled by * userspace) */ if (ntohl(*xdr++) != 0) goto not_xdr; datalen -= 4; /* check the cell name */ len = ntohl(*xdr++); if (len < 1 || len > AFSTOKEN_CELL_MAX) goto not_xdr; datalen -= 4; paddedlen = (len + 3) & ~3; if (paddedlen > datalen) goto not_xdr; cp = (const char *) xdr; for (loop = 0; loop < len; loop++) if (!isprint(cp[loop])) goto not_xdr; for (; loop < paddedlen; loop++) if (cp[loop]) goto not_xdr; _debug("cellname: [%u/%u] '%*.*s'", len, paddedlen, len, len, (const char *) xdr); datalen -= paddedlen; xdr += paddedlen >> 2; /* get the token count */ if (datalen < 12) goto not_xdr; ntoken = ntohl(*xdr++); datalen -= 4; _debug("ntoken: %x", ntoken); if (ntoken < 1 || ntoken > AFSTOKEN_MAX) goto not_xdr; /* check each token wrapper */ p = xdr; loop = ntoken; do { if (datalen < 8) goto not_xdr; toklen = ntohl(*p++); sec_ix = ntohl(*p); datalen -= 4; _debug("token: [%x/%zx] %x", toklen, datalen, sec_ix); paddedlen = (toklen + 3) & ~3; if (toklen < 20 || toklen > datalen || paddedlen > datalen) goto not_xdr; datalen -= paddedlen; p += paddedlen >> 2; } while (--loop > 0); _debug("remainder: %zu", datalen); if (datalen != 0) goto not_xdr; /* okay: we're going to assume it's valid XDR format * - we ignore the cellname, relying on the key to be correctly named */ ret = -EPROTONOSUPPORT; do { toklen = ntohl(*xdr++); token = xdr; xdr += (toklen + 3) / 4; sec_ix = ntohl(*token++); toklen -= 4; _debug("TOKEN type=%x len=%x", sec_ix, toklen); switch (sec_ix) { case RXRPC_SECURITY_RXKAD: ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen); break; case RXRPC_SECURITY_YFS_RXGK: ret2 = rxrpc_preparse_xdr_yfs_rxgk(prep, datalen, token, toklen); break; default: ret2 = -EPROTONOSUPPORT; break; } switch (ret2) { case 0: ret = 0; break; case -EPROTONOSUPPORT: break; case -ENOPKG: if (ret != 0) ret = -ENOPKG; break; default: ret = ret2; goto error; } } while (--ntoken > 0); error: _leave(" = %d", ret); return ret; not_xdr: _leave(" = -EPROTO"); return -EPROTO; } /* * Preparse an rxrpc defined key. * * Data should be of the form: * OFFSET LEN CONTENT * 0 4 key interface version number * 4 2 security index (type) * 6 2 ticket length * 8 4 key expiry time (time_t) * 12 4 kvno * 16 8 session key * 24 [len] ticket * * if no data is provided, then a no-security key is made */ static int rxrpc_preparse(struct key_preparsed_payload *prep) { const struct rxrpc_key_data_v1 *v1; struct rxrpc_key_token *token, **pp; time64_t expiry; size_t plen; u32 kver; int ret; _enter("%zu", prep->datalen); /* handle a no-security key */ if (!prep->data && prep->datalen == 0) return 0; /* determine if the XDR payload format is being used */ if (prep->datalen > 7 * 4) { ret = rxrpc_preparse_xdr(prep); if (ret != -EPROTO) return ret; } /* get the key interface version number */ ret = -EINVAL; if (prep->datalen <= 4 || !prep->data) goto error; memcpy(&kver, prep->data, sizeof(kver)); prep->data += sizeof(kver); prep->datalen -= sizeof(kver); _debug("KEY I/F VERSION: %u", kver); ret = -EKEYREJECTED; if (kver != 1) goto error; /* deal with a version 1 key */ ret = -EINVAL; if (prep->datalen < sizeof(*v1)) goto error; v1 = prep->data; if (prep->datalen != sizeof(*v1) + v1->ticket_length) goto error; _debug("SCIX: %u", v1->security_index); _debug("TLEN: %u", v1->ticket_length); _debug("EXPY: %x", v1->expiry); _debug("KVNO: %u", v1->kvno); _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", v1->session_key[0], v1->session_key[1], v1->session_key[2], v1->session_key[3], v1->session_key[4], v1->session_key[5], v1->session_key[6], v1->session_key[7]); if (v1->ticket_length >= 8) _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", v1->ticket[0], v1->ticket[1], v1->ticket[2], v1->ticket[3], v1->ticket[4], v1->ticket[5], v1->ticket[6], v1->ticket[7]); ret = -EPROTONOSUPPORT; if (v1->security_index != RXRPC_SECURITY_RXKAD) goto error; plen = sizeof(*token->kad) + v1->ticket_length; prep->quotalen = plen + sizeof(*token); ret = -ENOMEM; token = kzalloc(sizeof(*token), GFP_KERNEL); if (!token) goto error; token->kad = kzalloc(plen, GFP_KERNEL); if (!token->kad) goto error_free; token->security_index = RXRPC_SECURITY_RXKAD; token->kad->ticket_len = v1->ticket_length; token->kad->expiry = v1->expiry; token->kad->kvno = v1->kvno; memcpy(&token->kad->session_key, &v1->session_key, 8); memcpy(&token->kad->ticket, v1->ticket, v1->ticket_length); /* count the number of tokens attached */ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); /* attach the data */ pp = (struct rxrpc_key_token **)&prep->payload.data[0]; while (*pp) pp = &(*pp)->next; *pp = token; expiry = rxrpc_u32_to_time64(token->kad->expiry); if (expiry < prep->expiry) prep->expiry = expiry; token = NULL; ret = 0; error_free: kfree(token); error: return ret; } /* * Free token list. */ static void rxrpc_free_token_list(struct rxrpc_key_token *token) { struct rxrpc_key_token *next; for (; token; token = next) { next = token->next; switch (token->security_index) { case RXRPC_SECURITY_RXKAD: kfree(token->kad); break; case RXRPC_SECURITY_YFS_RXGK: kfree(token->rxgk->ticket.data); kfree(token->rxgk); break; default: pr_err("Unknown token type %x on rxrpc key\n", token->security_index); BUG(); } kfree(token); } } /* * Clean up preparse data. */ static void rxrpc_free_preparse(struct key_preparsed_payload *prep) { rxrpc_free_token_list(prep->payload.data[0]); } /* * dispose of the data dangling from the corpse of a rxrpc key */ static void rxrpc_destroy(struct key *key) { rxrpc_free_token_list(key->payload.data[0]); } /* * describe the rxrpc key */ static void rxrpc_describe(const struct key *key, struct seq_file *m) { const struct rxrpc_key_token *token; const char *sep = ": "; seq_puts(m, key->description); for (token = key->payload.data[0]; token; token = token->next) { seq_puts(m, sep); switch (token->security_index) { case RXRPC_SECURITY_RXKAD: seq_puts(m, "ka"); break; case RXRPC_SECURITY_YFS_RXGK: seq_puts(m, "ygk"); break; default: /* we have a ticket we can't encode */ seq_printf(m, "%u", token->security_index); break; } sep = " "; } } /* * grab the security key for a socket */ int rxrpc_request_key(struct rxrpc_sock *rx, sockptr_t optval, int optlen) { struct key *key; char *description; _enter(""); if (optlen <= 0 || optlen > PAGE_SIZE - 1 || rx->securities) return -EINVAL; description = memdup_sockptr_nul(optval, optlen); if (IS_ERR(description)) return PTR_ERR(description); key = request_key_net(&key_type_rxrpc, description, sock_net(&rx->sk), NULL); if (IS_ERR(key)) { kfree(description); _leave(" = %ld", PTR_ERR(key)); return PTR_ERR(key); } rx->key = key; kfree(description); _leave(" = 0 [key %x]", key->serial); return 0; } /* * generate a server data key */ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, const void *session_key, time64_t expiry, u32 kvno) { const struct cred *cred = current_cred(); struct key *key; int ret; struct { u32 kver; struct rxrpc_key_data_v1 v1; } data; _enter(""); key = key_alloc(&key_type_rxrpc, "x", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) { _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); return -ENOMEM; } _debug("key %d", key_serial(key)); data.kver = 1; data.v1.security_index = RXRPC_SECURITY_RXKAD; data.v1.ticket_length = 0; data.v1.expiry = rxrpc_time64_to_u32(expiry); data.v1.kvno = 0; memcpy(&data.v1.session_key, session_key, sizeof(data.v1.session_key)); ret = key_instantiate_and_link(key, &data, sizeof(data), NULL, NULL); if (ret < 0) goto error; conn->key = key; _leave(" = 0 [%d]", key_serial(key)); return 0; error: key_revoke(key); key_put(key); _leave(" = -ENOMEM [ins %d]", ret); return -ENOMEM; } EXPORT_SYMBOL(rxrpc_get_server_data_key); /** * rxrpc_get_null_key - Generate a null RxRPC key * @keyname: The name to give the key. * * Generate a null RxRPC key that can be used to indicate anonymous security is * required for a particular domain. * * Return: The new key or a negative error code. */ struct key *rxrpc_get_null_key(const char *keyname) { const struct cred *cred = current_cred(); struct key *key; int ret; key = key_alloc(&key_type_rxrpc, keyname, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) return key; ret = key_instantiate_and_link(key, NULL, 0, NULL, NULL); if (ret < 0) { key_revoke(key); key_put(key); return ERR_PTR(ret); } return key; } EXPORT_SYMBOL(rxrpc_get_null_key); /* * read the contents of an rxrpc key * - this returns the result in XDR form */ static long rxrpc_read(const struct key *key, char *buffer, size_t buflen) { const struct rxrpc_key_token *token; size_t size; __be32 *xdr, *oldxdr; u32 cnlen, toksize, ntoks, tok, zero; u16 toksizes[AFSTOKEN_MAX]; _enter(""); /* we don't know what form we should return non-AFS keys in */ if (memcmp(key->description, "afs@", 4) != 0) return -EOPNOTSUPP; cnlen = strlen(key->description + 4); #define RND(X) (((X) + 3) & ~3) /* AFS keys we return in XDR form, so we need to work out the size of * the XDR */ size = 2 * 4; /* flags, cellname len */ size += RND(cnlen); /* cellname */ size += 1 * 4; /* token count */ ntoks = 0; for (token = key->payload.data[0]; token; token = token->next) { toksize = 4; /* sec index */ switch (token->security_index) { case RXRPC_SECURITY_RXKAD: toksize += 8 * 4; /* viceid, kvno, key*2, begin, * end, primary, tktlen */ if (!token->no_leak_key) toksize += RND(token->kad->ticket_len); break; case RXRPC_SECURITY_YFS_RXGK: toksize += 6 * 8 + 2 * 4; if (!token->no_leak_key) toksize += RND(token->rxgk->key.len); toksize += RND(token->rxgk->ticket.len); break; default: /* we have a ticket we can't encode */ pr_err("Unsupported key token type (%u)\n", token->security_index); return -ENOPKG; } _debug("token[%u]: toksize=%u", ntoks, toksize); if (WARN_ON(toksize > AFSTOKEN_LENGTH_MAX)) return -EIO; toksizes[ntoks++] = toksize; size += toksize + 4; /* each token has a length word */ } #undef RND if (!buffer || buflen < size) return size; xdr = (__be32 *)buffer; zero = 0; #define ENCODE(x) \ do { \ *xdr++ = htonl(x); \ } while(0) #define ENCODE_DATA(l, s) \ do { \ u32 _l = (l); \ ENCODE(l); \ memcpy(xdr, (s), _l); \ if (_l & 3) \ memcpy((u8 *)xdr + _l, &zero, 4 - (_l & 3)); \ xdr += (_l + 3) >> 2; \ } while(0) #define ENCODE_BYTES(l, s) \ do { \ u32 _l = (l); \ memcpy(xdr, (s), _l); \ if (_l & 3) \ memcpy((u8 *)xdr + _l, &zero, 4 - (_l & 3)); \ xdr += (_l + 3) >> 2; \ } while(0) #define ENCODE64(x) \ do { \ __be64 y = cpu_to_be64(x); \ memcpy(xdr, &y, 8); \ xdr += 8 >> 2; \ } while(0) #define ENCODE_STR(s) \ do { \ const char *_s = (s); \ ENCODE_DATA(strlen(_s), _s); \ } while(0) ENCODE(0); /* flags */ ENCODE_DATA(cnlen, key->description + 4); /* cellname */ ENCODE(ntoks); tok = 0; for (token = key->payload.data[0]; token; token = token->next) { toksize = toksizes[tok++]; ENCODE(toksize); oldxdr = xdr; ENCODE(token->security_index); switch (token->security_index) { case RXRPC_SECURITY_RXKAD: ENCODE(token->kad->vice_id); ENCODE(token->kad->kvno); ENCODE_BYTES(8, token->kad->session_key); ENCODE(token->kad->start); ENCODE(token->kad->expiry); ENCODE(token->kad->primary_flag); if (token->no_leak_key) ENCODE(0); else ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); break; case RXRPC_SECURITY_YFS_RXGK: ENCODE64(token->rxgk->begintime); ENCODE64(token->rxgk->endtime); ENCODE64(token->rxgk->level); ENCODE64(token->rxgk->lifetime); ENCODE64(token->rxgk->bytelife); ENCODE64(token->rxgk->enctype); if (token->no_leak_key) ENCODE(0); else ENCODE_DATA(token->rxgk->key.len, token->rxgk->key.data); ENCODE_DATA(token->rxgk->ticket.len, token->rxgk->ticket.data); break; default: pr_err("Unsupported key token type (%u)\n", token->security_index); return -ENOPKG; } if (WARN_ON((unsigned long)xdr - (unsigned long)oldxdr != toksize)) return -EIO; } #undef ENCODE_STR #undef ENCODE_DATA #undef ENCODE64 #undef ENCODE if (WARN_ON(tok != ntoks)) return -EIO; if (WARN_ON((unsigned long)xdr - (unsigned long)buffer != size)) return -EIO; _leave(" = %zu", size); return size; }
236 236 236 23 213 213 213 213 158 42 35 37 39 38 39 42 35 112 10 50 74 67 27 107 23 104 50 74 112 10 66 66 66 66 66 66 66 66 66 66 66 16 232 16 4 236 1 252 1 16 236 235 236 23 213 83 158 116 66 117 117 236 1 74 254 2 251 97 177 214 52 233 19 19 6 61 128 1 60 55 74 46 83 247 4 252 74 74 69 15 251 251 252 252 252 252 183 74 183 183 5 54 261 2 258 195 3 63 185 254 1 2 1 12 12 3 3 12 3 72 1 56 15 73 58 40 9 344 1 27 266 1 73 280 2 279 2 167 77 43 1 1 117 1 350 75 276 2 1 89 1 97 4 9 8 101 13 97 353 348 107 9 238 54 295 311 42 353 353 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 // SPDX-License-Identifier: GPL-2.0 /* * mm/mremap.c * * (C) Copyright 1996 Linus Torvalds * * Address space accounting code <alan@lxorguk.ukuu.org.uk> * (C) Copyright 2002 Red Hat Inc, All Rights Reserved */ #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/shm.h> #include <linux/ksm.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/capability.h> #include <linux/fs.h> #include <linux/leafops.h> #include <linux/highmem.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/mmu_notifier.h> #include <linux/uaccess.h> #include <linux/userfaultfd_k.h> #include <linux/mempolicy.h> #include <linux/pgalloc.h> #include <asm/cacheflush.h> #include <asm/tlb.h> #include "internal.h" /* Classify the kind of remap operation being performed. */ enum mremap_type { MREMAP_INVALID, /* Initial state. */ MREMAP_NO_RESIZE, /* old_len == new_len, if not moved, do nothing. */ MREMAP_SHRINK, /* old_len > new_len. */ MREMAP_EXPAND, /* old_len < new_len. */ }; /* * Describes a VMA mremap() operation and is threaded throughout it. * * Any of the fields may be mutated by the operation, however these values will * always accurately reflect the remap (for instance, we may adjust lengths and * delta to account for hugetlb alignment). */ struct vma_remap_struct { /* User-provided state. */ unsigned long addr; /* User-specified address from which we remap. */ unsigned long old_len; /* Length of range being remapped. */ unsigned long new_len; /* Desired new length of mapping. */ const unsigned long flags; /* user-specified MREMAP_* flags. */ unsigned long new_addr; /* Optionally, desired new address. */ /* uffd state. */ struct vm_userfaultfd_ctx *uf; struct list_head *uf_unmap_early; struct list_head *uf_unmap; /* VMA state, determined in do_mremap(). */ struct vm_area_struct *vma; /* Internal state, determined in do_mremap(). */ unsigned long delta; /* Absolute delta of old_len,new_len. */ bool populate_expand; /* mlock()'d expanded, must populate. */ enum mremap_type remap_type; /* expand, shrink, etc. */ bool mmap_locked; /* Is mm currently write-locked? */ unsigned long charged; /* If VM_ACCOUNT, # pages to account. */ bool vmi_needs_invalidate; /* Is the VMA iterator invalidated? */ }; static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pgd = pgd_offset(mm, addr); if (pgd_none_or_clear_bad(pgd)) return NULL; p4d = p4d_offset(pgd, addr); if (p4d_none_or_clear_bad(p4d)) return NULL; pud = pud_offset(p4d, addr); if (pud_none_or_clear_bad(pud)) return NULL; return pud; } static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) { pud_t *pud; pmd_t *pmd; pud = get_old_pud(mm, addr); if (!pud) return NULL; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return NULL; return pmd; } static pud_t *alloc_new_pud(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; return pud_alloc(mm, p4d, addr); } static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) { pud_t *pud; pmd_t *pmd; pud = alloc_new_pud(mm, addr); if (!pud) return NULL; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return NULL; VM_BUG_ON(pmd_trans_huge(*pmd)); return pmd; } static void take_rmap_locks(struct vm_area_struct *vma) { if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); if (vma->anon_vma) anon_vma_lock_write(vma->anon_vma); } static void drop_rmap_locks(struct vm_area_struct *vma) { if (vma->anon_vma) anon_vma_unlock_write(vma->anon_vma); if (vma->vm_file) i_mmap_unlock_write(vma->vm_file->f_mapping); } static pte_t move_soft_dirty_pte(pte_t pte) { if (pte_none(pte)) return pte; /* * Set soft dirty bit so we can notice * in userspace the ptes were moved. */ if (pgtable_supports_soft_dirty()) { if (pte_present(pte)) pte = pte_mksoft_dirty(pte); else pte = pte_swp_mksoft_dirty(pte); } return pte; } static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int max_nr) { struct folio *folio; if (max_nr == 1) return 1; /* Avoid expensive folio lookup if we stand no chance of benefit. */ if (pte_batch_hint(ptep, pte) == 1) return 1; folio = vm_normal_folio(vma, addr, pte); if (!folio || !folio_test_large(folio)) return 1; return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, FPB_RESPECT_WRITE); } static int move_ptes(struct pagetable_move_control *pmc, unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd) { struct vm_area_struct *vma = pmc->old; bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma); struct mm_struct *mm = vma->vm_mm; pte_t *old_ptep, *new_ptep; pte_t old_pte, pte; pmd_t dummy_pmdval; spinlock_t *old_ptl, *new_ptl; bool force_flush = false; unsigned long old_addr = pmc->old_addr; unsigned long new_addr = pmc->new_addr; unsigned long old_end = old_addr + extent; unsigned long len = old_end - old_addr; int max_nr_ptes; int nr_ptes; int err = 0; /* * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma * locks to ensure that rmap will always observe either the old or the * new ptes. This is the easiest way to avoid races with * truncate_pagecache(), page migration, etc... * * When need_rmap_locks is false, we use other ways to avoid * such races: * * - During exec() shift_arg_pages(), we use a specially tagged vma * which rmap call sites look for using vma_is_temporary_stack(). * * - During mremap(), new_vma is often known to be placed after vma * in rmap traversal order. This ensures rmap will always observe * either the old pte, or the new pte, or both (the page table locks * serialize access to individual ptes, but only rmap traversal * order guarantees that we won't miss both the old and new ptes). */ if (pmc->need_rmap_locks) take_rmap_locks(vma); /* * We don't have to worry about the ordering of src and dst * pte locks because exclusive mmap_lock prevents deadlock. */ old_ptep = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); if (!old_ptep) { err = -EAGAIN; goto out; } /* * Now new_pte is none, so hpage_collapse_scan_file() path can not find * this by traversing file->f_mapping, so there is no concurrency with * retract_page_tables(). In addition, we already hold the exclusive * mmap_lock, so this new_pte page is stable, so there is no need to get * pmdval and do pmd_same() check. */ new_ptep = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval, &new_ptl); if (!new_ptep) { pte_unmap_unlock(old_ptep, old_ptl); err = -EAGAIN; goto out; } if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE, new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) { VM_WARN_ON_ONCE(!pte_none(*new_ptep)); nr_ptes = 1; max_nr_ptes = (old_end - old_addr) >> PAGE_SHIFT; old_pte = ptep_get(old_ptep); if (pte_none(old_pte)) continue; /* * If we are remapping a valid PTE, make sure * to flush TLB before we drop the PTL for the * PTE. * * NOTE! Both old and new PTL matter: the old one * for racing with folio_mkclean(), the new one to * make sure the physical page stays valid until * the TLB entry for the old mapping has been * flushed. */ if (pte_present(old_pte)) { nr_ptes = mremap_folio_pte_batch(vma, old_addr, old_ptep, old_pte, max_nr_ptes); force_flush = true; } pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes); pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) pte_clear(mm, new_addr, new_ptep); else { if (need_clear_uffd_wp) { if (pte_present(pte)) pte = pte_clear_uffd_wp(pte); else pte = pte_swp_clear_uffd_wp(pte); } set_ptes(mm, new_addr, new_ptep, pte, nr_ptes); } } arch_leave_lazy_mmu_mode(); if (force_flush) flush_tlb_range(vma, old_end - len, old_end); if (new_ptl != old_ptl) spin_unlock(new_ptl); pte_unmap(new_ptep - 1); pte_unmap_unlock(old_ptep - 1, old_ptl); out: if (pmc->need_rmap_locks) drop_rmap_locks(vma); return err; } #ifndef arch_supports_page_table_move #define arch_supports_page_table_move arch_supports_page_table_move static inline bool arch_supports_page_table_move(void) { return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) || IS_ENABLED(CONFIG_HAVE_MOVE_PUD); } #endif static inline bool uffd_supports_page_table_move(struct pagetable_move_control *pmc) { /* * If we are moving a VMA that has uffd-wp registered but with * remap events disabled (new VMA will not be registered with uffd), we * need to ensure that the uffd-wp state is cleared from all pgtables. * This means recursing into lower page tables in move_page_tables(). * * We might get called with VMAs reversed when recovering from a * failed page table move. In that case, the * "old"-but-actually-"originally new" VMA during recovery will not have * a uffd context. Recursing into lower page tables during the original * move but not during the recovery move will cause trouble, because we * run into already-existing page tables. So check both VMAs. */ return !vma_has_uffd_without_event_remap(pmc->old) && !vma_has_uffd_without_event_remap(pmc->new); } #ifdef CONFIG_HAVE_MOVE_PMD static bool move_normal_pmd(struct pagetable_move_control *pmc, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; struct vm_area_struct *vma = pmc->old; struct mm_struct *mm = vma->vm_mm; bool res = false; pmd_t pmd; if (!arch_supports_page_table_move()) return false; if (!uffd_supports_page_table_move(pmc)) return false; /* * The destination pmd shouldn't be established, free_pgtables() * should have released it. * * However, there's a case during execve() where we use mremap * to move the initial stack, and in that case the target area * may overlap the source area (always moving down). * * If everything is PMD-aligned, that works fine, as moving * each pmd down will clear the source pmd. But if we first * have a few 4kB-only pages that get moved down, and then * hit the "now the rest is PMD-aligned, let's do everything * one pmd at a time", we will still have the old (now empty * of any 4kB pages, but still there) PMD in the page table * tree. * * Warn on it once - because we really should try to figure * out how to do this better - but then say "I won't move * this pmd". * * One alternative might be to just unmap the target pmd at * this point, and verify that it really is empty. We'll see. */ if (WARN_ON_ONCE(!pmd_none(*new_pmd))) return false; /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. */ old_ptl = pmd_lock(mm, old_pmd); new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = *old_pmd; /* Racing with collapse? */ if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd))) goto out_unlock; /* Clear the pmd */ pmd_clear(old_pmd); res = true; VM_BUG_ON(!pmd_none(*new_pmd)); pmd_populate(mm, new_pmd, pmd_pgtable(pmd)); flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PMD_SIZE); out_unlock: if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); return res; } #else static inline bool move_normal_pmd(struct pagetable_move_control *pmc, pmd_t *old_pmd, pmd_t *new_pmd) { return false; } #endif #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD) static bool move_normal_pud(struct pagetable_move_control *pmc, pud_t *old_pud, pud_t *new_pud) { spinlock_t *old_ptl, *new_ptl; struct vm_area_struct *vma = pmc->old; struct mm_struct *mm = vma->vm_mm; pud_t pud; if (!arch_supports_page_table_move()) return false; if (!uffd_supports_page_table_move(pmc)) return false; /* * The destination pud shouldn't be established, free_pgtables() * should have released it. */ if (WARN_ON_ONCE(!pud_none(*new_pud))) return false; /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. */ old_ptl = pud_lock(mm, old_pud); new_ptl = pud_lockptr(mm, new_pud); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); /* Clear the pud */ pud = *old_pud; pud_clear(old_pud); VM_BUG_ON(!pud_none(*new_pud)); pud_populate(mm, new_pud, pud_pgtable(pud)); flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PUD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); return true; } #else static inline bool move_normal_pud(struct pagetable_move_control *pmc, pud_t *old_pud, pud_t *new_pud) { return false; } #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static bool move_huge_pud(struct pagetable_move_control *pmc, pud_t *old_pud, pud_t *new_pud) { spinlock_t *old_ptl, *new_ptl; struct vm_area_struct *vma = pmc->old; struct mm_struct *mm = vma->vm_mm; pud_t pud; /* * The destination pud shouldn't be established, free_pgtables() * should have released it. */ if (WARN_ON_ONCE(!pud_none(*new_pud))) return false; /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. */ old_ptl = pud_lock(mm, old_pud); new_ptl = pud_lockptr(mm, new_pud); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); /* Clear the pud */ pud = *old_pud; pud_clear(old_pud); VM_BUG_ON(!pud_none(*new_pud)); /* Set the new pud */ /* mark soft_ditry when we add pud level soft dirty support */ set_pud_at(mm, pmc->new_addr, new_pud, pud); flush_pud_tlb_range(vma, pmc->old_addr, pmc->old_addr + HPAGE_PUD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); return true; } #else static bool move_huge_pud(struct pagetable_move_control *pmc, pud_t *old_pud, pud_t *new_pud) { WARN_ON_ONCE(1); return false; } #endif enum pgt_entry { NORMAL_PMD, HPAGE_PMD, NORMAL_PUD, HPAGE_PUD, }; /* * Returns an extent of the corresponding size for the pgt_entry specified if * valid. Else returns a smaller extent bounded by the end of the source and * destination pgt_entry. */ static __always_inline unsigned long get_extent(enum pgt_entry entry, struct pagetable_move_control *pmc) { unsigned long next, extent, mask, size; unsigned long old_addr = pmc->old_addr; unsigned long old_end = pmc->old_end; unsigned long new_addr = pmc->new_addr; switch (entry) { case HPAGE_PMD: case NORMAL_PMD: mask = PMD_MASK; size = PMD_SIZE; break; case HPAGE_PUD: case NORMAL_PUD: mask = PUD_MASK; size = PUD_SIZE; break; default: BUILD_BUG(); break; } next = (old_addr + size) & mask; /* even if next overflowed, extent below will be ok */ extent = next - old_addr; if (extent > old_end - old_addr) extent = old_end - old_addr; next = (new_addr + size) & mask; if (extent > next - new_addr) extent = next - new_addr; return extent; } /* * Should move_pgt_entry() acquire the rmap locks? This is either expressed in * the PMC, or overridden in the case of normal, larger page tables. */ static bool should_take_rmap_locks(struct pagetable_move_control *pmc, enum pgt_entry entry) { switch (entry) { case NORMAL_PMD: case NORMAL_PUD: return true; default: return pmc->need_rmap_locks; } } /* * Attempts to speedup the move by moving entry at the level corresponding to * pgt_entry. Returns true if the move was successful, else false. */ static bool move_pgt_entry(struct pagetable_move_control *pmc, enum pgt_entry entry, void *old_entry, void *new_entry) { bool moved = false; bool need_rmap_locks = should_take_rmap_locks(pmc, entry); /* See comment in move_ptes() */ if (need_rmap_locks) take_rmap_locks(pmc->old); switch (entry) { case NORMAL_PMD: moved = move_normal_pmd(pmc, old_entry, new_entry); break; case NORMAL_PUD: moved = move_normal_pud(pmc, old_entry, new_entry); break; case HPAGE_PMD: moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && move_huge_pmd(pmc->old, pmc->old_addr, pmc->new_addr, old_entry, new_entry); break; case HPAGE_PUD: moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && move_huge_pud(pmc, old_entry, new_entry); break; default: WARN_ON_ONCE(1); break; } if (need_rmap_locks) drop_rmap_locks(pmc->old); return moved; } /* * A helper to check if aligning down is OK. The aligned address should fall * on *no mapping*. For the stack moving down, that's a special move within * the VMA that is created to span the source and destination of the move, * so we make an exception for it. */ static bool can_align_down(struct pagetable_move_control *pmc, struct vm_area_struct *vma, unsigned long addr_to_align, unsigned long mask) { unsigned long addr_masked = addr_to_align & mask; /* * If @addr_to_align of either source or destination is not the beginning * of the corresponding VMA, we can't align down or we will destroy part * of the current mapping. */ if (!pmc->for_stack && vma->vm_start != addr_to_align) return false; /* In the stack case we explicitly permit in-VMA alignment. */ if (pmc->for_stack && addr_masked >= vma->vm_start) return true; /* * Make sure the realignment doesn't cause the address to fall on an * existing mapping. */ return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL; } /* * Determine if are in fact able to realign for efficiency to a higher page * table boundary. */ static bool can_realign_addr(struct pagetable_move_control *pmc, unsigned long pagetable_mask) { unsigned long align_mask = ~pagetable_mask; unsigned long old_align = pmc->old_addr & align_mask; unsigned long new_align = pmc->new_addr & align_mask; unsigned long pagetable_size = align_mask + 1; unsigned long old_align_next = pagetable_size - old_align; /* * We don't want to have to go hunting for VMAs from the end of the old * VMA to the next page table boundary, also we want to make sure the * operation is wortwhile. * * So ensure that we only perform this realignment if the end of the * range being copied reaches or crosses the page table boundary. * * boundary boundary * .<- old_align -> . * . |----------------.-----------| * . | vma . | * . |----------------.-----------| * . <----------------.-----------> * . len_in * <-------------------------------> * . pagetable_size . * . <----------------> * . old_align_next . */ if (pmc->len_in < old_align_next) return false; /* Skip if the addresses are already aligned. */ if (old_align == 0) return false; /* Only realign if the new and old addresses are mutually aligned. */ if (old_align != new_align) return false; /* Ensure realignment doesn't cause overlap with existing mappings. */ if (!can_align_down(pmc, pmc->old, pmc->old_addr, pagetable_mask) || !can_align_down(pmc, pmc->new, pmc->new_addr, pagetable_mask)) return false; return true; } /* * Opportunistically realign to specified boundary for faster copy. * * Consider an mremap() of a VMA with page table boundaries as below, and no * preceding VMAs from the lower page table boundary to the start of the VMA, * with the end of the range reaching or crossing the page table boundary. * * boundary boundary * . |----------------.-----------| * . | vma . | * . |----------------.-----------| * . pmc->old_addr . pmc->old_end * . <----------------------------> * . move these page tables * * If we proceed with moving page tables in this scenario, we will have a lot of * work to do traversing old page tables and establishing new ones in the * destination across multiple lower level page tables. * * The idea here is simply to align pmc->old_addr, pmc->new_addr down to the * page table boundary, so we can simply copy a single page table entry for the * aligned portion of the VMA instead: * * boundary boundary * . |----------------.-----------| * . | vma . | * . |----------------.-----------| * pmc->old_addr . pmc->old_end * <-------------------------------------------> * . move these page tables */ static void try_realign_addr(struct pagetable_move_control *pmc, unsigned long pagetable_mask) { if (!can_realign_addr(pmc, pagetable_mask)) return; /* * Simply align to page table boundaries. Note that we do NOT update the * pmc->old_end value, and since the move_page_tables() operation spans * from [old_addr, old_end) (offsetting new_addr as it is performed), * this simply changes the start of the copy, not the end. */ pmc->old_addr &= pagetable_mask; pmc->new_addr &= pagetable_mask; } /* Is the page table move operation done? */ static bool pmc_done(struct pagetable_move_control *pmc) { return pmc->old_addr >= pmc->old_end; } /* Advance to the next page table, offset by extent bytes. */ static void pmc_next(struct pagetable_move_control *pmc, unsigned long extent) { pmc->old_addr += extent; pmc->new_addr += extent; } /* * Determine how many bytes in the specified input range have had their page * tables moved so far. */ static unsigned long pmc_progress(struct pagetable_move_control *pmc) { unsigned long orig_old_addr = pmc->old_end - pmc->len_in; unsigned long old_addr = pmc->old_addr; /* * Prevent negative return values when {old,new}_addr was realigned but * we broke out of the loop in move_page_tables() for the first PMD * itself. */ return old_addr < orig_old_addr ? 0 : old_addr - orig_old_addr; } unsigned long move_page_tables(struct pagetable_move_control *pmc) { unsigned long extent; struct mmu_notifier_range range; pmd_t *old_pmd, *new_pmd; pud_t *old_pud, *new_pud; struct mm_struct *mm = pmc->old->vm_mm; if (!pmc->len_in) return 0; if (is_vm_hugetlb_page(pmc->old)) return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr, pmc->new_addr, pmc->len_in); /* * If possible, realign addresses to PMD boundary for faster copy. * Only realign if the mremap copying hits a PMD boundary. */ try_realign_addr(pmc, PMD_MASK); flush_cache_range(pmc->old, pmc->old_addr, pmc->old_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, mm, pmc->old_addr, pmc->old_end); mmu_notifier_invalidate_range_start(&range); for (; !pmc_done(pmc); pmc_next(pmc, extent)) { cond_resched(); /* * If extent is PUD-sized try to speed up the move by moving at the * PUD level if possible. */ extent = get_extent(NORMAL_PUD, pmc); old_pud = get_old_pud(mm, pmc->old_addr); if (!old_pud) continue; new_pud = alloc_new_pud(mm, pmc->new_addr); if (!new_pud) break; if (pud_trans_huge(*old_pud)) { if (extent == HPAGE_PUD_SIZE) { move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud); /* We ignore and continue on error? */ continue; } } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) { if (move_pgt_entry(pmc, NORMAL_PUD, old_pud, new_pud)) continue; } extent = get_extent(NORMAL_PMD, pmc); old_pmd = get_old_pmd(mm, pmc->old_addr); if (!old_pmd) continue; new_pmd = alloc_new_pmd(mm, pmc->new_addr); if (!new_pmd) break; again: if (pmd_is_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE && move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd)) continue; split_huge_pmd(pmc->old, old_pmd, pmc->old_addr); } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) && extent == PMD_SIZE) { /* * If the extent is PMD-sized, try to speed the move by * moving at the PMD level if possible. */ if (move_pgt_entry(pmc, NORMAL_PMD, old_pmd, new_pmd)) continue; } if (pmd_none(*old_pmd)) continue; if (pte_alloc(pmc->new->vm_mm, new_pmd)) break; if (move_ptes(pmc, extent, old_pmd, new_pmd) < 0) goto again; } mmu_notifier_invalidate_range_end(&range); return pmc_progress(pmc); } /* Set vrm->delta to the difference in VMA size specified by user. */ static void vrm_set_delta(struct vma_remap_struct *vrm) { vrm->delta = abs_diff(vrm->old_len, vrm->new_len); } /* Determine what kind of remap this is - shrink, expand or no resize at all. */ static enum mremap_type vrm_remap_type(struct vma_remap_struct *vrm) { if (vrm->delta == 0) return MREMAP_NO_RESIZE; if (vrm->old_len > vrm->new_len) return MREMAP_SHRINK; return MREMAP_EXPAND; } /* * When moving a VMA to vrm->new_adr, does this result in the new and old VMAs * overlapping? */ static bool vrm_overlaps(struct vma_remap_struct *vrm) { unsigned long start_old = vrm->addr; unsigned long start_new = vrm->new_addr; unsigned long end_old = vrm->addr + vrm->old_len; unsigned long end_new = vrm->new_addr + vrm->new_len; /* * start_old end_old * |-----------| * | | * |-----------| * |-------------| * | | * |-------------| * start_new end_new */ if (end_old > start_new && end_new > start_old) return true; return false; } /* * Will a new address definitely be assigned? This either if the user specifies * it via MREMAP_FIXED, or if MREMAP_DONTUNMAP is used, indicating we will * always detemrine a target address. */ static bool vrm_implies_new_addr(struct vma_remap_struct *vrm) { return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP); } /* * Find an unmapped area for the requested vrm->new_addr. * * If MREMAP_FIXED then this is equivalent to a MAP_FIXED mmap() call. If only * MREMAP_DONTUNMAP is set, then this is equivalent to providing a hint to * mmap(), otherwise this is equivalent to mmap() specifying a NULL address. * * Returns 0 on success (with vrm->new_addr updated), or an error code upon * failure. */ static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm) { struct vm_area_struct *vma = vrm->vma; unsigned long map_flags = 0; /* Page Offset _into_ the VMA. */ pgoff_t internal_pgoff = (vrm->addr - vma->vm_start) >> PAGE_SHIFT; pgoff_t pgoff = vma->vm_pgoff + internal_pgoff; unsigned long new_addr = vrm_implies_new_addr(vrm) ? vrm->new_addr : 0; unsigned long res; if (vrm->flags & MREMAP_FIXED) map_flags |= MAP_FIXED; if (vma->vm_flags & VM_MAYSHARE) map_flags |= MAP_SHARED; res = get_unmapped_area(vma->vm_file, new_addr, vrm->new_len, pgoff, map_flags); if (IS_ERR_VALUE(res)) return res; vrm->new_addr = res; return 0; } /* * Keep track of pages which have been added to the memory mapping. If the VMA * is accounted, also check to see if there is sufficient memory. * * Returns true on success, false if insufficient memory to charge. */ static bool vrm_calc_charge(struct vma_remap_struct *vrm) { unsigned long charged; if (!(vrm->vma->vm_flags & VM_ACCOUNT)) return true; /* * If we don't unmap the old mapping, then we account the entirety of * the length of the new one. Otherwise it's just the delta in size. */ if (vrm->flags & MREMAP_DONTUNMAP) charged = vrm->new_len >> PAGE_SHIFT; else charged = vrm->delta >> PAGE_SHIFT; /* This accounts 'charged' pages of memory. */ if (security_vm_enough_memory_mm(current->mm, charged)) return false; vrm->charged = charged; return true; } /* * an error has occurred so we will not be using vrm->charged memory. Unaccount * this memory if the VMA is accounted. */ static void vrm_uncharge(struct vma_remap_struct *vrm) { if (!(vrm->vma->vm_flags & VM_ACCOUNT)) return; vm_unacct_memory(vrm->charged); vrm->charged = 0; } /* * Update mm exec_vm, stack_vm, data_vm, and locked_vm fields as needed to * account for 'bytes' memory used, and if locked, indicate this in the VRM so * we can handle this correctly later. */ static void vrm_stat_account(struct vma_remap_struct *vrm, unsigned long bytes) { unsigned long pages = bytes >> PAGE_SHIFT; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = vrm->vma; vm_stat_account(mm, vma->vm_flags, pages); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += pages; } /* * Perform checks before attempting to write a VMA prior to it being * moved. */ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) { unsigned long err = 0; struct vm_area_struct *vma = vrm->vma; unsigned long old_addr = vrm->addr; unsigned long old_len = vrm->old_len; vm_flags_t dummy = vma->vm_flags; /* * We'd prefer to avoid failure later on in do_munmap: * which may split one vma into three before unmapping. */ if (current->mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; if (vma->vm_ops && vma->vm_ops->may_split) { if (vma->vm_start != old_addr) err = vma->vm_ops->may_split(vma, old_addr); if (!err && vma->vm_end != old_addr + old_len) err = vma->vm_ops->may_split(vma, old_addr + old_len); if (err) return err; } /* * Advise KSM to break any KSM pages in the area to be moved: * it would be confusing if they were to turn up at the new * location, where they happen to coincide with different KSM * pages recently unmapped. But leave vma->vm_flags as it was, * so KSM can come around to merge on vma and new_vma afterwards. */ err = ksm_madvise(vma, old_addr, old_addr + old_len, MADV_UNMERGEABLE, &dummy); if (err) return err; return 0; } /* * Unmap source VMA for VMA move, turning it from a copy to a move, being * careful to ensure we do not underflow memory account while doing so if an * accountable move. * * This is best effort, if we fail to unmap then we simply try to correct * accounting and exit. */ static void unmap_source_vma(struct vma_remap_struct *vrm) { struct mm_struct *mm = current->mm; unsigned long addr = vrm->addr; unsigned long len = vrm->old_len; struct vm_area_struct *vma = vrm->vma; VMA_ITERATOR(vmi, mm, addr); int err; unsigned long vm_start; unsigned long vm_end; /* * It might seem odd that we check for MREMAP_DONTUNMAP here, given this * function implies that we unmap the original VMA, which seems * contradictory. * * However, this occurs when this operation was attempted and an error * arose, in which case we _do_ wish to unmap the _new_ VMA, which means * we actually _do_ want it be unaccounted. */ bool accountable_move = (vma->vm_flags & VM_ACCOUNT) && !(vrm->flags & MREMAP_DONTUNMAP); /* * So we perform a trick here to prevent incorrect accounting. Any merge * or new VMA allocation performed in copy_vma() does not adjust * accounting, it is expected that callers handle this. * * And indeed we already have, accounting appropriately in the case of * both in vrm_charge(). * * However, when we unmap the existing VMA (to effect the move), this * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount * removed pages. * * To avoid this we temporarily clear this flag, reinstating on any * portions of the original VMA that remain. */ if (accountable_move) { vm_flags_clear(vma, VM_ACCOUNT); /* We are about to split vma, so store the start/end. */ vm_start = vma->vm_start; vm_end = vma->vm_end; } err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false); vrm->vma = NULL; /* Invalidated. */ vrm->vmi_needs_invalidate = true; if (err) { /* OOM: unable to split vma, just get accounts right */ vm_acct_memory(len >> PAGE_SHIFT); return; } /* * If we mremap() from a VMA like this: * * addr end * | | * v v * |-------------| * | | * |-------------| * * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above * we'll end up with: * * addr end * | | * v v * |---| |---| * | A | | B | * |---| |---| * * The VMI is still pointing at addr, so vma_prev() will give us A, and * a subsequent or lone vma_next() will give as B. * * do_vmi_munmap() will have restored the VMI back to addr. */ if (accountable_move) { unsigned long end = addr + len; if (vm_start < addr) { struct vm_area_struct *prev = vma_prev(&vmi); vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */ } if (vm_end > end) { struct vm_area_struct *next = vma_next(&vmi); vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */ } } } /* * Copy vrm->vma over to vrm->new_addr possibly adjusting size as part of the * process. Additionally handle an error occurring on moving of page tables, * where we reset vrm state to cause unmapping of the new VMA. * * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an * error code. */ static int copy_vma_and_data(struct vma_remap_struct *vrm, struct vm_area_struct **new_vma_ptr) { unsigned long internal_offset = vrm->addr - vrm->vma->vm_start; unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT; unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff; unsigned long moved_len; struct vm_area_struct *vma = vrm->vma; struct vm_area_struct *new_vma; int err = 0; PAGETABLE_MOVE(pmc, NULL, NULL, vrm->addr, vrm->new_addr, vrm->old_len); new_vma = copy_vma(&vma, vrm->new_addr, vrm->new_len, new_pgoff, &pmc.need_rmap_locks); if (!new_vma) { vrm_uncharge(vrm); *new_vma_ptr = NULL; return -ENOMEM; } /* By merging, we may have invalidated any iterator in use. */ if (vma != vrm->vma) vrm->vmi_needs_invalidate = true; vrm->vma = vma; pmc.old = vma; pmc.new = new_vma; moved_len = move_page_tables(&pmc); if (moved_len < vrm->old_len) err = -ENOMEM; else if (vma->vm_ops && vma->vm_ops->mremap) err = vma->vm_ops->mremap(new_vma); if (unlikely(err)) { PAGETABLE_MOVE(pmc_revert, new_vma, vma, vrm->new_addr, vrm->addr, moved_len); /* * On error, move entries back from new area to old, * which will succeed since page tables still there, * and then proceed to unmap new area instead of old. */ pmc_revert.need_rmap_locks = true; move_page_tables(&pmc_revert); vrm->vma = new_vma; vrm->old_len = vrm->new_len; vrm->addr = vrm->new_addr; } else { mremap_userfaultfd_prep(new_vma, vrm->uf); } fixup_hugetlb_reservations(vma); *new_vma_ptr = new_vma; return err; } /* * Perform final tasks for MADV_DONTUNMAP operation, clearing mlock() flag on * remaining VMA by convention (it cannot be mlock()'d any longer, as pages in * range are no longer mapped), and removing anon_vma_chain links from it if the * entire VMA was copied over. */ static void dontunmap_complete(struct vma_remap_struct *vrm, struct vm_area_struct *new_vma) { unsigned long start = vrm->addr; unsigned long end = vrm->addr + vrm->old_len; unsigned long old_start = vrm->vma->vm_start; unsigned long old_end = vrm->vma->vm_end; /* We always clear VM_LOCKED[ONFAULT] on the old VMA. */ vm_flags_clear(vrm->vma, VM_LOCKED_MASK); /* * anon_vma links of the old vma is no longer needed after its page * table has been moved. */ if (new_vma != vrm->vma && start == old_start && end == old_end) unlink_anon_vmas(vrm->vma); /* Because we won't unmap we don't need to touch locked_vm. */ } static unsigned long move_vma(struct vma_remap_struct *vrm) { struct mm_struct *mm = current->mm; struct vm_area_struct *new_vma; unsigned long hiwater_vm; int err; err = prep_move_vma(vrm); if (err) return err; /* * If accounted, determine the number of bytes the operation will * charge. */ if (!vrm_calc_charge(vrm)) return -ENOMEM; /* We don't want racing faults. */ vma_start_write(vrm->vma); /* Perform copy step. */ err = copy_vma_and_data(vrm, &new_vma); /* * If we established the copied-to VMA, we attempt to recover from the * error by setting the destination VMA to the source VMA and unmapping * it below. */ if (err && !new_vma) return err; /* * If we failed to move page tables we still do total_vm increment * since do_munmap() will decrement it by old_len == new_len. * * Since total_vm is about to be raised artificially high for a * moment, we need to restore high watermark afterwards: if stats * are taken meanwhile, total_vm and hiwater_vm appear too high. * If this were a serious issue, we'd add a flag to do_munmap(). */ hiwater_vm = mm->hiwater_vm; vrm_stat_account(vrm, vrm->new_len); if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP))) dontunmap_complete(vrm, new_vma); else unmap_source_vma(vrm); mm->hiwater_vm = hiwater_vm; return err ? (unsigned long)err : vrm->new_addr; } /* * The user has requested that the VMA be shrunk (i.e., old_len > new_len), so * execute this, optionally dropping the mmap lock when we do so. * * In both cases this invalidates the VMA, however if we don't drop the lock, * then load the correct VMA into vrm->vma afterwards. */ static unsigned long shrink_vma(struct vma_remap_struct *vrm, bool drop_lock) { struct mm_struct *mm = current->mm; unsigned long unmap_start = vrm->addr + vrm->new_len; unsigned long unmap_bytes = vrm->delta; unsigned long res; VMA_ITERATOR(vmi, mm, unmap_start); VM_BUG_ON(vrm->remap_type != MREMAP_SHRINK); res = do_vmi_munmap(&vmi, mm, unmap_start, unmap_bytes, vrm->uf_unmap, drop_lock); vrm->vma = NULL; /* Invalidated. */ if (res) return res; /* * If we've not dropped the lock, then we should reload the VMA to * replace the invalidated VMA with the one that may have now been * split. */ if (drop_lock) { vrm->mmap_locked = false; } else { vrm->vma = vma_lookup(mm, vrm->addr); if (!vrm->vma) return -EFAULT; } return 0; } /* * mremap_to() - remap a vma to a new location. * Returns: The new address of the vma or an error. */ static unsigned long mremap_to(struct vma_remap_struct *vrm) { struct mm_struct *mm = current->mm; unsigned long err; if (vrm->flags & MREMAP_FIXED) { /* * In mremap_to(). * VMA is moved to dst address, and munmap dst first. * do_munmap will check if dst is sealed. */ err = do_munmap(mm, vrm->new_addr, vrm->new_len, vrm->uf_unmap_early); vrm->vma = NULL; /* Invalidated. */ vrm->vmi_needs_invalidate = true; if (err) return err; /* * If we remap a portion of a VMA elsewhere in the same VMA, * this can invalidate the old VMA. Reset. */ vrm->vma = vma_lookup(mm, vrm->addr); if (!vrm->vma) return -EFAULT; } if (vrm->remap_type == MREMAP_SHRINK) { err = shrink_vma(vrm, /* drop_lock= */false); if (err) return err; /* Set up for the move now shrink has been executed. */ vrm->old_len = vrm->new_len; } /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ if (vrm->flags & MREMAP_DONTUNMAP) { vm_flags_t vm_flags = vrm->vma->vm_flags; unsigned long pages = vrm->old_len >> PAGE_SHIFT; if (!may_expand_vm(mm, vm_flags, pages)) return -ENOMEM; } err = vrm_set_new_addr(vrm); if (err) return err; return move_vma(vrm); } static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) { unsigned long end = vma->vm_end + delta; if (end < vma->vm_end) /* overflow */ return 0; if (find_vma_intersection(vma->vm_mm, vma->vm_end, end)) return 0; if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, 0, MAP_FIXED) & ~PAGE_MASK) return 0; return 1; } /* Determine whether we are actually able to execute an in-place expansion. */ static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm) { /* Number of bytes from vrm->addr to end of VMA. */ unsigned long suffix_bytes = vrm->vma->vm_end - vrm->addr; /* If end of range aligns to end of VMA, we can just expand in-place. */ if (suffix_bytes != vrm->old_len) return false; /* Check whether this is feasible. */ if (!vma_expandable(vrm->vma, vrm->delta)) return false; return true; } /* * We know we can expand the VMA in-place by delta pages, so do so. * * If we discover the VMA is locked, update mm_struct statistics accordingly and * indicate so to the caller. */ static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = vrm->vma; VMA_ITERATOR(vmi, mm, vma->vm_end); if (!vrm_calc_charge(vrm)) return -ENOMEM; /* * Function vma_merge_extend() is called on the * extension we are adding to the already existing vma, * vma_merge_extend() will merge this extension with the * already existing vma (expand operation itself) and * possibly also with the next vma if it becomes * adjacent to the expanded vma and otherwise * compatible. */ vma = vma_merge_extend(&vmi, vma, vrm->delta); if (!vma) { vrm_uncharge(vrm); return -ENOMEM; } vrm->vma = vma; vrm_stat_account(vrm, vrm->delta); return 0; } static bool align_hugetlb(struct vma_remap_struct *vrm) { struct hstate *h __maybe_unused = hstate_vma(vrm->vma); vrm->old_len = ALIGN(vrm->old_len, huge_page_size(h)); vrm->new_len = ALIGN(vrm->new_len, huge_page_size(h)); /* addrs must be huge page aligned */ if (vrm->addr & ~huge_page_mask(h)) return false; if (vrm->new_addr & ~huge_page_mask(h)) return false; /* * Don't allow remap expansion, because the underlying hugetlb * reservation is not yet capable to handle split reservation. */ if (vrm->new_len > vrm->old_len) return false; return true; } /* * We are mremap()'ing without specifying a fixed address to move to, but are * requesting that the VMA's size be increased. * * Try to do so in-place, if this fails, then move the VMA to a new location to * action the change. */ static unsigned long expand_vma(struct vma_remap_struct *vrm) { unsigned long err; /* * [addr, old_len) spans precisely to the end of the VMA, so try to * expand it in-place. */ if (vrm_can_expand_in_place(vrm)) { err = expand_vma_in_place(vrm); if (err) return err; /* OK we're done! */ return vrm->addr; } /* * We weren't able to just expand or shrink the area, * we need to create a new one and move it. */ /* We're not allowed to move the VMA, so error out. */ if (!(vrm->flags & MREMAP_MAYMOVE)) return -ENOMEM; /* Find a new location to move the VMA to. */ err = vrm_set_new_addr(vrm); if (err) return err; return move_vma(vrm); } /* * Attempt to resize the VMA in-place, if we cannot, then move the VMA to the * first available address to perform the operation. */ static unsigned long mremap_at(struct vma_remap_struct *vrm) { unsigned long res; switch (vrm->remap_type) { case MREMAP_INVALID: break; case MREMAP_NO_RESIZE: /* NO-OP CASE - resizing to the same size. */ return vrm->addr; case MREMAP_SHRINK: /* * SHRINK CASE. Can always be done in-place. * * Simply unmap the shrunken portion of the VMA. This does all * the needed commit accounting, and we indicate that the mmap * lock should be dropped. */ res = shrink_vma(vrm, /* drop_lock= */true); if (res) return res; return vrm->addr; case MREMAP_EXPAND: return expand_vma(vrm); } /* Should not be possible. */ WARN_ON_ONCE(1); return -EINVAL; } /* * Will this operation result in the VMA being expanded or moved and thus need * to map a new portion of virtual address space? */ static bool vrm_will_map_new(struct vma_remap_struct *vrm) { if (vrm->remap_type == MREMAP_EXPAND) return true; if (vrm_implies_new_addr(vrm)) return true; return false; } /* Does this remap ONLY move mappings? */ static bool vrm_move_only(struct vma_remap_struct *vrm) { if (!(vrm->flags & MREMAP_FIXED)) return false; if (vrm->old_len != vrm->new_len) return false; return true; } static void notify_uffd(struct vma_remap_struct *vrm, bool failed) { struct mm_struct *mm = current->mm; /* Regardless of success/failure, we always notify of any unmaps. */ userfaultfd_unmap_complete(mm, vrm->uf_unmap_early); if (failed) mremap_userfaultfd_fail(vrm->uf); else mremap_userfaultfd_complete(vrm->uf, vrm->addr, vrm->new_addr, vrm->old_len); userfaultfd_unmap_complete(mm, vrm->uf_unmap); } static bool vma_multi_allowed(struct vm_area_struct *vma) { struct file *file = vma->vm_file; /* * We can't support moving multiple uffd VMAs as notify requires * mmap lock to be dropped. */ if (userfaultfd_armed(vma)) return false; /* * Custom get unmapped area might result in MREMAP_FIXED not * being obeyed. */ if (!file || !file->f_op->get_unmapped_area) return true; /* Known good. */ if (vma_is_shmem(vma)) return true; if (is_vm_hugetlb_page(vma)) return true; if (file->f_op->get_unmapped_area == thp_get_unmapped_area) return true; return false; } static int check_prep_vma(struct vma_remap_struct *vrm) { struct vm_area_struct *vma = vrm->vma; struct mm_struct *mm = current->mm; unsigned long addr = vrm->addr; unsigned long old_len, new_len, pgoff; if (!vma) return -EFAULT; /* If mseal()'d, mremap() is prohibited. */ if (vma_is_sealed(vma)) return -EPERM; /* Align to hugetlb page size, if required. */ if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) return -EINVAL; vrm_set_delta(vrm); vrm->remap_type = vrm_remap_type(vrm); /* For convenience, we set new_addr even if VMA won't move. */ if (!vrm_implies_new_addr(vrm)) vrm->new_addr = addr; /* Below only meaningful if we expand or move a VMA. */ if (!vrm_will_map_new(vrm)) return 0; old_len = vrm->old_len; new_len = vrm->new_len; /* * !old_len is a special case where an attempt is made to 'duplicate' * a mapping. This makes no sense for private mappings as it will * instead create a fresh/new mapping unrelated to the original. This * is contrary to the basic idea of mremap which creates new mappings * based on the original. There are no known use cases for this * behavior. As a result, fail such attempts. */ if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid); return -EINVAL; } if ((vrm->flags & MREMAP_DONTUNMAP) && (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) return -EINVAL; /* * We permit crossing of boundaries for the range being unmapped due to * a shrink. */ if (vrm->remap_type == MREMAP_SHRINK) old_len = new_len; /* * We can't remap across the end of VMAs, as another VMA may be * adjacent: * * addr vma->vm_end * |-----.----------| * | . | * |-----.----------| * .<--------->xxx> * old_len * * We also require that vma->vm_start <= addr < vma->vm_end. */ if (old_len > vma->vm_end - addr) return -EFAULT; if (new_len == old_len) return 0; /* We are expanding and the VMA is mlock()'d so we need to populate. */ if (vma->vm_flags & VM_LOCKED) vrm->populate_expand = true; /* Need to be careful about a growing mapping */ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) return -EINVAL; if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) return -EFAULT; if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta)) return -EAGAIN; if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT)) return -ENOMEM; return 0; } /* * Are the parameters passed to mremap() valid? If so return 0, otherwise return * error. */ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) { unsigned long addr = vrm->addr; unsigned long flags = vrm->flags; /* Ensure no unexpected flag values. */ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) return -EINVAL; /* Start address must be page-aligned. */ if (offset_in_page(addr)) return -EINVAL; /* * We allow a zero old-len as a special case * for DOS-emu "duplicate shm area" thing. But * a zero new-len is nonsensical. */ if (!vrm->new_len) return -EINVAL; /* Is the new length silly? */ if (vrm->new_len > TASK_SIZE) return -EINVAL; /* Remainder of checks are for cases with specific new_addr. */ if (!vrm_implies_new_addr(vrm)) return 0; /* Is the new address silly? */ if (vrm->new_addr > TASK_SIZE - vrm->new_len) return -EINVAL; /* The new address must be page-aligned. */ if (offset_in_page(vrm->new_addr)) return -EINVAL; /* A fixed address implies a move. */ if (!(flags & MREMAP_MAYMOVE)) return -EINVAL; /* MREMAP_DONTUNMAP does not allow resizing in the process. */ if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len) return -EINVAL; /* Target VMA must not overlap source VMA. */ if (vrm_overlaps(vrm)) return -EINVAL; /* * move_vma() need us to stay 4 maps below the threshold, otherwise * it will bail out at the very beginning. * That is a problem if we have already unmaped the regions here * (new_addr, and old_addr), because userspace will not know the * state of the vma's after it gets -ENOMEM. * So, to avoid such scenario we can pre-compute if the whole * operation has high chances to success map-wise. * Worst-scenario case is when both vma's (new_addr and old_addr) get * split in 3 before unmapping it. * That means 2 more maps (1 for each) to the ones we already hold. * Check whether current map count plus 2 still leads us to 4 maps below * the threshold, otherwise return -ENOMEM here to be more safe. */ if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) return -ENOMEM; return 0; } static unsigned long remap_move(struct vma_remap_struct *vrm) { struct vm_area_struct *vma; unsigned long start = vrm->addr; unsigned long end = vrm->addr + vrm->old_len; unsigned long new_addr = vrm->new_addr; unsigned long target_addr = new_addr; unsigned long res = -EFAULT; unsigned long last_end; bool seen_vma = false; VMA_ITERATOR(vmi, current->mm, start); /* * When moving VMAs we allow for batched moves across multiple VMAs, * with all VMAs in the input range [addr, addr + old_len) being moved * (and split as necessary). */ for_each_vma_range(vmi, vma, end) { /* Account for start, end not aligned with VMA start, end. */ unsigned long addr = max(vma->vm_start, start); unsigned long len = min(end, vma->vm_end) - addr; unsigned long offset, res_vma; bool multi_allowed; /* No gap permitted at the start of the range. */ if (!seen_vma && start < vma->vm_start) return -EFAULT; /* * To sensibly move multiple VMAs, accounting for the fact that * get_unmapped_area() may align even MAP_FIXED moves, we simply * attempt to move such that the gaps between source VMAs remain * consistent in destination VMAs, e.g.: * * X Y X Y * <---> <-> <---> <-> * |-------| |-----| |-----| |-------| |-----| |-----| * | A | | B | | C | ---> | A' | | B' | | C' | * |-------| |-----| |-----| |-------| |-----| |-----| * new_addr * * So we map B' at A'->vm_end + X, and C' at B'->vm_end + Y. */ offset = seen_vma ? vma->vm_start - last_end : 0; last_end = vma->vm_end; vrm->vma = vma; vrm->addr = addr; vrm->new_addr = target_addr + offset; vrm->old_len = vrm->new_len = len; multi_allowed = vma_multi_allowed(vma); if (!multi_allowed) { /* This is not the first VMA, abort immediately. */ if (seen_vma) return -EFAULT; /* This is the first, but there are more, abort. */ if (vma->vm_end < end) return -EFAULT; } res_vma = check_prep_vma(vrm); if (!res_vma) res_vma = mremap_to(vrm); if (IS_ERR_VALUE(res_vma)) return res_vma; if (!seen_vma) { VM_WARN_ON_ONCE(multi_allowed && res_vma != new_addr); res = res_vma; } /* mmap lock is only dropped on shrink. */ VM_WARN_ON_ONCE(!vrm->mmap_locked); /* This is a move, no expand should occur. */ VM_WARN_ON_ONCE(vrm->populate_expand); if (vrm->vmi_needs_invalidate) { vma_iter_invalidate(&vmi); vrm->vmi_needs_invalidate = false; } seen_vma = true; target_addr = res_vma + vrm->new_len; } return res; } static unsigned long do_mremap(struct vma_remap_struct *vrm) { struct mm_struct *mm = current->mm; unsigned long res; bool failed; vrm->old_len = PAGE_ALIGN(vrm->old_len); vrm->new_len = PAGE_ALIGN(vrm->new_len); res = check_mremap_params(vrm); if (res) return res; if (mmap_write_lock_killable(mm)) return -EINTR; vrm->mmap_locked = true; if (vrm_move_only(vrm)) { res = remap_move(vrm); } else { vrm->vma = vma_lookup(current->mm, vrm->addr); res = check_prep_vma(vrm); if (res) goto out; /* Actually execute mremap. */ res = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm); } out: failed = IS_ERR_VALUE(res); if (vrm->mmap_locked) mmap_write_unlock(mm); /* VMA mlock'd + was expanded, so populated expanded region. */ if (!failed && vrm->populate_expand) mm_populate(vrm->new_addr + vrm->old_len, vrm->delta); notify_uffd(vrm, failed); return res; } /* * Expand (or shrink) an existing mapping, potentially moving it at the * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) * * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise * This option implies MREMAP_MAYMOVE. */ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long, new_len, unsigned long, flags, unsigned long, new_addr) { struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); /* * There is a deliberate asymmetry here: we strip the pointer tag * from the old address but leave the new address alone. This is * for consistency with mmap(), where we prevent the creation of * aliasing mappings in userspace by leaving the tag bits of the * mapping address intact. A non-zero tag will cause the subsequent * range checks to reject the address as invalid. * * See Documentation/arch/arm64/tagged-address-abi.rst for more * information. */ struct vma_remap_struct vrm = { .addr = untagged_addr(addr), .old_len = old_len, .new_len = new_len, .flags = flags, .new_addr = new_addr, .uf = &uf, .uf_unmap_early = &uf_unmap_early, .uf_unmap = &uf_unmap, .remap_type = MREMAP_INVALID, /* We set later. */ }; return do_mremap(&vrm); }
558 558 558 1 558 558 1 8 245 446 447 447 447 228 228 228 227 461 460 462 441 4 1 462 462 4 462 4 257 257 129 5 255 257 129 215 58 1387 1388 1388 228 228 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 // SPDX-License-Identifier: GPL-2.0-only #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/list_bl.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/mbcache.h> /* * Mbcache is a simple key-value store. Keys need not be unique, however * key-value pairs are expected to be unique (we use this fact in * mb_cache_entry_delete_or_get()). * * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. * Ext4 also uses it for deduplication of xattr values stored in inodes. * They use hash of data as a key and provide a value that may represent a * block or inode number. That's why keys need not be unique (hash of different * data may be the same). However user provided value always uniquely * identifies a cache entry. * * We provide functions for creation and removal of entries, search by key, * and a special "delete entry with given key-value pair" operation. Fixed * size hash table is used for fast key lookups. */ struct mb_cache { /* Hash table of entries */ struct hlist_bl_head *c_hash; /* log2 of hash table size */ int c_bucket_bits; /* Maximum entries in cache to avoid degrading hash too much */ unsigned long c_max_entries; /* Protects c_list, c_entry_count */ spinlock_t c_list_lock; struct list_head c_list; /* Number of entries in cache */ unsigned long c_entry_count; struct shrinker *c_shrink; /* Work for shrinking when the cache has too many entries */ struct work_struct c_shrink_work; }; static struct kmem_cache *mb_entry_cache; static unsigned long mb_cache_shrink(struct mb_cache *cache, unsigned long nr_to_scan); static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, u32 key) { return &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; } /* * Number of entries to reclaim synchronously when there are too many entries * in cache */ #define SYNC_SHRINK_BATCH 64 /* * mb_cache_entry_create - create entry in cache * @cache - cache where the entry should be created * @mask - gfp mask with which the entry should be allocated * @key - key of the entry * @value - value of the entry * @reusable - is the entry reusable by others? * * Creates entry in @cache with key @key and value @value. The function returns * -EBUSY if entry with the same key and value already exists in cache. * Otherwise 0 is returned. */ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, u64 value, bool reusable) { struct mb_cache_entry *entry, *dup; struct hlist_bl_node *dup_node; struct hlist_bl_head *head; /* Schedule background reclaim if there are too many entries */ if (cache->c_entry_count >= cache->c_max_entries) schedule_work(&cache->c_shrink_work); /* Do some sync reclaim if background reclaim cannot keep up */ if (cache->c_entry_count >= 2*cache->c_max_entries) mb_cache_shrink(cache, SYNC_SHRINK_BATCH); entry = kmem_cache_alloc(mb_entry_cache, mask); if (!entry) return -ENOMEM; INIT_LIST_HEAD(&entry->e_list); /* * We create entry with two references. One reference is kept by the * hash table, the other reference is used to protect us from * mb_cache_entry_delete_or_get() until the entry is fully setup. This * avoids nesting of cache->c_list_lock into hash table bit locks which * is problematic for RT. */ atomic_set(&entry->e_refcnt, 2); entry->e_key = key; entry->e_value = value; entry->e_flags = 0; if (reusable) set_bit(MBE_REUSABLE_B, &entry->e_flags); head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { if (dup->e_key == key && dup->e_value == value) { hlist_bl_unlock(head); kmem_cache_free(mb_entry_cache, entry); return -EBUSY; } } hlist_bl_add_head(&entry->e_hash_list, head); hlist_bl_unlock(head); spin_lock(&cache->c_list_lock); list_add_tail(&entry->e_list, &cache->c_list); cache->c_entry_count++; spin_unlock(&cache->c_list_lock); mb_cache_entry_put(cache, entry); return 0; } EXPORT_SYMBOL(mb_cache_entry_create); void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry) { struct hlist_bl_head *head; head = mb_cache_entry_head(cache, entry->e_key); hlist_bl_lock(head); hlist_bl_del(&entry->e_hash_list); hlist_bl_unlock(head); kmem_cache_free(mb_entry_cache, entry); } EXPORT_SYMBOL(__mb_cache_entry_free); /* * mb_cache_entry_wait_unused - wait to be the last user of the entry * * @entry - entry to work on * * Wait to be the last user of the entry. */ void mb_cache_entry_wait_unused(struct mb_cache_entry *entry) { wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 2); } EXPORT_SYMBOL(mb_cache_entry_wait_unused); static struct mb_cache_entry *__entry_find(struct mb_cache *cache, struct mb_cache_entry *entry, u32 key) { struct mb_cache_entry *old_entry = entry; struct hlist_bl_node *node; struct hlist_bl_head *head; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) node = entry->e_hash_list.next; else node = hlist_bl_first(head); while (node) { entry = hlist_bl_entry(node, struct mb_cache_entry, e_hash_list); if (entry->e_key == key && test_bit(MBE_REUSABLE_B, &entry->e_flags) && atomic_inc_not_zero(&entry->e_refcnt)) goto out; node = node->next; } entry = NULL; out: hlist_bl_unlock(head); if (old_entry) mb_cache_entry_put(cache, old_entry); return entry; } /* * mb_cache_entry_find_first - find the first reusable entry with the given key * @cache: cache where we should search * @key: key to look for * * Search in @cache for a reusable entry with key @key. Grabs reference to the * first reusable entry found and returns the entry. */ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key) { return __entry_find(cache, NULL, key); } EXPORT_SYMBOL(mb_cache_entry_find_first); /* * mb_cache_entry_find_next - find next reusable entry with the same key * @cache: cache where we should search * @entry: entry to start search from * * Finds next reusable entry in the hash chain which has the same key as @entry. * If @entry is unhashed (which can happen when deletion of entry races with the * search), finds the first reusable entry in the hash chain. The function drops * reference to @entry and returns with a reference to the found entry. */ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, struct mb_cache_entry *entry) { return __entry_find(cache, entry, entry->e_key); } EXPORT_SYMBOL(mb_cache_entry_find_next); /* * mb_cache_entry_get - get a cache entry by value (and key) * @cache - cache we work with * @key - key * @value - value */ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, u64 value) { struct hlist_bl_node *node; struct hlist_bl_head *head; struct mb_cache_entry *entry; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(entry, node, head, e_hash_list) { if (entry->e_key == key && entry->e_value == value && atomic_inc_not_zero(&entry->e_refcnt)) goto out; } entry = NULL; out: hlist_bl_unlock(head); return entry; } EXPORT_SYMBOL(mb_cache_entry_get); /* mb_cache_entry_delete_or_get - remove a cache entry if it has no users * @cache - cache we work with * @key - key * @value - value * * Remove entry from cache @cache with key @key and value @value. The removal * happens only if the entry is unused. The function returns NULL in case the * entry was successfully removed or there's no entry in cache. Otherwise the * function grabs reference of the entry that we failed to delete because it * still has users and return it. */ struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache, u32 key, u64 value) { struct mb_cache_entry *entry; entry = mb_cache_entry_get(cache, key, value); if (!entry) return NULL; /* * Drop the ref we got from mb_cache_entry_get() and the initial hash * ref if we are the last user */ if (atomic_cmpxchg(&entry->e_refcnt, 2, 0) != 2) return entry; spin_lock(&cache->c_list_lock); if (!list_empty(&entry->e_list)) list_del_init(&entry->e_list); cache->c_entry_count--; spin_unlock(&cache->c_list_lock); __mb_cache_entry_free(cache, entry); return NULL; } EXPORT_SYMBOL(mb_cache_entry_delete_or_get); /* mb_cache_entry_touch - cache entry got used * @cache - cache the entry belongs to * @entry - entry that got used * * Marks entry as used to give hit higher chances of surviving in cache. */ void mb_cache_entry_touch(struct mb_cache *cache, struct mb_cache_entry *entry) { set_bit(MBE_REFERENCED_B, &entry->e_flags); } EXPORT_SYMBOL(mb_cache_entry_touch); static unsigned long mb_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct mb_cache *cache = shrink->private_data; return cache->c_entry_count; } /* Shrink number of entries in cache */ static unsigned long mb_cache_shrink(struct mb_cache *cache, unsigned long nr_to_scan) { struct mb_cache_entry *entry; unsigned long shrunk = 0; spin_lock(&cache->c_list_lock); while (nr_to_scan-- && !list_empty(&cache->c_list)) { entry = list_first_entry(&cache->c_list, struct mb_cache_entry, e_list); /* Drop initial hash reference if there is no user */ if (test_bit(MBE_REFERENCED_B, &entry->e_flags) || atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) { clear_bit(MBE_REFERENCED_B, &entry->e_flags); list_move_tail(&entry->e_list, &cache->c_list); continue; } list_del_init(&entry->e_list); cache->c_entry_count--; spin_unlock(&cache->c_list_lock); __mb_cache_entry_free(cache, entry); shrunk++; cond_resched(); spin_lock(&cache->c_list_lock); } spin_unlock(&cache->c_list_lock); return shrunk; } static unsigned long mb_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct mb_cache *cache = shrink->private_data; return mb_cache_shrink(cache, sc->nr_to_scan); } /* We shrink 1/X of the cache when we have too many entries in it */ #define SHRINK_DIVISOR 16 static void mb_cache_shrink_worker(struct work_struct *work) { struct mb_cache *cache = container_of(work, struct mb_cache, c_shrink_work); mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR); } /* * mb_cache_create - create cache * @bucket_bits: log2 of the hash table size * * Create cache for keys with 2^bucket_bits hash entries. */ struct mb_cache *mb_cache_create(int bucket_bits) { struct mb_cache *cache; unsigned long bucket_count = 1UL << bucket_bits; unsigned long i; cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL); if (!cache) goto err_out; cache->c_bucket_bits = bucket_bits; cache->c_max_entries = bucket_count << 4; INIT_LIST_HEAD(&cache->c_list); spin_lock_init(&cache->c_list_lock); cache->c_hash = kmalloc_array(bucket_count, sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_hash) { kfree(cache); goto err_out; } for (i = 0; i < bucket_count; i++) INIT_HLIST_BL_HEAD(&cache->c_hash[i]); cache->c_shrink = shrinker_alloc(0, "mbcache-shrinker"); if (!cache->c_shrink) { kfree(cache->c_hash); kfree(cache); goto err_out; } cache->c_shrink->count_objects = mb_cache_count; cache->c_shrink->scan_objects = mb_cache_scan; cache->c_shrink->private_data = cache; shrinker_register(cache->c_shrink); INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); return cache; err_out: return NULL; } EXPORT_SYMBOL(mb_cache_create); /* * mb_cache_destroy - destroy cache * @cache: the cache to destroy * * Free all entries in cache and cache itself. Caller must make sure nobody * (except shrinker) can reach @cache when calling this. */ void mb_cache_destroy(struct mb_cache *cache) { struct mb_cache_entry *entry, *next; shrinker_free(cache->c_shrink); /* * We don't bother with any locking. Cache must not be used at this * point. */ list_for_each_entry_safe(entry, next, &cache->c_list, e_list) { list_del(&entry->e_list); WARN_ON(atomic_read(&entry->e_refcnt) != 1); mb_cache_entry_put(cache, entry); } kfree(cache->c_hash); kfree(cache); } EXPORT_SYMBOL(mb_cache_destroy); static int __init mbcache_init(void) { mb_entry_cache = KMEM_CACHE(mb_cache_entry, SLAB_RECLAIM_ACCOUNT); if (!mb_entry_cache) return -ENOMEM; return 0; } static void __exit mbcache_exit(void) { kmem_cache_destroy(mb_entry_cache); } module_init(mbcache_init) module_exit(mbcache_exit) MODULE_AUTHOR("Jan Kara <jack@suse.cz>"); MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); MODULE_LICENSE("GPL");
76 1344 993 11694 3420 2747 46 6973 41 19 5 4 6 5 696 636 17 17 1 618 2307 255 255 1308 12879 12883 1215 1453 84 213 3934 210 8 1 211 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PGTABLE_H #define _LINUX_PGTABLE_H #include <linux/pfn.h> #include <asm/pgtable.h> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) #define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT) #ifndef __ASSEMBLY__ #ifdef CONFIG_MMU #include <linux/mm_types.h> #include <linux/bug.h> #include <linux/errno.h> #include <asm-generic/pgtable_uffd.h> #include <linux/page_table_check.h> #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED #endif /* * On almost all architectures and configurations, 0 can be used as the * upper ceiling to free_pgtables(): on many architectures it has the same * effect as using TASK_SIZE. However, there is one configuration which * must impose a more careful limit, to avoid freeing kernel pgtables. */ #ifndef USER_PGTABLES_CEILING #define USER_PGTABLES_CEILING 0UL #endif /* * This defines the first usable user address. Platforms * can override its value with custom FIRST_USER_ADDRESS * defined in their respective <asm/pgtable.h>. */ #ifndef FIRST_USER_ADDRESS #define FIRST_USER_ADDRESS 0UL #endif /* * This defines the generic helper for accessing PMD page * table page. Although platforms can still override this * via their respective <asm/pgtable.h>. */ #ifndef pmd_pgtable #define pmd_pgtable(pmd) pmd_page(pmd) #endif #define pmd_folio(pmd) page_folio(pmd_page(pmd)) /* * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD] * * The pXx_index() functions return the index of the entry in the page * table page which would control the given virtual address * * As these functions may be used by the same code for different levels of * the page table folding, they are always available, regardless of * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0 * because in such cases PTRS_PER_PxD equals 1. */ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } #define pmd_index pmd_index #endif #ifndef pud_index static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } #define pud_index pud_index #endif #ifndef pgd_index /* Must be a compile-time constant, so implement it as a macro */ #define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #endif #ifndef kernel_pte_init static inline void kernel_pte_init(void *addr) { } #define kernel_pte_init kernel_pte_init #endif #ifndef pmd_init static inline void pmd_init(void *addr) { } #define pmd_init pmd_init #endif #ifndef pud_init static inline void pud_init(void *addr) { } #define pud_init pud_init #endif #ifndef pte_offset_kernel static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) { return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); } #define pte_offset_kernel pte_offset_kernel #endif #ifdef CONFIG_HIGHPTE #define __pte_map(pmd, address) \ ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address))) #define pte_unmap(pte) do { \ kunmap_local((pte)); \ rcu_read_unlock(); \ } while (0) #else static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) { return pte_offset_kernel(pmd, address); } static inline void pte_unmap(pte_t *pte) { rcu_read_unlock(); } #endif void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); /* Find an entry in the second-level page table.. */ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return pud_pgtable(*pud) + pmd_index(address); } #define pmd_offset pmd_offset #endif #ifndef pud_offset static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) { return p4d_pgtable(*p4d) + pud_index(address); } #define pud_offset pud_offset #endif static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address) { return (pgd + pgd_index(address)); }; /* * a shortcut to get a pgd_t in a given mm */ #ifndef pgd_offset #define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) #endif /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's */ #define pgd_offset_k(address) pgd_offset(&init_mm, (address)) /* * In many cases it is known that a virtual address is mapped at PMD or PTE * level, so instead of traversing all the page table levels, we can get a * pointer to the PMD entry in user or kernel page table or translate a virtual * address to the pointer in the PTE in the kernel page tables with simple * helpers. */ static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_off_k(unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) { pmd_t *pmd = pmd_off_k(vaddr); return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); } #ifndef pmd_young static inline int pmd_young(pmd_t pmd) { return 0; } #endif #ifndef pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return 0; } #endif /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode * is issued. Some architectures may benefit from doing this, and it is * beneficial for both shadow and direct mode hypervisors, which may batch * the PTE updates which happen during this window. Note that using this * interface requires that read hazards be removed from the code. A read * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be * up to date. * * In the general case, no lock is guaranteed to be held between entry and exit * of the lazy mode. So the implementation must assume preemption may be enabled * and cpu migration is possible; it must take steps to be robust against this. * (In practice, for user PTE updates, the appropriate page table lock(s) are * held, but for kernel PTE updates, no lock is held). Nesting is not permitted * and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) {} static inline void arch_leave_lazy_mmu_mode(void) {} static inline void arch_flush_lazy_mmu_mode(void) {} #endif #ifndef pte_batch_hint /** * pte_batch_hint - Number of pages that can be added to batch without scanning. * @ptep: Page table pointer for the entry. * @pte: Page table entry. * * Some architectures know that a set of contiguous ptes all map the same * contiguous memory with the same permissions. In this case, it can provide a * hint to aid pte batching without the core code needing to scan every pte. * * An architecture implementation may ignore the PTE accessed state. Further, * the dirty state must apply atomically to all the PTEs described by the hint. * * May be overridden by the architecture, else pte_batch_hint is always 1. */ static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) { return 1; } #endif #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #endif #define pte_next_pfn(pte) pte_advance_pfn(pte, 1) #ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. * @addr: Address to map the first page at. * @ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @nr: Number of pages to map. * * When nr==1, initial state of pte may be present or not present, and new state * may be present or not present. When nr>1, initial state of all ptes must be * not present, and new state must be present. * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * * Context: The caller holds the page table lock. The pages all belong * to the same folio. The PTEs are all in the same PMD. */ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, ptep, pte, nr); for (;;) { set_pte(ptep, pte); if (--nr == 0) break; ptep++; pte = pte_next_pfn(pte); } } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { BUILD_BUG(); return 0; } static inline int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef ptep_get static inline pte_t ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } #endif #ifndef pmdp_get static inline pmd_t pmdp_get(pmd_t *pmdp) { return READ_ONCE(*pmdp); } #endif #ifndef pudp_get static inline pud_t pudp_get(pud_t *pudp) { return READ_ONCE(*pudp); } #endif #ifndef p4dp_get static inline p4d_t p4dp_get(p4d_t *p4dp) { return READ_ONCE(*p4dp); } #endif #ifndef pgdp_get static inline pgd_t pgdp_get(pgd_t *pgdp) { return READ_ONCE(*pgdp); } #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); int r = 1; if (!pte_young(pte)) r = 0; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); return r; } #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; int r = 1; if (!pmd_young(pmd)) r = 0; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); return r; } #else static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef arch_has_hw_nonleaf_pmd_young /* * Return whether the accessed bit in non-leaf PMD entries is supported on the * local CPU. */ static inline bool arch_has_hw_nonleaf_pmd_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG); } #endif #ifndef arch_has_hw_pte_young /* * Return whether the accessed bit is supported on the local CPU. * * This stub assumes accessing through an old PTE triggers a page fault. * Architectures that automatically set the access bit should overwrite it. */ static inline bool arch_has_hw_pte_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG); } #endif #ifndef exec_folio_order /* * Returns preferred minimum folio order for executable file-backed memory. Must * be in range [0, PMD_ORDER). Default to order-0. */ static inline unsigned int exec_folio_order(void) { return 0; } #endif #ifndef arch_check_zapped_pte static inline void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { } #endif #ifndef arch_check_zapped_pmd static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { } #endif #ifndef arch_check_zapped_pud static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) { } #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); page_table_check_pte_clear(mm, pte); return pte; } #endif #ifndef clear_young_dirty_ptes /** * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the * same folio as old/clean. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to mark old/clean. * @flags: Flags to modify the PTE batch semantics. * * May be overridden by the architecture; otherwise, implemented by * get_and_clear/modify/set for each pte in the range. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr, cydp_t flags) { pte_t pte; for (;;) { if (flags == CYDP_CLEAR_YOUNG) ptep_test_and_clear_young(vma, addr, ptep); else { pte = ptep_get_and_clear(vma->vm_mm, addr, ptep); if (flags & CYDP_CLEAR_YOUNG) pte = pte_mkold(pte); if (flags & CYDP_CLEAR_DIRTY) pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, addr, ptep, pte); } if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, addr, ptep); /* * No need for ptep_get_and_clear(): page table check doesn't care about * any bits that could have been set by HW concurrently. */ page_table_check_pte_clear(mm, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures * (eg x86-32 PAE) cannot load the entries atomically without using expensive * instructions. We are guaranteed that a PTE will only either go from not * present to present, or present to not present -- it will not switch to a * completely different present page without a TLB flush inbetween; which we * are blocking by holding interrupts off. * * Setting ptes from not present to present goes: * * ptep->pte_high = h; * smp_wmb(); * ptep->pte_low = l; * * And present to not present goes: * * ptep->pte_low = 0; * smp_wmb(); * ptep->pte_high = 0; * * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. * We load pte_high *after* loading pte_low, which ensures we don't see an older * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't * picked up a changed pte high. We might have gotten rubbish values from * pte_low and pte_high, but we are guaranteed that pte_low will not have the * present bit set *unless* it is 'l'. Because get_user_pages_fast() only * operates on present ptes we're safe. */ static inline pte_t ptep_get_lockless(pte_t *ptep) { pte_t pte; do { pte.pte_low = ptep->pte_low; smp_rmb(); pte.pte_high = ptep->pte_high; smp_rmb(); } while (unlikely(pte.pte_low != ptep->pte_low)); return pte; } #define ptep_get_lockless ptep_get_lockless #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { pmd_t pmd; do { pmd.pmd_low = pmdp->pmd_low; smp_rmb(); pmd.pmd_high = pmdp->pmd_high; smp_rmb(); } while (unlikely(pmd.pmd_low != pmdp->pmd_low)); return pmd; } #define pmdp_get_lockless pmdp_get_lockless #define pmdp_get_lockless_sync() tlb_remove_table_sync_one() #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */ /* * We require that the PTE can be read atomically. */ #ifndef ptep_get_lockless static inline pte_t ptep_get_lockless(pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef pmdp_get_lockless static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { return pmdp_get(pmdp); } static inline void pmdp_get_lockless_sync(void) { } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; pmd_clear(pmdp); page_table_check_pmd_clear(mm, pmd); return pmd; } #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t pud = *pudp; pud_clear(pudp); page_table_check_pud_clear(mm, pud); return pud; } #endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, int full) { return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); } #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, int full) { return pudp_huge_get_and_clear(vma->vm_mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { return ptep_get_and_clear(mm, address, ptep); } #endif #ifndef get_and_clear_full_ptes /** * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of * the same folio, collecting dirty/accessed bits. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the * returned PTE. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { pte_t pte, tmp_pte; pte = ptep_get_and_clear_full(mm, addr, ptep, full); while (--nr) { ptep++; addr += PAGE_SIZE; tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full); if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); if (pte_young(tmp_pte)) pte = pte_mkyoung(pte); } return pte; } #endif /** * get_and_clear_ptes - Clear present PTEs that map consecutive pages of * the same folio, collecting dirty/accessed bits. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * * Use this instead of get_and_clear_full_ptes() if it is known that we don't * need to clear the full mm, which is mostly the case. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { return get_and_clear_full_ptes(mm, addr, ptep, nr, 0); } #ifndef clear_full_ptes /** * clear_full_ptes - Clear present PTEs that map consecutive pages of the same * folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { ptep_get_and_clear_full(mm, addr, ptep, full); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif /** * clear_ptes - Clear present PTEs that map consecutive pages of the same folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * * Use this instead of clear_full_ptes() if it is known that we don't need to * clear the full mm, which is mostly the case. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { clear_full_ptes(mm, addr, ptep, nr, 0); } /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. The other thread * gives up, simply does nothing, and continues; on architectures where * software can update TLB, local TLB can be updated here to avoid next page * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ #ifndef update_mmu_tlb_range static inline void update_mmu_tlb_range(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int nr) { } #endif static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { update_mmu_tlb_range(vma, address, ptep, 1); } /* * Some architectures may be able to avoid expensive synchronization * primitives when modifications are made to PTE's which are already * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL static inline void pte_clear_not_present_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { pte_clear(mm, address, ptep); } #endif #ifndef clear_not_present_full_ptes /** * clear_not_present_full_ptes - Clear multiple not present PTEs which are * consecutive in the pgtable. * @mm: Address space the ptes represent. * @addr: Address of the first pte. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over pte_clear_not_present_full(). * * Context: The caller holds the page table lock. The PTEs are all not present. * The PTEs are all in the same PMD. */ static inline void clear_not_present_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { pte_clear_not_present_full(mm, addr, ptep, full); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH extern pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); #endif #ifndef pte_mkwrite static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { return pte_mkwrite_novma(pte); } #endif #if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite) static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { return pmd_mkwrite_novma(pmd); } #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte = ptep_get(ptep); set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif #ifndef wrprotect_ptes /** * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same * folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to write-protect. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_set_wrprotect(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { for (;;) { ptep_set_wrprotect(mm, addr, ptep); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings * out extra page fault penalty to track page access bit. For optimization page * access bit can be set during all page fault flow on these arches. * To be differentiate with macro pte_mkyoung, this macro is used on platforms * where software maintains page access bit. */ #ifndef pte_sw_mkyoung static inline pte_t pte_sw_mkyoung(pte_t pte) { return pte; } #define pte_sw_mkyoung pte_sw_mkyoung #endif #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t old_pmd = *pmdp; set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); } #else static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t old_pud = *pudp; set_pud_at(mm, address, pudp, pud_wrprotect(old_pud)); } #else static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif #ifndef pmdp_collapse_flush #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return *pmdp; } #define pmdp_collapse_flush pmdp_collapse_flush #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is an implementation of pmdp_establish() that is only suitable for an * architecture that doesn't have hardware dirty/accessed bits. In this case we * can't race with CPU which sets these bits and non-atomic approach is fine. */ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { pmd_t old_pmd = *pmdp; set_pmd_at(vma->vm_mm, address, pmdp, pmd); return old_pmd; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD /* * pmdp_invalidate_ad() invalidates the PMD while changing a transparent * hugepage mapping in the page tables. This function is similar to * pmdp_invalidate(), but should only be used if the access and dirty bits would * not be cleared by the software in the new PMD value. The function ensures * that hardware changes of the access and dirty bits updates would not be lost. * * Doing so can allow in certain architectures to avoid a TLB flush in most * cases. Yet, another TLB flush might be necessary later if the PMD update * itself requires such flush (e.g., if protection was set to be stricter). Yet, * even when a TLB flush is needed because of the update, the caller may be able * to batch these TLB flushing operations, so fewer TLB flush operations are * needed. */ extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { return pte_val(pte_a) == pte_val(pte_b); } #endif #ifndef __HAVE_ARCH_PTE_UNUSED /* * Some architectures provide facilities to virtualization guests * so that they can flag allocated pages as unused. This allows the * host to transparently reclaim unused pages. This function returns * whether the pte's page is unused. */ static inline int pte_unused(pte_t pte) { return 0; } #endif #ifndef pte_access_permitted #define pte_access_permitted(pte, write) \ (pte_present(pte) && (!(write) || pte_write(pte))) #endif #ifndef pmd_access_permitted #define pmd_access_permitted(pmd, write) \ (pmd_present(pmd) && (!(write) || pmd_write(pmd))) #endif #ifndef pud_access_permitted #define pud_access_permitted(pud, write) \ (pud_present(pud) && (!(write) || pud_write(pud))) #endif #ifndef p4d_access_permitted #define p4d_access_permitted(p4d, write) \ (p4d_present(p4d) && (!(write) || p4d_write(p4d))) #endif #ifndef pgd_access_permitted #define pgd_access_permitted(pgd, write) \ (pgd_present(pgd) && (!(write) || pgd_write(pgd))) #endif #ifndef __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } #endif #ifndef pud_same static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } #define pud_same pud_same #endif #ifndef __HAVE_ARCH_P4D_SAME static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b) { return p4d_val(p4d_a) == p4d_val(p4d_b); } #endif #ifndef __HAVE_ARCH_PGD_SAME static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) { return pgd_val(pgd_a) == pgd_val(pgd_b); } #endif #ifndef __HAVE_ARCH_DO_SWAP_PAGE static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte, int nr) { } #else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte, int nr) { for (int i = 0; i < nr; i++) { arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE, pte_advance_pfn(pte, i), pte_advance_pfn(oldpte, i)); } } #endif #ifndef __HAVE_ARCH_UNMAP_ONE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_unmap_one() can save this * metadata on a swap-out of a page. */ static inline int arch_unmap_one(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t orig_pte) { return 0; } #endif /* * Allow architectures to preserve additional metadata associated with * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function * prototypes must be defined in the arch-specific asm/pgtable.h file. */ #ifndef __HAVE_ARCH_PREPARE_TO_SWAP static inline int arch_prepare_to_swap(struct folio *folio) { return 0; } #endif #ifndef __HAVE_ARCH_SWAP_INVALIDATE static inline void arch_swap_invalidate_page(int type, pgoff_t offset) { } static inline void arch_swap_invalidate_area(int type) { } #endif #ifndef __HAVE_ARCH_SWAP_RESTORE static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { } #endif #ifndef __HAVE_ARCH_MOVE_PTE #define move_pte(pte, old_addr, new_addr) (pte) #endif #ifndef pte_accessible # define pte_accessible(mm, pte) ((void)(pte), 1) #endif #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif #ifndef flush_tlb_fix_spurious_fault_pmd #define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0) #endif /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. */ #define pgd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #ifndef p4d_addr_end #define p4d_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pud_addr_end #define pud_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pmd_addr_end #define pmd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif /* * When walking page tables, we usually want to skip any p?d_none entries; * and any p?d_bad entries - reporting the error before resetting to none. * Do the tests inline, but report and clear the bad entry in mm/memory.c. */ void pgd_clear_bad(pgd_t *); #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *); #else #define p4d_clear_bad(p4d) do { } while (0) #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *); #else #define pud_clear_bad(p4d) do { } while (0) #endif void pmd_clear_bad(pmd_t *); static inline int pgd_none_or_clear_bad(pgd_t *pgd) { if (pgd_none(*pgd)) return 1; if (unlikely(pgd_bad(*pgd))) { pgd_clear_bad(pgd); return 1; } return 0; } static inline int p4d_none_or_clear_bad(p4d_t *p4d) { if (p4d_none(*p4d)) return 1; if (unlikely(p4d_bad(*p4d))) { p4d_clear_bad(p4d); return 1; } return 0; } static inline int pud_none_or_clear_bad(pud_t *pud) { if (pud_none(*pud)) return 1; if (unlikely(pud_bad(*pud))) { pud_clear_bad(pud); return 1; } return 0; } static inline int pmd_none_or_clear_bad(pmd_t *pmd) { if (pmd_none(*pmd)) return 1; if (unlikely(pmd_bad(*pmd))) { pmd_clear_bad(pmd); return 1; } return 0; } static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* * Get the current pte state, but zero it out to make it * non-present, preventing the hardware from asynchronously * updating it. */ return ptep_get_and_clear(vma->vm_mm, addr, ptep); } static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { /* * The pte is non-present, so there's no hardware state to * preserve. */ set_pte_at(vma->vm_mm, addr, ptep, pte); } #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION /* * Start a pte protection read-modify-write transaction, which * protects against asynchronous hardware modifications to the pte. * The intention is not to prevent the hardware from making pte * updates, but to prevent any updates it may make from being lost. * * This does not protect against other software modifications of the * pte; the appropriate pte lock must be held over the transaction. * * Note that this interface is intended to be batchable, meaning that * ptep_modify_prot_commit may not actually update the pte, but merely * queue the update to be done at some later time. The update must be * actually committed before the pte lock is released, however. */ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return __ptep_modify_prot_start(vma, addr, ptep); } /* * Commit an update to a pte, leaving any hardware-controlled bits in * the PTE unmodified. The pte returned from ptep_modify_prot_start() may * additionally have young and/or dirty bits set where previously they were not, * so the updated pte may have these additional changes. */ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { __ptep_modify_prot_commit(vma, addr, ptep, pte); } #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ /** * modify_prot_start_ptes - Start a pte protection read-modify-write transaction * over a batch of ptes, which protects against asynchronous hardware * modifications to the ptes. The intention is not to prevent the hardware from * making pte updates, but to prevent any updates it may make from being lost. * Please see the comment above ptep_modify_prot_start() for full description. * * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_modify_prot_start(), collecting the a/d bits from each pte * in the batch. * * Note that PTE bits in the PTE batch besides the PFN can differ. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. All other PTE bits must be identical for * all PTEs in the batch except for young and dirty bits. The PTEs are all in * the same PMD. */ #ifndef modify_prot_start_ptes static inline pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { pte_t pte, tmp_pte; pte = ptep_modify_prot_start(vma, addr, ptep); while (--nr) { ptep++; addr += PAGE_SIZE; tmp_pte = ptep_modify_prot_start(vma, addr, ptep); if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); if (pte_young(tmp_pte)) pte = pte_mkyoung(pte); } return pte; } #endif /** * modify_prot_commit_ptes - Commit an update to a batch of ptes, leaving any * hardware-controlled bits in the PTE unmodified. * * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @old_pte: Old page table entry (for the first entry) which is now cleared. * @pte: New page table entry to be set. * @nr: Number of entries. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_modify_prot_commit(). * * Context: The caller holds the page table lock. The PTEs are all in the same * PMD. On exit, the set ptes in the batch map the same folio. The ptes set by * ptep_modify_prot_start() may additionally have young and/or dirty bits set * where previously they were not, so the updated ptes may have these * additional changes. */ #ifndef modify_prot_commit_ptes static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte, unsigned int nr) { int i; for (i = 0; i < nr; ++i, ++ptep, addr += PAGE_SIZE) { ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte); /* Advance PFN only, set same prot */ old_pte = pte_next_pfn(old_pte); pte = pte_next_pfn(pte); } } #endif /* * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values * and let generic vmalloc, ioremap and page table update code know when * arch_sync_kernel_mappings() needs to be called. */ #ifndef ARCH_PAGE_TABLE_SYNC_MASK #define ARCH_PAGE_TABLE_SYNC_MASK 0 #endif /* * There is no default implementation for arch_sync_kernel_mappings(). It is * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK * is 0. */ void arch_sync_kernel_mappings(unsigned long start, unsigned long end); #endif /* CONFIG_MMU */ /* * No-op macros that just return the current protection value. Defined here * because these macros can be used even if CONFIG_MMU is not defined. */ #ifndef pgprot_nx #define pgprot_nx(prot) (prot) #endif #ifndef pgprot_noncached #define pgprot_noncached(prot) (prot) #endif #ifndef pgprot_writecombine #define pgprot_writecombine pgprot_noncached #endif #ifndef pgprot_writethrough #define pgprot_writethrough pgprot_noncached #endif #ifndef pgprot_device #define pgprot_device pgprot_noncached #endif #ifndef pgprot_mhp #define pgprot_mhp(prot) (prot) #endif #ifdef CONFIG_MMU #ifndef pgprot_modify #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot))) newprot = pgprot_noncached(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot))) newprot = pgprot_writecombine(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot))) newprot = pgprot_device(newprot); return newprot; } #endif #endif /* CONFIG_MMU */ #ifndef pgprot_encrypted #define pgprot_encrypted(prot) (prot) #endif #ifndef pgprot_decrypted #define pgprot_decrypted(prot) (prot) #endif /* * A facility to provide batching of the reload of page tables and * other process state with the actual context switch code for * paravirtualized guests. By convention, only one of the batched * update (lazy) modes (CPU, MMU) should be active at any given time, * entry should never be nested, and entry and exits should always be * paired. This is for sanity of maintaining and reasoning about the * kernel code. In this case, the exit (end of the context switch) is * in architecture-specific code, and so doesn't need a generic * definition. */ #ifndef __HAVE_ARCH_START_CONTEXT_SWITCH #define arch_start_context_switch(prev) do {} while (0) #endif /* * Some platforms can customize the PTE soft-dirty bit making it unavailable * even if the architecture provides the resource. * Adding this API allows architectures to add their own checks for the * devices on which the kernel is running. * Note: When overriding it, please make sure the CONFIG_MEM_SOFT_DIRTY * is part of this macro. */ #ifndef pgtable_supports_soft_dirty #define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) #endif #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */ static inline int pte_soft_dirty(pte_t pte) { return 0; } static inline int pmd_soft_dirty(pmd_t pmd) { return 0; } static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return pte; } static inline int pte_swp_soft_dirty(pte_t pte) { return 0; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #ifndef __HAVE_PFNMAP_TRACKING /* * Interfaces that can be used by architecture code to keep track of * memory type of pfn mappings specified by the remap_pfn_range, * vmf_insert_pfn. */ static inline int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot) { return 0; } static inline int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot) { return 0; } static inline void pfnmap_untrack(unsigned long pfn, unsigned long size) { } #else /** * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * @prot: the pgprot to modify * * Lookup the cachemode for the pfn range starting at @pfn with the size * @size and store it in @prot, leaving other data in @prot unchanged. * * This allows for a hardware implementation to have fine-grained control of * memory cache behavior at page level granularity. Without a hardware * implementation, this function does nothing. * * Currently there is only one implementation for this - x86 Page Attribute * Table (PAT). See Documentation/arch/x86/pat.rst for more details. * * This function can fail if the pfn range spans pfns that require differing * cachemodes. If the pfn range was previously verified to have a single * cachemode, it is sufficient to query only a single pfn. The assumption is * that this is the case for drivers using the vmf_insert_pfn*() interface. * * Returns 0 on success and -EINVAL on error. */ int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot); /** * pfnmap_track - track a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * @prot: the pgprot to track * * Requested the pfn range to be 'tracked' by a hardware implementation and * setup the cachemode in @prot similar to pfnmap_setup_cachemode(). * * This allows for fine-grained control of memory cache behaviour at page * level granularity. Tracking memory this way is persisted across VMA splits * (VMA merging does not apply for VM_PFNMAP). * * Currently, there is only one implementation for this - x86 Page Attribute * Table (PAT). See Documentation/arch/x86/pat.rst for more details. * * Returns 0 on success and -EINVAL on error. */ int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot); /** * pfnmap_untrack - untrack a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * * Untrack a pfn range previously tracked through pfnmap_track(). */ void pfnmap_untrack(unsigned long pfn, unsigned long size); #endif /** * pfnmap_setup_cachemode_pfn - setup the cachemode in the pgprot for a pfn * @pfn: the pfn * @prot: the pgprot to modify * * Lookup the cachemode for @pfn and store it in @prot, leaving other * data in @prot unchanged. * * See pfnmap_setup_cachemode() for details. */ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) { pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); } #ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; unsigned long offset_from_zero_pfn = pfn - zero_pfn; return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) #else static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; return pfn == zero_pfn; } static inline unsigned long my_zero_pfn(unsigned long addr) { extern unsigned long zero_pfn; return zero_pfn; } #endif #else static inline int is_zero_pfn(unsigned long pfn) { return 0; } static inline unsigned long my_zero_pfn(unsigned long addr) { return 0; } #endif /* CONFIG_MMU */ #ifdef CONFIG_MMU #ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { return 0; } #ifndef pmd_write static inline int pmd_write(pmd_t pmd) { BUG(); return 0; } #endif /* pmd_write */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef pud_write static inline int pud_write(pud_t pud) { BUG(); return 0; } #endif /* pud_write */ #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) { return 0; } #endif static inline int pud_trans_unstable(pud_t *pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudval = READ_ONCE(*pud); if (pud_none(pudval) || pud_trans_huge(pudval)) return 1; if (unlikely(pud_bad(pudval))) { pud_clear_bad(pud); return 1; } #endif return 0; } #ifndef CONFIG_NUMA_BALANCING /* * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is * perfectly valid to indicate "no" in that case, which is why our default * implementation defaults to "always no". * * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE * page protection due to NUMA hinting. NUMA hinting faults only apply in * accessible VMAs. * * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault, * looking at the VMA accessibility is sufficient. */ static inline int pte_protnone(pte_t pte) { return 0; } static inline int pmd_protnone(pmd_t pmd) { return 0; } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_MMU */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #ifndef __PAGETABLE_P4D_FOLDED int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot); void p4d_clear_huge(p4d_t *p4d); #else static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } #endif /* !__PAGETABLE_P4D_FOLDED */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } static inline int pud_clear_huge(pud_t *pud) { return 0; } static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) { return 0; } static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr) { return 0; } static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * ARCHes with special requirements for evicting THP backing TLB entries can * implement this. Otherwise also, it can help optimize normal TLB flush in * THP regime. Stock flush_tlb_range() typically has optimization to nuke the * entire TLB if flush span is greater than a threshold, which will * likely be true for a single huge page. Thus a single THP flush will * invalidate the entire TLB which is not desirable. * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #else #define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() #define flush_pud_tlb_range(vma, addr, end) BUILD_BUG() #endif #endif struct file; int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot); #ifndef CONFIG_X86_ESPFIX64 static inline void init_espfix_bsp(void) { } #endif extern void __init pgtable_cache_init(void); #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) { return true; } static inline bool arch_has_pfn_modify_check(void) { return false; } #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ /* * Architecture PAGE_KERNEL_* fallbacks * * Some architectures don't define certain PAGE_KERNEL_* flags. This is either * because they really don't support them, or the port needs to be updated to * reflect the required functionality. Below are a set of relatively safe * fallbacks, as best effort, which we can count on in lieu of the architectures * not defining them on their own yet. */ #ifndef PAGE_KERNEL_RO # define PAGE_KERNEL_RO PAGE_KERNEL #endif #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif /* * Page Table Modification bits for pgtbl_mod_mask. * * These are used by the p?d_alloc_track*() and p*d_populate_kernel() * functions in the generic vmalloc, ioremap and page table update code * to track at which page-table levels entries have been modified. * Based on that the code can better decide when page table changes need * to be synchronized to other page-tables in the system. */ #define __PGTBL_PGD_MODIFIED 0 #define __PGTBL_P4D_MODIFIED 1 #define __PGTBL_PUD_MODIFIED 2 #define __PGTBL_PMD_MODIFIED 3 #define __PGTBL_PTE_MODIFIED 4 #define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) #define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) #define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) #define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) #define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; enum pgtable_level { PGTABLE_LEVEL_PTE = 0, PGTABLE_LEVEL_PMD, PGTABLE_LEVEL_PUD, PGTABLE_LEVEL_P4D, PGTABLE_LEVEL_PGD, }; static inline const char *pgtable_level_to_str(enum pgtable_level level) { switch (level) { case PGTABLE_LEVEL_PTE: return "pte"; case PGTABLE_LEVEL_PMD: return "pmd"; case PGTABLE_LEVEL_PUD: return "pud"; case PGTABLE_LEVEL_P4D: return "p4d"; case PGTABLE_LEVEL_PGD: return "pgd"; default: return "unknown"; } } #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) #ifdef CONFIG_PHYS_ADDR_T_64BIT /* * ZSMALLOC needs to know the highest PFN on 32-bit architectures * with physical address space extension, but falls back to * BITS_PER_LONG otherwise. */ #error Missing MAX_POSSIBLE_PHYSMEM_BITS definition #else #define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif #endif #ifndef has_transparent_hugepage #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif #ifndef has_transparent_pud_hugepage #define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) #endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. */ #ifndef mm_p4d_folded #define mm_p4d_folded(mm) __is_defined(__PAGETABLE_P4D_FOLDED) #endif #ifndef mm_pud_folded #define mm_pud_folded(mm) __is_defined(__PAGETABLE_PUD_FOLDED) #endif #ifndef mm_pmd_folded #define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED) #endif #ifndef p4d_offset_lockless #define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address) #endif #ifndef pud_offset_lockless #define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address) #endif #ifndef pmd_offset_lockless #define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address) #endif /* * pXd_leaf() is the API to check whether a pgtable entry is a huge page * mapping. It should work globally across all archs, without any * dependency on CONFIG_* options. For architectures that do not support * huge mappings on specific levels, below fallbacks will be used. * * A leaf pgtable entry should always imply the following: * * - It is a "present" entry. IOW, before using this API, please check it * with pXd_present() first. NOTE: it may not always mean the "present * bit" is set. For example, PROT_NONE entries are always "present". * * - It should _never_ be a swap entry of any type. Above "present" check * should have guarded this, but let's be crystal clear on this. * * - It should contain a huge PFN, which points to a huge page larger than * PAGE_SIZE of the platform. The PFN format isn't important here. * * - It should cover all kinds of huge mappings (i.e. pXd_trans_huge() * or hugetlb mappings). */ #ifndef pgd_leaf #define pgd_leaf(x) false #endif #ifndef p4d_leaf #define p4d_leaf(x) false #endif #ifndef pud_leaf #define pud_leaf(x) false #endif #ifndef pmd_leaf #define pmd_leaf(x) false #endif #ifndef pgd_leaf_size #define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT) #endif #ifndef p4d_leaf_size #define p4d_leaf_size(x) P4D_SIZE #endif #ifndef pud_leaf_size #define pud_leaf_size(x) PUD_SIZE #endif #ifndef pmd_leaf_size #define pmd_leaf_size(x) PMD_SIZE #endif #ifndef __pte_leaf_size #ifndef pte_leaf_size #define pte_leaf_size(x) PAGE_SIZE #endif #define __pte_leaf_size(x,y) pte_leaf_size(y) #endif /* * We always define pmd_pfn for all archs as it's used in lots of generic * code. Now it happens too for pud_pfn (and can happen for larger * mappings too in the future; we're not there yet). Instead of defining * it for all archs (like pmd_pfn), provide a fallback. * * Note that returning 0 here means any arch that didn't define this can * get severely wrong when it hits a real pud leaf. It's arch's * responsibility to properly define it when a huge pud is possible. */ #ifndef pud_pfn #define pud_pfn(x) 0 #endif /* * Some architectures have MMUs that are configurable or selectable at boot * time. These lead to variable PTRS_PER_x. For statically allocated arrays it * helps to have a static maximum value. */ #ifndef MAX_PTRS_PER_PTE #define MAX_PTRS_PER_PTE PTRS_PER_PTE #endif #ifndef MAX_PTRS_PER_PMD #define MAX_PTRS_PER_PMD PTRS_PER_PMD #endif #ifndef MAX_PTRS_PER_PUD #define MAX_PTRS_PER_PUD PTRS_PER_PUD #endif #ifndef MAX_PTRS_PER_P4D #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif #ifndef pte_pgprot #define pte_pgprot(x) ((pgprot_t) {0}) #endif #ifndef pmd_pgprot #define pmd_pgprot(x) ((pgprot_t) {0}) #endif #ifndef pud_pgprot #define pud_pgprot(x) ((pgprot_t) {0}) #endif /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: * * map_type prot * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (yes) yes w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and * MAP_PRIVATE (with Enhanced PAN supported): * r: (no) no * w: (no) no * x: (yes) yes */ #define DECLARE_VM_GET_PAGE_PROT \ pgprot_t vm_get_page_prot(vm_flags_t vm_flags) \ { \ return protection_map[vm_flags & \ (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)]; \ } \ EXPORT_SYMBOL(vm_get_page_prot); #endif /* _LINUX_PGTABLE_H */
5 2 6 5 6 3 5 19 7 16 3 117 13 86 26 6 87 16 2 54 54 37 17 51 51 51 63 52 5 25 6 2 21 4 7 4 2 4 80 79 17 13 8 58 40 21 19 3 3 2 19 2 37 2 54 66 53 53 52 51 51 70 1 71 68 16 55 54 4 3 4 49 1 3 2 50 64 10 1 3 6 3 6 6 5 5 5 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2023 Isovalent */ #include <linux/bpf.h> #include <linux/bpf_mprog.h> static int bpf_mprog_link(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { struct bpf_link *link = ERR_PTR(-EINVAL); bool id = flags & BPF_F_ID; if (id) link = bpf_link_by_id(id_or_fd); else if (id_or_fd) link = bpf_link_get_from_fd(id_or_fd); if (IS_ERR(link)) return PTR_ERR(link); if (type && link->prog->type != type) { bpf_link_put(link); return -EINVAL; } tuple->link = link; tuple->prog = link->prog; return 0; } static int bpf_mprog_prog(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { struct bpf_prog *prog = ERR_PTR(-EINVAL); bool id = flags & BPF_F_ID; if (id) prog = bpf_prog_by_id(id_or_fd); else if (id_or_fd) prog = bpf_prog_get(id_or_fd); if (IS_ERR(prog)) return PTR_ERR(prog); if (type && prog->type != type) { bpf_prog_put(prog); return -EINVAL; } tuple->link = NULL; tuple->prog = prog; return 0; } static int bpf_mprog_tuple_relative(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { bool link = flags & BPF_F_LINK; bool id = flags & BPF_F_ID; memset(tuple, 0, sizeof(*tuple)); if (link) return bpf_mprog_link(tuple, id_or_fd, flags, type); /* If no relevant flag is set and no id_or_fd was passed, then * tuple link/prog is just NULLed. This is the case when before/ * after selects first/last position without passing fd. */ if (!id && !id_or_fd) return 0; return bpf_mprog_prog(tuple, id_or_fd, flags, type); } static void bpf_mprog_tuple_put(struct bpf_tuple *tuple) { if (tuple->link) bpf_link_put(tuple->link); else if (tuple->prog) bpf_prog_put(tuple->prog); } /* The bpf_mprog_{replace,delete}() operate on exact idx position with the * one exception that for deletion we support delete from front/back. In * case of front idx is -1, in case of back idx is bpf_mprog_total(entry). * Adjustment to first and last entry is trivial. The bpf_mprog_insert() * we have to deal with the following cases: * * idx + before: * * Insert P4 before P3: idx for old array is 1, idx for new array is 2, * hence we adjust target idx for the new array, so that memmove copies * P1 and P2 to the new entry, and we insert P4 into idx 2. Inserting * before P1 would have old idx -1 and new idx 0. * * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * * idx + after: * * Insert P4 after P2: idx for old array is 2, idx for new array is 2. * Again, memmove copies P1 and P2 to the new entry, and we insert P4 * into idx 2. Inserting after P3 would have both old/new idx at 4 aka * bpf_mprog_total(entry). * * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| * +--+--+--+ +--+--+--+--+ +--+--+--+--+ */ static int bpf_mprog_replace(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *ntuple, int idx) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; struct bpf_prog *oprog; bpf_mprog_read(entry, idx, &fp, &cp); oprog = READ_ONCE(fp->prog); bpf_mprog_write(fp, cp, ntuple); if (!ntuple->link) { WARN_ON_ONCE(cp->link); bpf_prog_put(oprog); } *entry_new = entry; return 0; } static int bpf_mprog_insert(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *ntuple, int idx, u32 flags) { int total = bpf_mprog_total(entry); struct bpf_mprog_entry *peer; struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; peer = bpf_mprog_peer(entry); bpf_mprog_entry_copy(peer, entry); if (idx == total) goto insert; else if (flags & BPF_F_BEFORE) idx += 1; bpf_mprog_entry_grow(peer, idx); insert: bpf_mprog_read(peer, idx, &fp, &cp); bpf_mprog_write(fp, cp, ntuple); bpf_mprog_inc(peer); *entry_new = peer; return 0; } static int bpf_mprog_delete(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *dtuple, int idx) { int total = bpf_mprog_total(entry); struct bpf_mprog_entry *peer; peer = bpf_mprog_peer(entry); bpf_mprog_entry_copy(peer, entry); if (idx == -1) idx = 0; else if (idx == total) idx = total - 1; bpf_mprog_entry_shrink(peer, idx); bpf_mprog_dec(peer); bpf_mprog_mark_for_release(peer, dtuple); *entry_new = peer; return 0; } /* In bpf_mprog_pos_*() we evaluate the target position for the BPF * program/link that needs to be replaced, inserted or deleted for * each "rule" independently. If all rules agree on that position * or existing element, then enact replacement, addition or deletion. * If this is not the case, then the request cannot be satisfied and * we bail out with an error. */ static int bpf_mprog_pos_exact(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog)) return tuple->link == cp->link ? i : -EBUSY; } return -ENOENT; } static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog) && (!tuple->link || tuple->link == cp->link)) return i - 1; } return tuple->prog ? -ENOENT : -1; } static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog) && (!tuple->link || tuple->link == cp->link)) return i + 1; } return tuple->prog ? -ENOENT : bpf_mprog_total(entry); } int bpf_mprog_attach(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_prog *prog_new, struct bpf_link *link, struct bpf_prog *prog_old, u32 flags, u32 id_or_fd, u64 revision) { struct bpf_tuple rtuple, ntuple = { .prog = prog_new, .link = link, }, otuple = { .prog = prog_old, .link = link, }; int ret, idx = -ERANGE, tidx; if (revision && revision != bpf_mprog_revision(entry)) return -ESTALE; if (bpf_mprog_exists(entry, prog_new)) return -EEXIST; ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags & ~BPF_F_REPLACE, prog_new->type); if (ret) return ret; if (flags & BPF_F_REPLACE) { tidx = bpf_mprog_pos_exact(entry, &otuple); if (tidx < 0) { ret = tidx; goto out; } idx = tidx; } else if (bpf_mprog_total(entry) == bpf_mprog_max()) { ret = -ERANGE; goto out; } if (flags & BPF_F_BEFORE) { tidx = bpf_mprog_pos_before(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < -1 ? tidx : -ERANGE; goto out; } idx = tidx; } if (flags & BPF_F_AFTER) { tidx = bpf_mprog_pos_after(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < 0 ? tidx : -ERANGE; goto out; } idx = tidx; } if (idx < -1) { if (rtuple.prog || flags) { ret = -EINVAL; goto out; } idx = bpf_mprog_total(entry); flags = BPF_F_AFTER; } if (idx >= bpf_mprog_max()) { ret = -ERANGE; goto out; } if (flags & BPF_F_REPLACE) ret = bpf_mprog_replace(entry, entry_new, &ntuple, idx); else ret = bpf_mprog_insert(entry, entry_new, &ntuple, idx, flags); out: bpf_mprog_tuple_put(&rtuple); return ret; } static int bpf_mprog_fetch(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple, int idx) { int total = bpf_mprog_total(entry); struct bpf_mprog_cp *cp; struct bpf_mprog_fp *fp; struct bpf_prog *prog; struct bpf_link *link; if (idx == -1) idx = 0; else if (idx == total) idx = total - 1; bpf_mprog_read(entry, idx, &fp, &cp); prog = READ_ONCE(fp->prog); link = cp->link; /* The deletion request can either be without filled tuple in which * case it gets populated here based on idx, or with filled tuple * where the only thing we end up doing is the WARN_ON_ONCE() assert. * If we hit a BPF link at the given index, it must not be removed * from opts path. */ if (link && !tuple->link) return -EBUSY; WARN_ON_ONCE(tuple->prog && tuple->prog != prog); WARN_ON_ONCE(tuple->link && tuple->link != link); tuple->prog = prog; tuple->link = link; return 0; } int bpf_mprog_detach(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_prog *prog, struct bpf_link *link, u32 flags, u32 id_or_fd, u64 revision) { struct bpf_tuple rtuple, dtuple = { .prog = prog, .link = link, }; int ret, idx = -ERANGE, tidx; if (flags & BPF_F_REPLACE) return -EINVAL; if (revision && revision != bpf_mprog_revision(entry)) return -ESTALE; if (!bpf_mprog_total(entry)) return -ENOENT; ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags, prog ? prog->type : BPF_PROG_TYPE_UNSPEC); if (ret) return ret; if (dtuple.prog) { tidx = bpf_mprog_pos_exact(entry, &dtuple); if (tidx < 0) { ret = tidx; goto out; } idx = tidx; } if (flags & BPF_F_BEFORE) { tidx = bpf_mprog_pos_before(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < -1 ? tidx : -ERANGE; goto out; } idx = tidx; } if (flags & BPF_F_AFTER) { tidx = bpf_mprog_pos_after(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < 0 ? tidx : -ERANGE; goto out; } idx = tidx; } if (idx < -1) { if (rtuple.prog || flags) { ret = -EINVAL; goto out; } idx = bpf_mprog_total(entry); flags = BPF_F_AFTER; } if (idx >= bpf_mprog_max()) { ret = -ERANGE; goto out; } ret = bpf_mprog_fetch(entry, &dtuple, idx); if (ret) goto out; ret = bpf_mprog_delete(entry, entry_new, &dtuple, idx); out: bpf_mprog_tuple_put(&rtuple); return ret; } int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, struct bpf_mprog_entry *entry) { u32 __user *uprog_flags, *ulink_flags; u32 __user *uprog_id, *ulink_id; struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; struct bpf_prog *prog; const u32 flags = 0; u32 id, count = 0; u64 revision = 1; int i, ret = 0; if (attr->query.query_flags || attr->query.attach_flags) return -EINVAL; if (entry) { revision = bpf_mprog_revision(entry); count = bpf_mprog_total(entry); } if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) return -EFAULT; if (copy_to_user(&uattr->query.count, &count, sizeof(count))) return -EFAULT; uprog_id = u64_to_user_ptr(attr->query.prog_ids); uprog_flags = u64_to_user_ptr(attr->query.prog_attach_flags); ulink_id = u64_to_user_ptr(attr->query.link_ids); ulink_flags = u64_to_user_ptr(attr->query.link_attach_flags); if (attr->query.count == 0 || !uprog_id || !count) return 0; if (attr->query.count < count) { count = attr->query.count; ret = -ENOSPC; } for (i = 0; i < bpf_mprog_max(); i++) { bpf_mprog_read(entry, i, &fp, &cp); prog = READ_ONCE(fp->prog); if (!prog) break; id = prog->aux->id; if (copy_to_user(uprog_id + i, &id, sizeof(id))) return -EFAULT; if (uprog_flags && copy_to_user(uprog_flags + i, &flags, sizeof(flags))) return -EFAULT; id = cp->link ? cp->link->id : 0; if (ulink_id && copy_to_user(ulink_id + i, &id, sizeof(id))) return -EFAULT; if (ulink_flags && copy_to_user(ulink_flags + i, &flags, sizeof(flags))) return -EFAULT; if (i + 1 == count) break; } return ret; }
206 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 #ifndef _CRYPTO_GCM_H #define _CRYPTO_GCM_H #include <linux/errno.h> #include <crypto/aes.h> #include <crypto/gf128mul.h> #define GCM_AES_IV_SIZE 12 #define GCM_RFC4106_IV_SIZE 8 #define GCM_RFC4543_IV_SIZE 8 /* * validate authentication tag for GCM */ static inline int crypto_gcm_check_authsize(unsigned int authsize) { switch (authsize) { case 4: case 8: case 12: case 13: case 14: case 15: case 16: break; default: return -EINVAL; } return 0; } /* * validate authentication tag for RFC4106 */ static inline int crypto_rfc4106_check_authsize(unsigned int authsize) { switch (authsize) { case 8: case 12: case 16: break; default: return -EINVAL; } return 0; } /* * validate assoclen for RFC4106/RFC4543 */ static inline int crypto_ipsec_check_assoclen(unsigned int assoclen) { switch (assoclen) { case 16: case 20: break; default: return -EINVAL; } return 0; } struct aesgcm_ctx { be128 ghash_key; struct crypto_aes_ctx aes_ctx; unsigned int authsize; }; int aesgcm_expandkey(struct aesgcm_ctx *ctx, const u8 *key, unsigned int keysize, unsigned int authsize); void aesgcm_encrypt(const struct aesgcm_ctx *ctx, u8 *dst, const u8 *src, int crypt_len, const u8 *assoc, int assoc_len, const u8 iv[GCM_AES_IV_SIZE], u8 *authtag); bool __must_check aesgcm_decrypt(const struct aesgcm_ctx *ctx, u8 *dst, const u8 *src, int crypt_len, const u8 *assoc, int assoc_len, const u8 iv[GCM_AES_IV_SIZE], const u8 *authtag); #endif
9 8 1 3 1 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:mac type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/if_ether.h> #include <net/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 #define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:mac"); /* Type specific function prefix */ #define HTYPE hash_mac /* Member elements */ struct hash_mac4_elem { /* Zero valued IP addresses cannot be stored */ union { unsigned char ether[ETH_ALEN]; __be32 foo[2]; }; }; /* Common functions */ static bool hash_mac4_data_equal(const struct hash_mac4_elem *e1, const struct hash_mac4_elem *e2, u32 *multi) { return ether_addr_equal(e1->ether, e2->ether); } static bool hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) { if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_mac4_data_next(struct hash_mac4_elem *next, const struct hash_mac4_elem *e) { } #define MTYPE hash_mac4 #define HOST_MASK 32 #define IP_SET_EMIT_CREATE #define IP_SET_PROTO_UNDEF #include "ip_set_hash_gen.h" static int hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; if (opt->flags & IPSET_DIM_ONE_SRC) ether_addr_copy(e.ether, eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); if (is_zero_ether_addr(e.ether)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_ETHER] || nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN)) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ether_addr_copy(e.ether, nla_data(tb[IPSET_ATTR_ETHER])); if (is_zero_ether_addr(e.ether)) return -IPSET_ERR_HASH_ELEM; return adtfn(set, &e, &ext, &ext, flags); } static struct ip_set_type hash_mac_type __read_mostly = { .name = "hash:mac", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_MAC, .dimension = IPSET_DIM_ONE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_mac_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_mac_init(void) { return ip_set_type_register(&hash_mac_type); } static void __exit hash_mac_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_mac_type); } module_init(hash_mac_init); module_exit(hash_mac_fini);
1095 136 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_NULLS_H #define _LINUX_LIST_NULLS_H #include <linux/poison.h> #include <linux/const.h> /* * Special version of lists, where end of list is not a NULL pointer, * but a 'nulls' marker, which can have many different values. * (up to 2^31 different values guaranteed on all platforms) * * In the standard hlist, termination of a list is the NULL pointer. * In this special 'nulls' variant, we use the fact that objects stored in * a list are aligned on a word (4 or 8 bytes alignment). * We therefore use the last significant bit of 'ptr' : * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1) * Set to 0 : This is a pointer to some object (ptr) */ struct hlist_nulls_head { struct hlist_nulls_node *first; }; struct hlist_nulls_node { struct hlist_nulls_node *next, **pprev; }; #define NULLS_MARKER(value) (1UL | (((long)value) << 1)) #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) #define HLIST_NULLS_HEAD_INIT(nulls) {.first = (struct hlist_nulls_node *)NULLS_MARKER(nulls)} #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_nulls_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \ }) /** * ptr_is_a_nulls - Test if a ptr is a nulls * @ptr: ptr to be tested * */ static inline int is_a_nulls(const struct hlist_nulls_node *ptr) { return ((unsigned long)ptr & 1); } /** * get_nulls_value - Get the 'nulls' value of the end of chain * @ptr: end of chain * * Should be called only if is_a_nulls(ptr); */ static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr) { return ((unsigned long)ptr) >> 1; } /** * hlist_nulls_unhashed - Has node been removed and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed state. * For example, hlist_del_init_rcu() leaves the node in unhashed state, * but hlist_nulls_del() does not. */ static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) { return !h->pprev; } /** * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed state. * For example, hlist_del_init_rcu() leaves the node in unhashed state, * but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this * function may be used locklessly. */ static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h) { return !READ_ONCE(h->pprev); } static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) { return is_a_nulls(READ_ONCE(h->first)); } static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *first = h->first; n->next = first; WRITE_ONCE(n->pprev, &h->first); h->first = n; if (!is_a_nulls(first)) WRITE_ONCE(first->pprev, &n->next); } static inline void __hlist_nulls_del(struct hlist_nulls_node *n) { struct hlist_nulls_node *next = n->next; struct hlist_nulls_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (!is_a_nulls(next)) WRITE_ONCE(next->pprev, pprev); } static inline void hlist_nulls_del(struct hlist_nulls_node *n) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_nulls_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * */ #define hlist_nulls_for_each_entry(tpos, pos, head, member) \ for (pos = (head)->first; \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @member: the name of the hlist_node within the struct. * */ #define hlist_nulls_for_each_entry_from(tpos, pos, member) \ for (; (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) #endif
688 594 594 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __NET_PSP_HELPERS_H #define __NET_PSP_HELPERS_H #include <linux/skbuff.h> #include <linux/rcupdate.h> #include <linux/udp.h> #include <net/sock.h> #include <net/tcp.h> #include <net/psp/types.h> struct inet_timewait_sock; /* Driver-facing API */ struct psp_dev * psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, void *priv_ptr); void psp_dev_unregister(struct psp_dev *psd); bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, u8 ver, __be16 sport); int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv); /* Kernel-facing API */ void psp_assoc_put(struct psp_assoc *pas); static inline void *psp_assoc_drv_data(struct psp_assoc *pas) { return pas->drv_data; } #if IS_ENABLED(CONFIG_INET_PSP) unsigned int psp_key_size(u32 version); void psp_sk_assoc_free(struct sock *sk); void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk); void psp_twsk_assoc_free(struct inet_timewait_sock *tw); void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb); static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) { return rcu_dereference_check(sk->psp_assoc, lockdep_sock_is_held(sk)); } static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { struct psp_assoc *pas; pas = psp_sk_assoc(sk); if (pas && pas->tx.spi) skb->decrypted = 1; } static inline unsigned long __psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, unsigned long diffs) { struct psp_skb_ext *a, *b; a = skb_ext_find(one, SKB_EXT_PSP); b = skb_ext_find(two, SKB_EXT_PSP); diffs |= (!!a) ^ (!!b); if (!diffs && unlikely(a)) diffs |= memcmp(a, b, sizeof(*a)); return diffs; } static inline bool psp_is_allowed_nondata(struct sk_buff *skb, struct psp_assoc *pas) { bool fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN); u32 end_seq = TCP_SKB_CB(skb)->end_seq; u32 seq = TCP_SKB_CB(skb)->seq; bool pure_fin; pure_fin = fin && end_seq - seq == 1; return seq == end_seq || (pure_fin && seq == pas->upgrade_seq); } static inline bool psp_pse_matches_pas(struct psp_skb_ext *pse, struct psp_assoc *pas) { return pse && pas->rx.spi == pse->spi && pas->generation == pse->generation && pas->version == pse->version && pas->dev_id == pse->dev_id; } static inline enum skb_drop_reason __psp_sk_rx_policy_check(struct sk_buff *skb, struct psp_assoc *pas) { struct psp_skb_ext *pse = skb_ext_find(skb, SKB_EXT_PSP); if (!pas) return pse ? SKB_DROP_REASON_PSP_INPUT : 0; if (likely(psp_pse_matches_pas(pse, pas))) { if (unlikely(!pas->peer_tx)) pas->peer_tx = 1; return 0; } if (!pse) { if (!pas->tx.spi || (!pas->peer_tx && psp_is_allowed_nondata(skb, pas))) return 0; } return SKB_DROP_REASON_PSP_INPUT; } static inline enum skb_drop_reason psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) { return __psp_sk_rx_policy_check(skb, psp_sk_assoc(sk)); } static inline enum skb_drop_reason psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) { return __psp_sk_rx_policy_check(skb, rcu_dereference(tw->psp_assoc)); } static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk) { struct psp_assoc *pas; int state; state = READ_ONCE(sk->sk_state); if (!sk_is_inet(sk) || state == TCP_NEW_SYN_RECV) return NULL; pas = state == TCP_TIME_WAIT ? rcu_dereference(inet_twsk(sk)->psp_assoc) : rcu_dereference(sk->psp_assoc); return pas; } static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { if (!skb->decrypted || !skb->sk) return NULL; return psp_sk_get_assoc_rcu(skb->sk); } static inline unsigned int psp_sk_overhead(const struct sock *sk) { int psp_encap = sizeof(struct udphdr) + PSP_HDR_SIZE + PSP_TRL_SIZE; bool has_psp = rcu_access_pointer(sk->psp_assoc); return has_psp ? psp_encap : 0; } #else static inline void psp_sk_assoc_free(struct sock *sk) { } static inline void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } static inline void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb) { } static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) { return NULL; } static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { } static inline unsigned long __psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, unsigned long diffs) { return diffs; } static inline enum skb_drop_reason psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) { return 0; } static inline enum skb_drop_reason psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) { return 0; } static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { return NULL; } static inline unsigned int psp_sk_overhead(const struct sock *sk) { return 0; } #endif static inline unsigned long psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two) { return __psp_skb_coalesce_diff(one, two, 0); } #endif /* __NET_PSP_HELPERS_H */
61 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Trace point definitions for core RDMA functions. * * Author: Chuck Lever <chuck.lever@oracle.com> * * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM rdma_core #if !defined(_TRACE_RDMA_CORE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RDMA_CORE_H #include <linux/tracepoint.h> #include <rdma/ib_verbs.h> /* * enum ib_poll_context, from include/rdma/ib_verbs.h */ #define IB_POLL_CTX_LIST \ ib_poll_ctx(DIRECT) \ ib_poll_ctx(SOFTIRQ) \ ib_poll_ctx(WORKQUEUE) \ ib_poll_ctx_end(UNBOUND_WORKQUEUE) #undef ib_poll_ctx #undef ib_poll_ctx_end #define ib_poll_ctx(x) TRACE_DEFINE_ENUM(IB_POLL_##x); #define ib_poll_ctx_end(x) TRACE_DEFINE_ENUM(IB_POLL_##x); IB_POLL_CTX_LIST #undef ib_poll_ctx #undef ib_poll_ctx_end #define ib_poll_ctx(x) { IB_POLL_##x, #x }, #define ib_poll_ctx_end(x) { IB_POLL_##x, #x } #define rdma_show_ib_poll_ctx(x) \ __print_symbolic(x, IB_POLL_CTX_LIST) /** ** Completion Queue events **/ TRACE_EVENT(cq_schedule, TP_PROTO( struct ib_cq *cq ), TP_ARGS(cq), TP_STRUCT__entry( __field(u32, cq_id) ), TP_fast_assign( cq->timestamp = ktime_get(); cq->interrupt = true; __entry->cq_id = cq->res.id; ), TP_printk("cq.id=%u", __entry->cq_id) ); TRACE_EVENT(cq_reschedule, TP_PROTO( struct ib_cq *cq ), TP_ARGS(cq), TP_STRUCT__entry( __field(u32, cq_id) ), TP_fast_assign( cq->timestamp = ktime_get(); cq->interrupt = false; __entry->cq_id = cq->res.id; ), TP_printk("cq.id=%u", __entry->cq_id) ); TRACE_EVENT(cq_process, TP_PROTO( const struct ib_cq *cq ), TP_ARGS(cq), TP_STRUCT__entry( __field(u32, cq_id) __field(bool, interrupt) __field(s64, latency) ), TP_fast_assign( ktime_t latency = ktime_sub(ktime_get(), cq->timestamp); __entry->cq_id = cq->res.id; __entry->latency = ktime_to_us(latency); __entry->interrupt = cq->interrupt; ), TP_printk("cq.id=%u wake-up took %lld [us] from %s", __entry->cq_id, __entry->latency, __entry->interrupt ? "interrupt" : "reschedule" ) ); TRACE_EVENT(cq_poll, TP_PROTO( const struct ib_cq *cq, int requested, int rc ), TP_ARGS(cq, requested, rc), TP_STRUCT__entry( __field(u32, cq_id) __field(int, requested) __field(int, rc) ), TP_fast_assign( __entry->cq_id = cq->res.id; __entry->requested = requested; __entry->rc = rc; ), TP_printk("cq.id=%u requested %d, returned %d", __entry->cq_id, __entry->requested, __entry->rc ) ); TRACE_EVENT(cq_drain_complete, TP_PROTO( const struct ib_cq *cq ), TP_ARGS(cq), TP_STRUCT__entry( __field(u32, cq_id) ), TP_fast_assign( __entry->cq_id = cq->res.id; ), TP_printk("cq.id=%u", __entry->cq_id ) ); TRACE_EVENT(cq_modify, TP_PROTO( const struct ib_cq *cq, u16 comps, u16 usec ), TP_ARGS(cq, comps, usec), TP_STRUCT__entry( __field(u32, cq_id) __field(unsigned int, comps) __field(unsigned int, usec) ), TP_fast_assign( __entry->cq_id = cq->res.id; __entry->comps = comps; __entry->usec = usec; ), TP_printk("cq.id=%u comps=%u usec=%u", __entry->cq_id, __entry->comps, __entry->usec ) ); TRACE_EVENT(cq_alloc, TP_PROTO( const struct ib_cq *cq, int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx ), TP_ARGS(cq, nr_cqe, comp_vector, poll_ctx), TP_STRUCT__entry( __field(u32, cq_id) __field(int, nr_cqe) __field(int, comp_vector) __field(unsigned long, poll_ctx) ), TP_fast_assign( __entry->cq_id = cq->res.id; __entry->nr_cqe = nr_cqe; __entry->comp_vector = comp_vector; __entry->poll_ctx = poll_ctx; ), TP_printk("cq.id=%u nr_cqe=%d comp_vector=%d poll_ctx=%s", __entry->cq_id, __entry->nr_cqe, __entry->comp_vector, rdma_show_ib_poll_ctx(__entry->poll_ctx) ) ); TRACE_EVENT(cq_alloc_error, TP_PROTO( int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx, int rc ), TP_ARGS(nr_cqe, comp_vector, poll_ctx, rc), TP_STRUCT__entry( __field(int, rc) __field(int, nr_cqe) __field(int, comp_vector) __field(unsigned long, poll_ctx) ), TP_fast_assign( __entry->rc = rc; __entry->nr_cqe = nr_cqe; __entry->comp_vector = comp_vector; __entry->poll_ctx = poll_ctx; ), TP_printk("nr_cqe=%d comp_vector=%d poll_ctx=%s rc=%d", __entry->nr_cqe, __entry->comp_vector, rdma_show_ib_poll_ctx(__entry->poll_ctx), __entry->rc ) ); TRACE_EVENT(cq_free, TP_PROTO( const struct ib_cq *cq ), TP_ARGS(cq), TP_STRUCT__entry( __field(u32, cq_id) ), TP_fast_assign( __entry->cq_id = cq->res.id; ), TP_printk("cq.id=%u", __entry->cq_id) ); /** ** Memory Region events **/ /* * enum ib_mr_type, from include/rdma/ib_verbs.h */ #define IB_MR_TYPE_LIST \ ib_mr_type_item(MEM_REG) \ ib_mr_type_item(SG_GAPS) \ ib_mr_type_item(DM) \ ib_mr_type_item(USER) \ ib_mr_type_item(DMA) \ ib_mr_type_end(INTEGRITY) #undef ib_mr_type_item #undef ib_mr_type_end #define ib_mr_type_item(x) TRACE_DEFINE_ENUM(IB_MR_TYPE_##x); #define ib_mr_type_end(x) TRACE_DEFINE_ENUM(IB_MR_TYPE_##x); IB_MR_TYPE_LIST #undef ib_mr_type_item #undef ib_mr_type_end #define ib_mr_type_item(x) { IB_MR_TYPE_##x, #x }, #define ib_mr_type_end(x) { IB_MR_TYPE_##x, #x } #define rdma_show_ib_mr_type(x) \ __print_symbolic(x, IB_MR_TYPE_LIST) TRACE_EVENT(mr_alloc, TP_PROTO( const struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg, const struct ib_mr *mr ), TP_ARGS(pd, mr_type, max_num_sg, mr), TP_STRUCT__entry( __field(u32, pd_id) __field(u32, mr_id) __field(u32, max_num_sg) __field(int, rc) __field(unsigned long, mr_type) ), TP_fast_assign( __entry->pd_id = pd->res.id; if (IS_ERR(mr)) { __entry->mr_id = 0; __entry->rc = PTR_ERR(mr); } else { __entry->mr_id = mr->res.id; __entry->rc = 0; } __entry->max_num_sg = max_num_sg; __entry->mr_type = mr_type; ), TP_printk("pd.id=%u mr.id=%u type=%s max_num_sg=%u rc=%d", __entry->pd_id, __entry->mr_id, rdma_show_ib_mr_type(__entry->mr_type), __entry->max_num_sg, __entry->rc) ); TRACE_EVENT(mr_integ_alloc, TP_PROTO( const struct ib_pd *pd, u32 max_num_data_sg, u32 max_num_meta_sg, const struct ib_mr *mr ), TP_ARGS(pd, max_num_data_sg, max_num_meta_sg, mr), TP_STRUCT__entry( __field(u32, pd_id) __field(u32, mr_id) __field(u32, max_num_data_sg) __field(u32, max_num_meta_sg) __field(int, rc) ), TP_fast_assign( __entry->pd_id = pd->res.id; if (IS_ERR(mr)) { __entry->mr_id = 0; __entry->rc = PTR_ERR(mr); } else { __entry->mr_id = mr->res.id; __entry->rc = 0; } __entry->max_num_data_sg = max_num_data_sg; __entry->max_num_meta_sg = max_num_meta_sg; ), TP_printk("pd.id=%u mr.id=%u max_num_data_sg=%u max_num_meta_sg=%u rc=%d", __entry->pd_id, __entry->mr_id, __entry->max_num_data_sg, __entry->max_num_meta_sg, __entry->rc) ); TRACE_EVENT(mr_dereg, TP_PROTO( const struct ib_mr *mr ), TP_ARGS(mr), TP_STRUCT__entry( __field(u32, id) ), TP_fast_assign( __entry->id = mr->res.id; ), TP_printk("mr.id=%u", __entry->id) ); #endif /* _TRACE_RDMA_CORE_H */ #include <trace/define_trace.h>
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_SCSI_DEVICE_H #define _SCSI_SCSI_DEVICE_H #include <linux/list.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/blk-mq.h> #include <scsi/scsi.h> #include <linux/atomic.h> #include <linux/sbitmap.h> struct bsg_device; struct device; struct request_queue; struct scsi_cmnd; struct scsi_lun; struct scsi_sense_hdr; typedef __u64 __bitwise blist_flags_t; #define SCSI_SENSE_BUFFERSIZE 96 struct scsi_mode_data { __u32 length; __u16 block_descriptor_length; __u8 medium_type; __u8 device_specific; __u8 header_length; __u8 longlba:1; }; /* * sdev state: If you alter this, you also need to alter scsi_sysfs.c * (for the ascii descriptions) and the state model enforcer: * scsi_lib:scsi_device_set_state(). */ enum scsi_device_state { SDEV_CREATED = 1, /* device created but not added to sysfs * Only internal commands allowed (for inq) */ SDEV_RUNNING, /* device properly configured * All commands allowed */ SDEV_CANCEL, /* beginning to delete device * Only error handler commands allowed */ SDEV_DEL, /* device deleted * no commands allowed */ SDEV_QUIESCE, /* Device quiescent. No block commands * will be accepted, only specials (which * originate in the mid-layer) */ SDEV_OFFLINE, /* Device offlined (by error handling or * user request */ SDEV_TRANSPORT_OFFLINE, /* Offlined by transport class error handler */ SDEV_BLOCK, /* Device blocked by scsi lld. No * scsi commands from user or midlayer * should be issued to the scsi * lld. */ SDEV_CREATED_BLOCK, /* same as above but for created devices */ }; enum scsi_scan_mode { SCSI_SCAN_INITIAL = 0, SCSI_SCAN_RESCAN, SCSI_SCAN_MANUAL, }; enum scsi_device_event { SDEV_EVT_MEDIA_CHANGE = 1, /* media has changed */ SDEV_EVT_INQUIRY_CHANGE_REPORTED, /* 3F 03 UA reported */ SDEV_EVT_CAPACITY_CHANGE_REPORTED, /* 2A 09 UA reported */ SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED, /* 38 07 UA reported */ SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED, /* 2A 01 UA reported */ SDEV_EVT_LUN_CHANGE_REPORTED, /* 3F 0E UA reported */ SDEV_EVT_ALUA_STATE_CHANGE_REPORTED, /* 2A 06 UA reported */ SDEV_EVT_POWER_ON_RESET_OCCURRED, /* 29 00 UA reported */ SDEV_EVT_FIRST = SDEV_EVT_MEDIA_CHANGE, SDEV_EVT_LAST = SDEV_EVT_POWER_ON_RESET_OCCURRED, SDEV_EVT_MAXBITS = SDEV_EVT_LAST + 1 }; struct scsi_event { enum scsi_device_event evt_type; struct list_head node; /* put union of data structures, for non-simple event types, * here */ }; /** * struct scsi_vpd - SCSI Vital Product Data * @rcu: For kfree_rcu(). * @len: Length in bytes of @data. * @data: VPD data as defined in various T10 SCSI standard documents. */ struct scsi_vpd { struct rcu_head rcu; int len; unsigned char data[]; }; struct scsi_device { struct Scsi_Host *host; struct request_queue *request_queue; /* the next two are protected by the host->host_lock */ struct list_head siblings; /* list of all devices on this host */ struct list_head same_target_siblings; /* just the devices sharing same target id */ struct sbitmap budget_map; atomic_t device_blocked; /* Device returned QUEUE_FULL. */ atomic_t restarts; spinlock_t list_lock; struct list_head starved_entry; unsigned short queue_depth; /* How deep of a queue we want */ unsigned short max_queue_depth; /* max queue depth */ unsigned short last_queue_full_depth; /* These two are used by */ unsigned short last_queue_full_count; /* scsi_track_queue_full() */ unsigned long last_queue_full_time; /* last queue full time */ unsigned long queue_ramp_up_period; /* ramp up period in jiffies */ #define SCSI_DEFAULT_RAMP_UP_PERIOD (120 * HZ) unsigned long last_queue_ramp_up; /* last queue ramp up time */ unsigned int id, channel; u64 lun; unsigned int manufacturer; /* Manufacturer of device, for using * vendor-specific cmd's */ unsigned sector_size; /* size in bytes */ void *hostdata; /* available to low-level driver */ unsigned char type; char scsi_level; char inq_periph_qual; /* PQ from INQUIRY data */ struct mutex inquiry_mutex; unsigned char inquiry_len; /* valid bytes in 'inquiry' */ unsigned char * inquiry; /* INQUIRY response data */ const char * vendor; /* [back_compat] point into 'inquiry' ... */ const char * model; /* ... after scan; point to static string */ const char * rev; /* ... "nullnullnullnull" before scan */ #define SCSI_DEFAULT_VPD_LEN 255 /* default SCSI VPD page size (max) */ struct scsi_vpd __rcu *vpd_pg0; struct scsi_vpd __rcu *vpd_pg83; struct scsi_vpd __rcu *vpd_pg80; struct scsi_vpd __rcu *vpd_pg89; struct scsi_vpd __rcu *vpd_pgb0; struct scsi_vpd __rcu *vpd_pgb1; struct scsi_vpd __rcu *vpd_pgb2; struct scsi_vpd __rcu *vpd_pgb7; struct scsi_target *sdev_target; blist_flags_t sdev_bflags; /* black/white flags as also found in * scsi_devinfo.[hc]. For now used only to * pass settings from sdev_init to scsi * core. */ unsigned int eh_timeout; /* Error handling timeout */ /* * If true, let the high-level device driver (sd) manage the device * power state for system suspend/resume (suspend to RAM and * hibernation) operations. */ unsigned manage_system_start_stop:1; /* * If true, let the high-level device driver (sd) manage the device * power state for runtime device suspand and resume operations. */ unsigned manage_runtime_start_stop:1; /* * If true, let the high-level device driver (sd) manage the device * power state for system shutdown (power off) operations. */ unsigned manage_shutdown:1; /* * If true, let the high-level device driver (sd) manage the device * power state for system restart (reboot) operations. */ unsigned manage_restart:1; /* * If set and if the device is runtime suspended, ask the high-level * device driver (sd) to force a runtime resume of the device. */ unsigned force_runtime_start_on_system_start:1; /* * Set if the device is an ATA device. */ unsigned is_ata:1; unsigned removable:1; unsigned changed:1; /* Data invalid due to media change */ unsigned busy:1; /* Used to prevent races */ unsigned lockable:1; /* Able to prevent media removal */ unsigned locked:1; /* Media removal disabled */ unsigned borken:1; /* Tell the Seagate driver to be * painfully slow on this device */ unsigned disconnect:1; /* can disconnect */ unsigned soft_reset:1; /* Uses soft reset option */ unsigned sdtr:1; /* Device supports SDTR messages */ unsigned wdtr:1; /* Device supports WDTR messages */ unsigned ppr:1; /* Device supports PPR messages */ unsigned tagged_supported:1; /* Supports SCSI-II tagged queuing */ unsigned simple_tags:1; /* simple queue tag messages are enabled */ unsigned was_reset:1; /* There was a bus reset on the bus for * this device */ unsigned expecting_cc_ua:1; /* Expecting a CHECK_CONDITION/UNIT_ATTN * because we did a bus reset. */ unsigned use_10_for_rw:1; /* first try 10-byte read / write */ unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */ unsigned set_dbd_for_ms:1; /* Set "DBD" field in mode sense */ unsigned read_before_ms:1; /* perform a READ before MODE SENSE */ unsigned no_report_opcodes:1; /* no REPORT SUPPORTED OPERATION CODES */ unsigned no_write_same:1; /* no WRITE SAME command */ unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */ unsigned use_16_for_sync:1; /* Use sync (16) over sync (10) */ unsigned skip_ms_page_8:1; /* do not use MODE SENSE page 0x08 */ unsigned skip_ms_page_3f:1; /* do not use MODE SENSE page 0x3f */ unsigned skip_vpd_pages:1; /* do not read VPD pages */ unsigned try_vpd_pages:1; /* attempt to read VPD pages */ unsigned use_192_bytes_for_3f:1; /* ask for 192 bytes from page 0x3f */ unsigned no_start_on_add:1; /* do not issue start on add */ unsigned allow_restart:1; /* issue START_UNIT in error handler */ unsigned start_stop_pwr_cond:1; /* Set power cond. in START_STOP_UNIT */ unsigned no_uld_attach:1; /* disable connecting to upper level drivers */ unsigned select_no_atn:1; unsigned fix_capacity:1; /* READ_CAPACITY is too high by 1 */ unsigned guess_capacity:1; /* READ_CAPACITY might be too high by 1 */ unsigned retry_hwerror:1; /* Retry HARDWARE_ERROR */ unsigned last_sector_bug:1; /* do not use multisector accesses on SD_LAST_BUGGY_SECTORS */ unsigned no_read_disc_info:1; /* Avoid READ_DISC_INFO cmds */ unsigned no_read_capacity_16:1; /* Avoid READ_CAPACITY_16 cmds */ unsigned try_rc_10_first:1; /* Try READ_CAPACACITY_10 first */ unsigned security_supported:1; /* Supports Security Protocols */ unsigned is_visible:1; /* is the device visible in sysfs */ unsigned wce_default_on:1; /* Cache is ON by default */ unsigned no_dif:1; /* T10 PI (DIF) should be disabled */ unsigned broken_fua:1; /* Don't set FUA bit */ unsigned lun_in_cdb:1; /* Store LUN bits in CDB[1] */ unsigned unmap_limit_for_ws:1; /* Use the UNMAP limit for WRITE SAME */ unsigned rpm_autosuspend:1; /* Enable runtime autosuspend at device * creation time */ unsigned ignore_media_change:1; /* Ignore MEDIA CHANGE on resume */ unsigned silence_suspend:1; /* Do not print runtime PM related messages */ unsigned no_vpd_size:1; /* No VPD size reported in header */ unsigned cdl_supported:1; /* Command duration limits supported */ unsigned cdl_enable:1; /* Enable/disable Command duration limits */ unsigned int queue_stopped; /* request queue is quiesced */ bool offline_already; /* Device offline message logged */ atomic_t ua_new_media_ctr; /* Counter for New Media UNIT ATTENTIONs */ atomic_t ua_por_ctr; /* Counter for Power On / Reset UAs */ atomic_t disk_events_disable_depth; /* disable depth for disk events */ DECLARE_BITMAP(supported_events, SDEV_EVT_MAXBITS); /* supported events */ DECLARE_BITMAP(pending_events, SDEV_EVT_MAXBITS); /* pending events */ struct list_head event_list; /* asserted events */ struct work_struct event_work; unsigned int max_device_blocked; /* what device_blocked counts down from */ #define SCSI_DEFAULT_DEVICE_BLOCKED 3 atomic_t iorequest_cnt; atomic_t iodone_cnt; atomic_t ioerr_cnt; atomic_t iotmo_cnt; struct device sdev_gendev, sdev_dev; struct work_struct requeue_work; struct scsi_device_handler *handler; void *handler_data; size_t dma_drain_len; void *dma_drain_buf; unsigned int sg_timeout; unsigned int sg_reserved_size; struct bsg_device *bsg_dev; unsigned char access_state; struct mutex state_mutex; enum scsi_device_state sdev_state; struct task_struct *quiesced_by; unsigned long sdev_data[]; } __attribute__((aligned(sizeof(unsigned long)))); #define to_scsi_device(d) \ container_of(d, struct scsi_device, sdev_gendev) #define class_to_sdev(d) \ container_of(d, struct scsi_device, sdev_dev) #define transport_class_to_sdev(class_dev) \ to_scsi_device(class_dev->parent) #define sdev_dbg(sdev, fmt, a...) \ dev_dbg(&(sdev)->sdev_gendev, fmt, ##a) /* * like scmd_printk, but the device name is passed in * as a string pointer */ __printf(4, 5) void sdev_prefix_printk(const char *, const struct scsi_device *, const char *, const char *, ...); #define sdev_printk(l, sdev, fmt, a...) \ sdev_prefix_printk(l, sdev, NULL, fmt, ##a) __printf(3, 4) void scmd_printk(const char *, struct scsi_cmnd *, const char *, ...); #define scmd_dbg(scmd, fmt, a...) \ do { \ struct request *__rq = scsi_cmd_to_rq((scmd)); \ \ if (__rq->q->disk) \ sdev_dbg((scmd)->device, "[%s] " fmt, \ __rq->q->disk->disk_name, ##a); \ else \ sdev_dbg((scmd)->device, fmt, ##a); \ } while (0) enum scsi_target_state { STARGET_CREATED = 1, STARGET_RUNNING, STARGET_REMOVE, STARGET_CREATED_REMOVE, STARGET_DEL, }; /* * scsi_target: representation of a scsi target, for now, this is only * used for single_lun devices. If no one has active IO to the target, * starget_sdev_user is NULL, else it points to the active sdev. */ struct scsi_target { struct scsi_device *starget_sdev_user; struct list_head siblings; struct list_head devices; struct device dev; struct kref reap_ref; /* last put renders target invisible */ unsigned int channel; unsigned int id; /* target id ... replace * scsi_device.id eventually */ unsigned int create:1; /* signal that it needs to be added */ unsigned int single_lun:1; /* Indicates we should only * allow I/O to one of the luns * for the device at a time. */ unsigned int pdt_1f_for_no_lun:1; /* PDT = 0x1f * means no lun present. */ unsigned int no_report_luns:1; /* Don't use * REPORT LUNS for scanning. */ unsigned int expecting_lun_change:1; /* A device has reported * a 3F/0E UA, other devices on * the same target will also. */ /* commands actually active on LLD. */ atomic_t target_busy; atomic_t target_blocked; /* * LLDs should set this in the sdev_init host template callout. * If set to zero then there is not limit. */ unsigned int can_queue; unsigned int max_target_blocked; #define SCSI_DEFAULT_TARGET_BLOCKED 3 char scsi_level; enum scsi_target_state state; void *hostdata; /* available to low-level driver */ unsigned long starget_data[]; /* for the transport */ /* starget_data must be the last element!!!! */ } __attribute__((aligned(sizeof(unsigned long)))); #define to_scsi_target(d) container_of(d, struct scsi_target, dev) static inline struct scsi_target *scsi_target(struct scsi_device *sdev) { return to_scsi_target(sdev->sdev_gendev.parent); } #define transport_class_to_starget(class_dev) \ to_scsi_target(class_dev->parent) #define starget_printk(prefix, starget, fmt, a...) \ dev_printk(prefix, &(starget)->dev, fmt, ##a) extern struct scsi_device *__scsi_add_device(struct Scsi_Host *, uint, uint, u64, void *hostdata); extern int scsi_add_device(struct Scsi_Host *host, uint channel, uint target, u64 lun); extern int scsi_register_device_handler(struct scsi_device_handler *scsi_dh); extern void scsi_remove_device(struct scsi_device *); extern int scsi_unregister_device_handler(struct scsi_device_handler *scsi_dh); void scsi_attach_vpd(struct scsi_device *sdev); void scsi_cdl_check(struct scsi_device *sdev); int scsi_cdl_enable(struct scsi_device *sdev, bool enable); extern struct scsi_device *scsi_device_from_queue(struct request_queue *q); extern int __must_check scsi_device_get(struct scsi_device *); extern void scsi_device_put(struct scsi_device *); extern struct scsi_device *scsi_device_lookup(struct Scsi_Host *, uint, uint, u64); extern struct scsi_device *__scsi_device_lookup(struct Scsi_Host *, uint, uint, u64); extern struct scsi_device *scsi_device_lookup_by_target(struct scsi_target *, u64); extern struct scsi_device *__scsi_device_lookup_by_target(struct scsi_target *, u64); extern void starget_for_each_device(struct scsi_target *, void *, void (*fn)(struct scsi_device *, void *)); extern void __starget_for_each_device(struct scsi_target *, void *, void (*fn)(struct scsi_device *, void *)); /* only exposed to implement shost_for_each_device */ extern struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *, struct scsi_device *); /** * shost_for_each_device - iterate over all devices of a host * @sdev: the &struct scsi_device to use as a cursor * @shost: the &struct scsi_host to iterate over * * Iterator that returns each device attached to @shost. This loop * takes a reference on each device and releases it at the end. If * you break out of the loop, you must call scsi_device_put(sdev). */ #define shost_for_each_device(sdev, shost) \ for ((sdev) = __scsi_iterate_devices((shost), NULL); \ (sdev); \ (sdev) = __scsi_iterate_devices((shost), (sdev))) /** * __shost_for_each_device - iterate over all devices of a host (UNLOCKED) * @sdev: the &struct scsi_device to use as a cursor * @shost: the &struct scsi_host to iterate over * * Iterator that returns each device attached to @shost. It does _not_ * take a reference on the scsi_device, so the whole loop must be * protected by shost->host_lock. * * Note: The only reason to use this is because you need to access the * device list in interrupt context. Otherwise you really want to use * shost_for_each_device instead. */ #define __shost_for_each_device(sdev, shost) \ list_for_each_entry((sdev), &((shost)->__devices), siblings) extern int scsi_change_queue_depth(struct scsi_device *, int); extern int scsi_track_queue_full(struct scsi_device *, int); extern int scsi_set_medium_removal(struct scsi_device *, char); int scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage, int subpage, unsigned char *buffer, int len, int timeout, int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *); extern int scsi_mode_select(struct scsi_device *sdev, int pf, int sp, unsigned char *buffer, int len, int timeout, int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *); extern int scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries, struct scsi_sense_hdr *sshdr); extern int scsi_get_vpd_page(struct scsi_device *, u8 page, unsigned char *buf, int buf_len); int scsi_report_opcode(struct scsi_device *sdev, unsigned char *buffer, unsigned int len, unsigned char opcode, unsigned short sa); extern int scsi_device_set_state(struct scsi_device *sdev, enum scsi_device_state state); extern struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type, gfp_t gfpflags); extern void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt); extern void sdev_evt_send_simple(struct scsi_device *sdev, enum scsi_device_event evt_type, gfp_t gfpflags); extern int scsi_device_quiesce(struct scsi_device *sdev); extern void scsi_device_resume(struct scsi_device *sdev); extern void scsi_target_quiesce(struct scsi_target *); extern void scsi_target_resume(struct scsi_target *); extern void scsi_scan_target(struct device *parent, unsigned int channel, unsigned int id, u64 lun, enum scsi_scan_mode rescan); extern void scsi_target_reap(struct scsi_target *); void scsi_block_targets(struct Scsi_Host *shost, struct device *dev); extern void scsi_target_unblock(struct device *, enum scsi_device_state); extern void scsi_remove_target(struct device *); extern const char *scsi_device_state_name(enum scsi_device_state); extern int scsi_is_sdev_device(const struct device *); extern int scsi_is_target_device(const struct device *); extern void scsi_sanitize_inquiry_string(unsigned char *s, int len); /* * scsi_execute_cmd users can set scsi_failure.result to have * scsi_check_passthrough fail/retry a command. scsi_failure.result can be a * specific host byte or message code, or SCMD_FAILURE_RESULT_ANY can be used * to match any host or message code. */ #define SCMD_FAILURE_RESULT_ANY 0x7fffffff /* * Set scsi_failure.result to SCMD_FAILURE_STAT_ANY to fail/retry any failure * scsi_status_is_good returns false for. */ #define SCMD_FAILURE_STAT_ANY 0xff /* * The following can be set to the scsi_failure sense, asc and ascq fields to * match on any sense, ASC, or ASCQ value. */ #define SCMD_FAILURE_SENSE_ANY 0xff #define SCMD_FAILURE_ASC_ANY 0xff #define SCMD_FAILURE_ASCQ_ANY 0xff /* Always retry a matching failure. */ #define SCMD_FAILURE_NO_LIMIT -1 struct scsi_failure { int result; u8 sense; u8 asc; u8 ascq; /* * Number of times scsi_execute_cmd will retry the failure. It does * not count for the total_allowed. */ s8 allowed; /* Number of times the failure has been retried. */ s8 retries; }; struct scsi_failures { /* * If a scsi_failure does not have a retry limit setup this limit will * be used. */ int total_allowed; int total_retries; struct scsi_failure *failure_definitions; }; /* Optional arguments to scsi_execute_cmd */ struct scsi_exec_args { unsigned char *sense; /* sense buffer */ unsigned int sense_len; /* sense buffer len */ struct scsi_sense_hdr *sshdr; /* decoded sense header */ blk_mq_req_flags_t req_flags; /* BLK_MQ_REQ flags */ int scmd_flags; /* SCMD flags */ int *resid; /* residual length */ struct scsi_failures *failures; /* failures to retry */ }; int scsi_execute_cmd(struct scsi_device *sdev, const unsigned char *cmd, blk_opf_t opf, void *buffer, unsigned int bufflen, int timeout, int retries, const struct scsi_exec_args *args); void scsi_failures_reset_retries(struct scsi_failures *failures); struct scsi_cmnd *scsi_get_internal_cmd(struct scsi_device *sdev, enum dma_data_direction data_direction, blk_mq_req_flags_t flags); void scsi_put_internal_cmd(struct scsi_cmnd *scmd); extern void sdev_disable_disk_events(struct scsi_device *sdev); extern void sdev_enable_disk_events(struct scsi_device *sdev); extern int scsi_vpd_lun_id(struct scsi_device *, char *, size_t); extern int scsi_vpd_tpg_id(struct scsi_device *, int *); #ifdef CONFIG_PM extern int scsi_autopm_get_device(struct scsi_device *); extern void scsi_autopm_put_device(struct scsi_device *); #else static inline int scsi_autopm_get_device(struct scsi_device *d) { return 0; } static inline void scsi_autopm_put_device(struct scsi_device *d) {} #endif /* CONFIG_PM */ static inline int __must_check scsi_device_reprobe(struct scsi_device *sdev) { return device_reprobe(&sdev->sdev_gendev); } static inline unsigned int sdev_channel(struct scsi_device *sdev) { return sdev->channel; } static inline unsigned int sdev_id(struct scsi_device *sdev) { return sdev->id; } #define scmd_id(scmd) sdev_id((scmd)->device) #define scmd_channel(scmd) sdev_channel((scmd)->device) /** * scsi_device_is_pseudo_dev() - Whether a device is a pseudo SCSI device. * @sdev: SCSI device to examine * * A pseudo SCSI device can be used to allocate SCSI commands but does not show * up in sysfs. Additionally, the logical unit information in *@sdev is made up. * * This function tests the LUN number instead of comparing @sdev with * @sdev->host->pseudo_sdev because this function may be called before * @sdev->host->pseudo_sdev has been initialized. */ static inline bool scsi_device_is_pseudo_dev(struct scsi_device *sdev) { return sdev->lun == U64_MAX; } /* * checks for positions of the SCSI state machine */ static inline int scsi_device_online(struct scsi_device *sdev) { return (sdev->sdev_state != SDEV_OFFLINE && sdev->sdev_state != SDEV_TRANSPORT_OFFLINE && sdev->sdev_state != SDEV_DEL); } static inline int scsi_device_blocked(struct scsi_device *sdev) { return sdev->sdev_state == SDEV_BLOCK || sdev->sdev_state == SDEV_CREATED_BLOCK; } static inline int scsi_device_created(struct scsi_device *sdev) { return sdev->sdev_state == SDEV_CREATED || sdev->sdev_state == SDEV_CREATED_BLOCK; } int scsi_internal_device_block_nowait(struct scsi_device *sdev); int scsi_internal_device_unblock_nowait(struct scsi_device *sdev, enum scsi_device_state new_state); /* accessor functions for the SCSI parameters */ static inline int scsi_device_sync(struct scsi_device *sdev) { return sdev->sdtr; } static inline int scsi_device_wide(struct scsi_device *sdev) { return sdev->wdtr; } static inline int scsi_device_dt(struct scsi_device *sdev) { return sdev->ppr; } static inline int scsi_device_dt_only(struct scsi_device *sdev) { if (sdev->inquiry_len < 57) return 0; return (sdev->inquiry[56] & 0x0c) == 0x04; } static inline int scsi_device_ius(struct scsi_device *sdev) { if (sdev->inquiry_len < 57) return 0; return sdev->inquiry[56] & 0x01; } static inline int scsi_device_qas(struct scsi_device *sdev) { if (sdev->inquiry_len < 57) return 0; return sdev->inquiry[56] & 0x02; } static inline int scsi_device_enclosure(struct scsi_device *sdev) { return sdev->inquiry ? (sdev->inquiry[6] & (1<<6)) : 1; } static inline int scsi_device_protection(struct scsi_device *sdev) { if (sdev->no_dif) return 0; return sdev->scsi_level > SCSI_2 && sdev->inquiry[5] & (1<<0); } static inline int scsi_device_tpgs(struct scsi_device *sdev) { return sdev->inquiry ? (sdev->inquiry[5] >> 4) & 0x3 : 0; } /** * scsi_device_supports_vpd - test if a device supports VPD pages * @sdev: the &struct scsi_device to test * * If the 'try_vpd_pages' flag is set it takes precedence. * Otherwise we will assume VPD pages are supported if the * SCSI level is at least SPC-3 and 'skip_vpd_pages' is not set. */ static inline int scsi_device_supports_vpd(struct scsi_device *sdev) { /* Attempt VPD inquiry if the device blacklist explicitly calls * for it. */ if (sdev->try_vpd_pages) return 1; /* * Although VPD inquiries can go to SCSI-2 type devices, * some USB ones crash on receiving them, and the pages * we currently ask for are mandatory for SPC-2 and beyond */ if (sdev->scsi_level >= SCSI_SPC_2 && !sdev->skip_vpd_pages) return 1; return 0; } static inline int scsi_device_busy(struct scsi_device *sdev) { return sbitmap_weight(&sdev->budget_map); } /* Macros to access the UNIT ATTENTION counters */ #define scsi_get_ua_new_media_ctr(sdev) atomic_read(&sdev->ua_new_media_ctr) #define scsi_get_ua_por_ctr(sdev) atomic_read(&sdev->ua_por_ctr) #define MODULE_ALIAS_SCSI_DEVICE(type) \ MODULE_ALIAS("scsi:t-" __stringify(type) "*") #define SCSI_DEVICE_MODALIAS_FMT "scsi:t-0x%02x" #endif /* _SCSI_SCSI_DEVICE_H */
139 11 2 3 136 9 133 9 7 4 5 3 123 138 139 123 123 123 122 129 143 134 123 123 123 88 122 122 123 123 122 122 122 122 122 129 129 31 31 129 129 129 158 61 129 129 129 129 41 129 129 129 129 129 86 72 72 6 122 85 2 122 122 122 86 1 86 122 38 122 37 87 87 122 122 230 3 228 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 // SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec system call core code. * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/btf.h> #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/liveupdate.h> #include <linux/highmem.h> #include <linux/syscalls.h> #include <linux/reboot.h> #include <linux/ioport.h> #include <linux/hardirq.h> #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/utsname.h> #include <linux/numa.h> #include <linux/suspend.h> #include <linux/device.h> #include <linux/freezer.h> #include <linux/panic_notifier.h> #include <linux/pm.h> #include <linux/cpu.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/console.h> #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/syscore_ops.h> #include <linux/compiler.h> #include <linux/hugetlb.h> #include <linux/objtool.h> #include <linux/kmsg_dump.h> #include <linux/dma-map-ops.h> #include <linux/sysfs.h> #include <asm/page.h> #include <asm/sections.h> #include <crypto/hash.h> #include "kexec_internal.h" atomic_t __kexec_lock = ATOMIC_INIT(0); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; bool kexec_file_dbg_print; /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors * where you can disable the MMU this is trivial, and easy. For * others it is still a simple predictable page table to setup. * * In that environment kexec copies the new kernel to its final * resting place. This means I can only support memory whose * physical address can fit in an unsigned long. In particular * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. * If the assembly stub has more restrictive requirements * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be * defined more restrictively in <asm/kexec.h>. * * The code for the transition from the current kernel to the * new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from * virtual to physical addresses it must live in the range * 0 - TASK_SIZE, as only the user space mappings are arbitrarily * modifiable. * * The assembly stub in the control code buffer is passed a linked list * of descriptor pages detailing the source pages of the new kernel, * and the destination addresses of those source pages. As this data * structure is not used in the context of the current OS, it must * be self-contained. * * The code has been made to work with highmem pages and will use a * destination page in its final resting place (if it happens * to allocate it). The end product of this is that most of the * physical address space, and most of RAM can be used. * * Future directions include: * - allocating a page table with the control code buffer identity * mapped, to simplify machine_kexec and make kexec_on_panic more * reliable. */ /* * KIMAGE_NO_DEST is an impossible destination address..., for * allocating pages whose destination address we do not care about. */ #define KIMAGE_NO_DEST (-1UL) #define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); int sanity_check_segment_list(struct kimage *image) { int i; unsigned long nr_segments = image->nr_segments; unsigned long total_pages = 0; unsigned long nr_pages = totalram_pages(); /* * Verify we have good destination addresses. The caller is * responsible for making certain we don't attempt to load * the new image into invalid or reserved areas of RAM. This * just verifies it is an address we can use. * * Since the kernel does everything in page size chunks ensure * the destination addresses are page aligned. Too many * special cases crop of when we don't do this. The most * insidious is getting overlapping destination addresses * simply because addresses are changed to page size * granularity. */ for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if (mstart > mend) return -EADDRNOTAVAIL; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) return -EADDRNOTAVAIL; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) return -EADDRNOTAVAIL; } /* Verify our destination addresses do not overlap. * If we alloed overlapping destination addresses * through very weird things can happen with no * easy explanation as one segment stops on another. */ for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; unsigned long j; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; for (j = 0; j < i; j++) { unsigned long pstart, pend; pstart = image->segment[j].mem; pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) return -EINVAL; } } /* Ensure our buffer sizes are strictly less than * our memory sizes. This should always be the case, * and it is easier to check up front than to be surprised * later on. */ for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) return -EINVAL; } /* * Verify that no more than half of memory will be consumed. If the * request from userspace is too large, a large amount of time will be * wasted allocating pages, which can cause a soft lockup. */ for (i = 0; i < nr_segments; i++) { if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2) return -EINVAL; total_pages += PAGE_COUNT(image->segment[i].memsz); } if (total_pages > nr_pages / 2) return -EINVAL; #ifdef CONFIG_CRASH_DUMP /* * Verify we have good destination addresses. Normally * the caller is responsible for making certain we don't * attempt to load the new image into invalid or reserved * areas of RAM. But crash kernels are preloaded into a * reserved area of ram. We must ensure the addresses * are in the reserved area otherwise preloading the * kernel could corrupt things. */ if (image->type == KEXEC_TYPE_CRASH) { for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ if ((mstart < phys_to_boot_phys(crashk_res.start)) || (mend > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } } #endif /* * The destination addresses are searched from system RAM rather than * being allocated from the buddy allocator, so they are not guaranteed * to be accepted by the current kernel. Accept the destination * addresses before kexec swaps their content with the segments' source * pages to avoid accessing memory before it is accepted. */ for (i = 0; i < nr_segments; i++) accept_memory(image->segment[i].mem, image->segment[i].memsz); return 0; } struct kimage *do_kimage_alloc_init(void) { struct kimage *image; /* Allocate a controlling structure */ image = kzalloc(sizeof(*image), GFP_KERNEL); if (!image) return NULL; image->entry = &image->head; image->last_entry = &image->head; image->control_page = ~0; /* By default this does not apply */ image->type = KEXEC_TYPE_DEFAULT; /* Initialize the list of control pages */ INIT_LIST_HEAD(&image->control_pages); /* Initialize the list of destination pages */ INIT_LIST_HEAD(&image->dest_pages); /* Initialize the list of unusable pages */ INIT_LIST_HEAD(&image->unusable_pages); #ifdef CONFIG_CRASH_HOTPLUG image->hp_action = KEXEC_CRASH_HP_NONE; image->elfcorehdr_index = -1; image->elfcorehdr_updated = false; #endif return image; } int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end) { unsigned long i; for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; if ((end >= mstart) && (start <= mend)) return 1; } return 0; } static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; if (fatal_signal_pending(current)) return NULL; pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; pages->mapping = NULL; set_page_private(pages, order); count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); arch_kexec_post_alloc_pages(page_address(pages), count, gfp_mask); if (gfp_mask & __GFP_ZERO) for (i = 0; i < count; i++) clear_highpage(pages + i); } return pages; } static void kimage_free_pages(struct page *page) { unsigned int order, count, i; order = page_private(page); count = 1 << order; arch_kexec_pre_free_pages(page_address(page), count); for (i = 0; i < count; i++) ClearPageReserved(page + i); __free_pages(page, order); } void kimage_free_page_list(struct list_head *list) { struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) { list_del(&page->lru); kimage_free_pages(page); } } static struct page *kimage_alloc_normal_control_pages(struct kimage *image, unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages * to their final resting place. As such they must * not conflict with either the destination addresses * or memory the kernel is already using. * * The only case where we really need more than one of * these are for architectures where we cannot disable * the MMU and must instead generate an identity mapped * page table for all of the memory. * * At worst this runs in O(N) of the image size. */ struct list_head extra_pages; struct page *pages; unsigned int count; count = 1 << order; INIT_LIST_HEAD(&extra_pages); /* Loop while I can allocate a page and the page allocated * is a destination page. */ do { unsigned long pfn, epfn, addr, eaddr; pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); if (!pages) break; pfn = page_to_boot_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; eaddr = (epfn << PAGE_SHIFT) - 1; if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || kimage_is_destination_range(image, addr, eaddr)) { list_add(&pages->lru, &extra_pages); pages = NULL; } } while (!pages); if (pages) { /* Remember the allocated page... */ list_add(&pages->lru, &image->control_pages); /* Because the page is already in it's destination * location we will never allocate another page at * that address. Therefore kimage_alloc_pages * will not return it (again) and we don't need * to give it an entry in image->segment[]. */ } /* Deal with the destination pages I have inadvertently allocated. * * Ideally I would convert multi-page allocations into single * page allocations, and add everything to image->dest_pages. * * For now it is simpler to just free the pages. */ kimage_free_page_list(&extra_pages); return pages; } #ifdef CONFIG_CRASH_DUMP static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages * to their final resting place. As such they must * not conflict with either the destination addresses * or memory the kernel is already using. * * Control pages are also the only pags we must allocate * when loading a crash kernel. All of the other pages * are specified by the segments and we just memcpy * into them directly. * * The only case where we really need more than one of * these are for architectures where we cannot disable * the MMU and must instead generate an identity mapped * page table for all of the memory. * * Given the low demand this implements a very simple * allocator that finds the first hole of the appropriate * size in the reserved memory region, and allocates all * of the memory up to and including the hole. */ unsigned long hole_start, hole_end, size; struct page *pages; pages = NULL; size = (1 << order) << PAGE_SHIFT; hole_start = ALIGN(image->control_page, size); hole_end = hole_start + size - 1; while (hole_end <= crashk_res.end) { unsigned long i; cond_resched(); if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; /* See if I overlap any of the segments */ for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; if ((hole_end >= mstart) && (hole_start <= mend)) { /* Advance the hole to the end of the segment */ hole_start = ALIGN(mend, size); hole_end = hole_start + size - 1; break; } } /* If I don't overlap any segments I have found my hole! */ if (i == image->nr_segments) { pages = pfn_to_page(hole_start >> PAGE_SHIFT); image->control_page = hole_end + 1; break; } } /* Ensure that these pages are decrypted if SME is enabled. */ if (pages) arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0); return pages; } #endif struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order) { struct page *pages = NULL; switch (image->type) { case KEXEC_TYPE_DEFAULT: pages = kimage_alloc_normal_control_pages(image, order); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: pages = kimage_alloc_crash_control_pages(image, order); break; #endif } return pages; } static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) image->entry++; if (image->entry == image->last_entry) { kimage_entry_t *ind_page; struct page *page; page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); if (!page) return -ENOMEM; ind_page = page_address(page); *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; image->last_entry = ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); } *image->entry = entry; image->entry++; *image->entry = 0; return 0; } static int kimage_set_destination(struct kimage *image, unsigned long destination) { destination &= PAGE_MASK; return kimage_add_entry(image, destination | IND_DESTINATION); } static int kimage_add_page(struct kimage *image, unsigned long page) { page &= PAGE_MASK; return kimage_add_entry(image, page | IND_SOURCE); } static void kimage_free_extra_pages(struct kimage *image) { /* Walk through and free any extra destination pages I may have */ kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ kimage_free_page_list(&image->unusable_pages); } void kimage_terminate(struct kimage *image) { if (*image->entry != 0) image->entry++; *image->entry = IND_DONE; } #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION) ? \ boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { struct page *page; page = boot_pfn_to_page(entry >> PAGE_SHIFT); kimage_free_pages(page); } static void kimage_free_cma(struct kimage *image) { unsigned long i; for (i = 0; i < image->nr_segments; i++) { struct page *cma = image->segment_cma[i]; u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT; if (!cma) continue; arch_kexec_pre_free_pages(page_address(cma), nr_pages); dma_release_from_contiguous(NULL, cma, nr_pages); image->segment_cma[i] = NULL; } } void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; kimage_entry_t ind = 0; if (!image) return; #ifdef CONFIG_CRASH_DUMP if (image->vmcoreinfo_data_copy) { crash_update_vmcoreinfo_safecopy(NULL); vunmap(image->vmcoreinfo_data_copy); } #endif kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { /* Free the previous indirection page */ if (ind & IND_INDIRECTION) kimage_free_entry(ind); /* Save this indirection page until we are * done with it. */ ind = entry; } else if (entry & IND_SOURCE) kimage_free_entry(entry); } /* Free the final indirection page */ if (ind & IND_INDIRECTION) kimage_free_entry(ind); /* Handle any machine specific cleanup */ machine_kexec_cleanup(image); /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); /* Free CMA allocations */ kimage_free_cma(image); /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. */ if (image->file_mode) kimage_file_post_load_cleanup(image); kfree(image); } static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page) { kimage_entry_t *ptr, entry; unsigned long destination = 0; for_each_kimage_entry(image, ptr, entry) { if (entry & IND_DESTINATION) destination = entry & PAGE_MASK; else if (entry & IND_SOURCE) { if (page == destination) return ptr; destination += PAGE_SIZE; } } return NULL; } static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long destination) { /* * Here we implement safeguards to ensure that a source page * is not copied to its destination page before the data on * the destination page is no longer useful. * * To do this we maintain the invariant that a source page is * either its own destination page, or it is not a * destination page at all. * * That is slightly stronger than required, but the proof * that no problems will not occur is trivial, and the * implementation is simply to verify. * * When allocating all pages normally this algorithm will run * in O(N) time, but in the worst case it will run in O(N^2) * time. If the runtime is a problem the data structures can * be fixed. */ struct page *page; unsigned long addr; /* * Walk through the list of destination pages, and see if I * have a match. */ list_for_each_entry(page, &image->dest_pages, lru) { addr = page_to_boot_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; } } page = NULL; while (1) { kimage_entry_t *old; /* Allocate a page, if we run out of memory give up */ page = kimage_alloc_pages(gfp_mask, 0); if (!page) return NULL; /* If the page cannot be used file it away */ if (page_to_boot_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_boot_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) break; /* If the page is not a destination page use it */ if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE - 1)) break; /* * I know that the page is someones destination page. * See if there is already a source page for this * destination page. And if so swap the source pages. */ old = kimage_dst_used(image, addr); if (old) { /* If so move it */ unsigned long old_addr; struct page *old_page; old_addr = *old & PAGE_MASK; old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a * destination page, so return it if it's * gfp_flags honor the ones passed in. */ if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(old_page)) { kimage_free_pages(old_page); continue; } page = old_page; break; } /* Place the page on the destination list, to be used later */ list_add(&page->lru, &image->dest_pages); } return page; } static int kimage_load_cma_segment(struct kimage *image, int idx) { struct kexec_segment *segment = &image->segment[idx]; struct page *cma = image->segment_cma[idx]; char *ptr = page_address(cma); size_t ubytes, mbytes; int result = 0; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; /* Then copy from source buffer to the CMA one */ while (mbytes) { size_t uchunk, mchunk; mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); ubytes -= uchunk; if (image->file_mode) kbuf += uchunk; else buf += uchunk; } if (result) { result = -EFAULT; goto out; } ptr += mchunk; mbytes -= mchunk; cond_resched(); } /* Clear any remainder */ memset(ptr, 0, mbytes); out: return result; } static int kimage_load_normal_segment(struct kimage *image, int idx) { struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; if (image->segment_cma[idx]) return kimage_load_cma_segment(image, idx); result = kimage_set_destination(image, maddr); if (result < 0) goto out; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); if (!page) { result = -ENOMEM; goto out; } result = kimage_add_page(image, page_to_boot_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); ubytes -= uchunk; if (image->file_mode) kbuf += uchunk; else buf += uchunk; } kunmap_local(ptr); if (result) { result = -EFAULT; goto out; } maddr += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } #ifdef CONFIG_CRASH_DUMP static int kimage_load_crash_segment(struct kimage *image, int idx) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. */ struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; result = 0; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = boot_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; } arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap_local_page(page); mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (mchunk > uchunk) { /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } if (uchunk) { /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); ubytes -= uchunk; if (image->file_mode) kbuf += uchunk; else buf += uchunk; } kexec_flush_icache_page(page); kunmap_local(ptr); arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; goto out; } maddr += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } #endif int kimage_load_segment(struct kimage *image, int idx) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, idx); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: result = kimage_load_crash_segment(image, idx); break; #endif } return result; } void *kimage_map_segment(struct kimage *image, int idx) { unsigned long addr, size, eaddr; unsigned long src_page_addr, dest_page_addr = 0; kimage_entry_t *ptr, entry; struct page **src_pages; unsigned int npages; struct page *cma; void *vaddr = NULL; int i; cma = image->segment_cma[idx]; if (cma) return page_address(cma); addr = image->segment[idx].mem; size = image->segment[idx].memsz; eaddr = addr + size; /* * Collect the source pages and map them in a contiguous VA range. */ npages = PFN_UP(eaddr) - PFN_DOWN(addr); src_pages = kmalloc_array(npages, sizeof(*src_pages), GFP_KERNEL); if (!src_pages) { pr_err("Could not allocate ima pages array.\n"); return NULL; } i = 0; for_each_kimage_entry(image, ptr, entry) { if (entry & IND_DESTINATION) { dest_page_addr = entry & PAGE_MASK; } else if (entry & IND_SOURCE) { if (dest_page_addr >= addr && dest_page_addr < eaddr) { src_page_addr = entry & PAGE_MASK; src_pages[i++] = virt_to_page(__va(src_page_addr)); if (i == npages) break; dest_page_addr += PAGE_SIZE; } } } /* Sanity check. */ WARN_ON(i < npages); vaddr = vmap(src_pages, npages, VM_MAP, PAGE_KERNEL); kfree(src_pages); if (!vaddr) pr_err("Could not map ima buffer.\n"); return vaddr; } void kimage_unmap_segment(void *segment_buffer) { if (is_vmalloc_addr(segment_buffer)) vunmap(segment_buffer); } struct kexec_load_limit { /* Mutex protects the limit count. */ struct mutex mutex; int limit; }; static struct kexec_load_limit load_limit_reboot = { .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex), .limit = -1, }; static struct kexec_load_limit load_limit_panic = { .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex), .limit = -1, }; struct kimage *kexec_image; struct kimage *kexec_crash_image; static int kexec_load_disabled; #ifdef CONFIG_SYSCTL static int kexec_limit_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct kexec_load_limit *limit = table->data; int val; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(val), .mode = table->mode, }; int ret; if (write) { ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (ret) return ret; if (val < 0) return -EINVAL; mutex_lock(&limit->mutex); if (limit->limit != -1 && val >= limit->limit) ret = -EINVAL; else limit->limit = val; mutex_unlock(&limit->mutex); return ret; } mutex_lock(&limit->mutex); val = limit->limit; mutex_unlock(&limit->mutex); return proc_dointvec(&tmp, write, buffer, lenp, ppos); } static const struct ctl_table kexec_core_sysctls[] = { { .procname = "kexec_load_disabled", .data = &kexec_load_disabled, .maxlen = sizeof(int), .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_ONE, }, { .procname = "kexec_load_limit_panic", .data = &load_limit_panic, .mode = 0644, .proc_handler = kexec_limit_handler, }, { .procname = "kexec_load_limit_reboot", .data = &load_limit_reboot, .mode = 0644, .proc_handler = kexec_limit_handler, }, }; static int __init kexec_core_sysctl_init(void) { register_sysctl_init("kernel", kexec_core_sysctls); return 0; } late_initcall(kexec_core_sysctl_init); #endif bool kexec_load_permitted(int kexec_image_type) { struct kexec_load_limit *limit; /* * Only the superuser can use the kexec syscall and if it has not * been disabled. */ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return false; /* Check limit counter and decrease it.*/ limit = (kexec_image_type == KEXEC_TYPE_CRASH) ? &load_limit_panic : &load_limit_reboot; mutex_lock(&limit->mutex); if (!limit->limit) { mutex_unlock(&limit->mutex); return false; } if (limit->limit != -1) limit->limit--; mutex_unlock(&limit->mutex); return true; } /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. */ int kernel_kexec(void) { int error = 0; if (!kexec_trylock()) return -EBUSY; if (!kexec_image) { error = -EINVAL; goto Unlock; } error = liveupdate_reboot(); if (error) goto Unlock; #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { /* * This flow is analogous to hibernation flows that occur * before creating an image and before jumping from the * restore kernel to the image one, so it uses the same * device callbacks as those two flows. */ pm_prepare_console(); error = freeze_processes(); if (error) { error = -EBUSY; goto Restore_console; } console_suspend_all(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_devices; /* * dpm_suspend_end() must be called after dpm_suspend_start() * to complete the transition, like in the hibernation flows * mentioned above. */ error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; local_irq_disable(); error = syscore_suspend(); if (error) goto Enable_irqs; } else #endif { kexec_in_progress = true; kernel_restart_prepare("kexec reboot"); migrate_to_reboot_cpu(); syscore_shutdown(); /* * migrate_to_reboot_cpu() disables CPU hotplug assuming that * no further code needs to use CPU hotplug (which is true in * the reboot case). However, the kexec path depends on using * CPU hotplug again; so re-enable it here. */ cpu_hotplug_enable(); pr_notice("Starting new kernel\n"); machine_shutdown(); } kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_kexec(kexec_image); #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { /* * This flow is analogous to hibernation flows that occur after * creating an image and after the image kernel has got control * back, and in case the devices have been reset or otherwise * manipulated in the meantime, it uses the device callbacks * used by the latter. */ syscore_resume(); Enable_irqs: local_irq_enable(); Enable_cpus: suspend_enable_secondary_cpus(); dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); console_resume_all(); thaw_processes(); Restore_console: pm_restore_console(); } #endif Unlock: kexec_unlock(); return error; } static ssize_t loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", !!kexec_image); } static struct kobj_attribute loaded_attr = __ATTR_RO(loaded); #ifdef CONFIG_CRASH_DUMP static ssize_t crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); } static struct kobj_attribute crash_loaded_attr = __ATTR_RO(crash_loaded); #ifdef CONFIG_CRASH_RESERVE static ssize_t crash_cma_ranges_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t len = 0; int i; for (i = 0; i < crashk_cma_cnt; ++i) { len += sysfs_emit_at(buf, len, "%08llx-%08llx\n", crashk_cma_ranges[i].start, crashk_cma_ranges[i].end); } return len; } static struct kobj_attribute crash_cma_ranges_attr = __ATTR_RO(crash_cma_ranges); #endif static ssize_t crash_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t size = crash_get_memory_size(); if (size < 0) return size; return sysfs_emit(buf, "%zd\n", size); } static ssize_t crash_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned long cnt; int ret; if (kstrtoul(buf, 0, &cnt)) return -EINVAL; ret = crash_shrink_memory(cnt); return ret < 0 ? ret : count; } static struct kobj_attribute crash_size_attr = __ATTR_RW(crash_size); #ifdef CONFIG_CRASH_HOTPLUG static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int sz = crash_get_elfcorehdr_size(); return sysfs_emit(buf, "%u\n", sz); } static struct kobj_attribute crash_elfcorehdr_size_attr = __ATTR_RO(crash_elfcorehdr_size); #endif /* CONFIG_CRASH_HOTPLUG */ #endif /* CONFIG_CRASH_DUMP */ static struct attribute *kexec_attrs[] = { &loaded_attr.attr, #ifdef CONFIG_CRASH_DUMP &crash_loaded_attr.attr, &crash_size_attr.attr, #ifdef CONFIG_CRASH_RESERVE &crash_cma_ranges_attr.attr, #endif #ifdef CONFIG_CRASH_HOTPLUG &crash_elfcorehdr_size_attr.attr, #endif #endif NULL }; struct kexec_link_entry { const char *target; const char *name; }; static struct kexec_link_entry kexec_links[] = { { "loaded", "kexec_loaded" }, #ifdef CONFIG_CRASH_DUMP { "crash_loaded", "kexec_crash_loaded" }, { "crash_size", "kexec_crash_size" }, #ifdef CONFIG_CRASH_RESERVE {"crash_cma_ranges", "kexec_crash_cma_ranges"}, #endif #ifdef CONFIG_CRASH_HOTPLUG { "crash_elfcorehdr_size", "crash_elfcorehdr_size" }, #endif #endif }; static struct kobject *kexec_kobj; ATTRIBUTE_GROUPS(kexec); static int __init init_kexec_sysctl(void) { int error; int i; kexec_kobj = kobject_create_and_add("kexec", kernel_kobj); if (!kexec_kobj) { pr_err("failed to create kexec kobject\n"); return -ENOMEM; } error = sysfs_create_groups(kexec_kobj, kexec_groups); if (error) goto kset_exit; for (i = 0; i < ARRAY_SIZE(kexec_links); i++) { error = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, kexec_kobj, kexec_links[i].target, kexec_links[i].name); if (error) pr_err("Unable to create %s symlink (%d)", kexec_links[i].name, error); } return 0; kset_exit: kobject_put(kexec_kobj); return error; } subsys_initcall(init_kexec_sysctl);
9 1 5 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 // SPDX-License-Identifier: GPL-2.0-or-later /* * lib/ts_fsm.c A naive finite state machine text search approach * * Authors: Thomas Graf <tgraf@suug.ch> * * ========================================================================== * * A finite state machine consists of n states (struct ts_fsm_token) * representing the pattern as a finite automaton. The data is read * sequentially on an octet basis. Every state token specifies the number * of recurrences and the type of value accepted which can be either a * specific character or ctype based set of characters. The available * type of recurrences include 1, (0|1), [0 n], and [1 n]. * * The algorithm differs between strict/non-strict mode specifying * whether the pattern has to start at the first octet. Strict mode * is enabled by default and can be disabled by inserting * TS_FSM_HEAD_IGNORE as the first token in the chain. * * The runtime performance of the algorithm should be around O(n), * however while in strict mode the average runtime can be better. */ #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/textsearch.h> #include <linux/textsearch_fsm.h> struct ts_fsm { unsigned int ntokens; struct ts_fsm_token tokens[]; }; /* other values derived from ctype.h */ #define _A 0x100 /* ascii */ #define _W 0x200 /* wildcard */ /* Map to _ctype flags and some magic numbers */ static const u16 token_map[TS_FSM_TYPE_MAX+1] = { [TS_FSM_SPECIFIC] = 0, [TS_FSM_WILDCARD] = _W, [TS_FSM_CNTRL] = _C, [TS_FSM_LOWER] = _L, [TS_FSM_UPPER] = _U, [TS_FSM_PUNCT] = _P, [TS_FSM_SPACE] = _S, [TS_FSM_DIGIT] = _D, [TS_FSM_XDIGIT] = _D | _X, [TS_FSM_ALPHA] = _U | _L, [TS_FSM_ALNUM] = _U | _L | _D, [TS_FSM_PRINT] = _P | _U | _L | _D | _SP, [TS_FSM_GRAPH] = _P | _U | _L | _D, [TS_FSM_ASCII] = _A, }; static const u16 token_lookup_tbl[256] = { _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 0- 3 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 4- 7 */ _W|_A|_C, _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C|_S, /* 8- 11 */ _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C, _W|_A|_C, /* 12- 15 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 16- 19 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 20- 23 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 24- 27 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 28- 31 */ _W|_A|_S|_SP, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 32- 35 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 36- 39 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 40- 43 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 44- 47 */ _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 48- 51 */ _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 52- 55 */ _W|_A|_D, _W|_A|_D, _W|_A|_P, _W|_A|_P, /* 56- 59 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 60- 63 */ _W|_A|_P, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, /* 64- 67 */ _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U, /* 68- 71 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 72- 75 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 76- 79 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 80- 83 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 84- 87 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_P, /* 88- 91 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 92- 95 */ _W|_A|_P, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, /* 96- 99 */ _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L, /* 100-103 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 104-107 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 108-111 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 112-115 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 116-119 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_P, /* 120-123 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_C, /* 124-127 */ _W, _W, _W, _W, /* 128-131 */ _W, _W, _W, _W, /* 132-135 */ _W, _W, _W, _W, /* 136-139 */ _W, _W, _W, _W, /* 140-143 */ _W, _W, _W, _W, /* 144-147 */ _W, _W, _W, _W, /* 148-151 */ _W, _W, _W, _W, /* 152-155 */ _W, _W, _W, _W, /* 156-159 */ _W|_S|_SP, _W|_P, _W|_P, _W|_P, /* 160-163 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 164-167 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 168-171 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 172-175 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 176-179 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 180-183 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 184-187 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 188-191 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 192-195 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 196-199 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 200-203 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 204-207 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 208-211 */ _W|_U, _W|_U, _W|_U, _W|_P, /* 212-215 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 216-219 */ _W|_U, _W|_U, _W|_U, _W|_L, /* 220-223 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 224-227 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 228-231 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 232-235 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 236-239 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 240-243 */ _W|_L, _W|_L, _W|_L, _W|_P, /* 244-247 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 248-251 */ _W|_L, _W|_L, _W|_L, _W|_L}; /* 252-255 */ static inline int match_token(struct ts_fsm_token *t, u8 d) { if (t->type) return (token_lookup_tbl[d] & t->type) != 0; else return t->value == d; } static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state) { struct ts_fsm *fsm = ts_config_priv(conf); struct ts_fsm_token *cur = NULL, *next; unsigned int match_start, block_idx = 0, tok_idx; unsigned block_len = 0, strict, consumed = state->offset; const u8 *data; #define GET_NEXT_BLOCK() \ ({ consumed += block_idx; \ block_idx = 0; \ block_len = conf->get_next_block(consumed, &data, conf, state); }) #define TOKEN_MISMATCH() \ do { \ if (strict) \ goto no_match; \ block_idx++; \ goto startover; \ } while(0) #define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK()) if (end_of_data()) goto no_match; strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE; startover: match_start = consumed + block_idx; for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) { cur = &fsm->tokens[tok_idx]; if (likely(tok_idx < (fsm->ntokens - 1))) next = &fsm->tokens[tok_idx + 1]; else next = NULL; switch (cur->recur) { case TS_FSM_SINGLE: if (end_of_data()) goto no_match; if (!match_token(cur, data[block_idx])) TOKEN_MISMATCH(); break; case TS_FSM_PERHAPS: if (end_of_data() || !match_token(cur, data[block_idx])) continue; break; case TS_FSM_MULTI: if (end_of_data()) goto no_match; if (!match_token(cur, data[block_idx])) TOKEN_MISMATCH(); block_idx++; fallthrough; case TS_FSM_ANY: if (next == NULL) goto found_match; if (end_of_data()) continue; while (!match_token(next, data[block_idx])) { if (!match_token(cur, data[block_idx])) TOKEN_MISMATCH(); block_idx++; if (end_of_data()) goto no_match; } continue; /* * Optimization: Prefer small local loop over jumping * back and forth until garbage at head is munched. */ case TS_FSM_HEAD_IGNORE: if (end_of_data()) continue; while (!match_token(next, data[block_idx])) { /* * Special case, don't start over upon * a mismatch, give the user the * chance to specify the type of data * allowed to be ignored. */ if (!match_token(cur, data[block_idx])) goto no_match; block_idx++; if (end_of_data()) goto no_match; } match_start = consumed + block_idx; continue; } block_idx++; } if (end_of_data()) goto found_match; no_match: return UINT_MAX; found_match: state->offset = consumed + block_idx; return match_start; } static struct ts_config *fsm_init(const void *pattern, unsigned int len, gfp_t gfp_mask, int flags) { int i, err = -EINVAL; struct ts_config *conf; struct ts_fsm *fsm; struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern; unsigned int ntokens = len / sizeof(*tokens); size_t priv_size = sizeof(*fsm) + len; if (len % sizeof(struct ts_fsm_token) || ntokens < 1) goto errout; if (flags & TS_IGNORECASE) goto errout; for (i = 0; i < ntokens; i++) { struct ts_fsm_token *t = &tokens[i]; if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX) goto errout; if (t->recur == TS_FSM_HEAD_IGNORE && (i != 0 || i == (ntokens - 1))) goto errout; } conf = alloc_ts_config(priv_size, gfp_mask); if (IS_ERR(conf)) return conf; conf->flags = flags; fsm = ts_config_priv(conf); fsm->ntokens = ntokens; memcpy(fsm->tokens, pattern, len); for (i = 0; i < fsm->ntokens; i++) { struct ts_fsm_token *t = &fsm->tokens[i]; t->type = token_map[t->type]; } return conf; errout: return ERR_PTR(err); } static void *fsm_get_pattern(struct ts_config *conf) { struct ts_fsm *fsm = ts_config_priv(conf); return fsm->tokens; } static unsigned int fsm_get_pattern_len(struct ts_config *conf) { struct ts_fsm *fsm = ts_config_priv(conf); return fsm->ntokens * sizeof(struct ts_fsm_token); } static struct ts_ops fsm_ops = { .name = "fsm", .find = fsm_find, .init = fsm_init, .get_pattern = fsm_get_pattern, .get_pattern_len = fsm_get_pattern_len, .owner = THIS_MODULE, .list = LIST_HEAD_INIT(fsm_ops.list) }; static int __init init_fsm(void) { return textsearch_register(&fsm_ops); } static void __exit exit_fsm(void) { textsearch_unregister(&fsm_ops); } MODULE_DESCRIPTION("naive finite state machine text search"); MODULE_LICENSE("GPL"); module_init(init_fsm); module_exit(exit_fsm);
174 546 563 47 525 344 107 449 450 344 344 450 344 341 107 107 106 106 107 4 121 122 38 669 667 616 54 616 54 6 174 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 // SPDX-License-Identifier: GPL-2.0-or-later /* * Synchronous Cryptographic Hash operations. * * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/scatterwalk.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/string.h> #include <net/netlink.h> #include "hash.h" static inline bool crypto_shash_block_only(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->base.cra_flags & CRYPTO_AHASH_ALG_BLOCK_ONLY; } static inline bool crypto_shash_final_nonzero(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->base.cra_flags & CRYPTO_AHASH_ALG_FINAL_NONZERO; } static inline bool crypto_shash_finup_max(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->base.cra_flags & CRYPTO_AHASH_ALG_FINUP_MAX; } int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { return -ENOSYS; } EXPORT_SYMBOL_GPL(shash_no_setkey); static void shash_set_needkey(struct crypto_shash *tfm, struct shash_alg *alg) { if (crypto_shash_alg_needs_key(alg)) crypto_shash_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct shash_alg *shash = crypto_shash_alg(tfm); int err; err = shash->setkey(tfm, key, keylen); if (unlikely(err)) { shash_set_needkey(tfm, shash); return err; } crypto_shash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_shash_setkey); static int __crypto_shash_init(struct shash_desc *desc) { struct crypto_shash *tfm = desc->tfm; if (crypto_shash_block_only(tfm)) { u8 *buf = shash_desc_ctx(desc); buf += crypto_shash_descsize(tfm) - 1; *buf = 0; } return crypto_shash_alg(tfm)->init(desc); } int crypto_shash_init(struct shash_desc *desc) { if (crypto_shash_get_flags(desc->tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return __crypto_shash_init(desc); } EXPORT_SYMBOL_GPL(crypto_shash_init); static int shash_default_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct shash_alg *shash = crypto_shash_alg(desc->tfm); return shash->update(desc, data, len) ?: shash->final(desc, out); } static int crypto_shash_op_and_zero( int (*op)(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out), struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { int err; err = op(desc, data, len, out); memset(shash_desc_ctx(desc), 0, crypto_shash_descsize(desc->tfm)); return err; } int crypto_shash_finup(struct shash_desc *restrict desc, const u8 *data, unsigned int len, u8 *restrict out) { struct crypto_shash *tfm = desc->tfm; u8 *blenp = shash_desc_ctx(desc); bool finup_max, nonzero; unsigned int bs; int err; u8 *buf; if (!crypto_shash_block_only(tfm)) { if (out) goto finup; return crypto_shash_alg(tfm)->update(desc, data, len); } finup_max = out && crypto_shash_finup_max(tfm); /* Retain extra block for final nonzero algorithms. */ nonzero = crypto_shash_final_nonzero(tfm); /* * The partial block buffer follows the algorithm desc context. * The byte following that contains the length. */ blenp += crypto_shash_descsize(tfm) - 1; bs = crypto_shash_blocksize(tfm); buf = blenp - bs; if (likely(!*blenp && finup_max)) goto finup; while ((*blenp + len) >= bs + nonzero) { unsigned int nbytes = len - nonzero; const u8 *src = data; if (*blenp) { memcpy(buf + *blenp, data, bs - *blenp); nbytes = bs; src = buf; } err = crypto_shash_alg(tfm)->update(desc, src, nbytes); if (err < 0) return err; data += nbytes - err - *blenp; len -= nbytes - err - *blenp; *blenp = 0; } if (*blenp || !out) { memcpy(buf + *blenp, data, len); *blenp += len; if (!out) return 0; data = buf; len = *blenp; } finup: return crypto_shash_op_and_zero(crypto_shash_alg(tfm)->finup, desc, data, len, out); } EXPORT_SYMBOL_GPL(crypto_shash_finup); static int shash_default_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return __crypto_shash_init(desc) ?: crypto_shash_finup(desc, data, len, out); } int crypto_shash_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct crypto_shash *tfm = desc->tfm; if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return crypto_shash_op_and_zero(crypto_shash_alg(tfm)->digest, desc, data, len, out); } EXPORT_SYMBOL_GPL(crypto_shash_digest); int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data, unsigned int len, u8 *out) { SHASH_DESC_ON_STACK(desc, tfm); desc->tfm = tfm; return crypto_shash_digest(desc, data, len, out); } EXPORT_SYMBOL_GPL(crypto_shash_tfm_digest); static int __crypto_shash_export(struct shash_desc *desc, void *out, int (*export)(struct shash_desc *desc, void *out)) { struct crypto_shash *tfm = desc->tfm; u8 *buf = shash_desc_ctx(desc); unsigned int plen, ss; plen = crypto_shash_blocksize(tfm) + 1; ss = crypto_shash_statesize(tfm); if (crypto_shash_block_only(tfm)) ss -= plen; if (!export) { memcpy(out, buf, ss); return 0; } return export(desc, out); } int crypto_shash_export_core(struct shash_desc *desc, void *out) { return __crypto_shash_export(desc, out, crypto_shash_alg(desc->tfm)->export_core); } EXPORT_SYMBOL_GPL(crypto_shash_export_core); int crypto_shash_export(struct shash_desc *desc, void *out) { struct crypto_shash *tfm = desc->tfm; if (crypto_shash_block_only(tfm)) { unsigned int plen = crypto_shash_blocksize(tfm) + 1; unsigned int descsize = crypto_shash_descsize(tfm); unsigned int ss = crypto_shash_statesize(tfm); u8 *buf = shash_desc_ctx(desc); memcpy(out + ss - plen, buf + descsize - plen, plen); } return __crypto_shash_export(desc, out, crypto_shash_alg(tfm)->export); } EXPORT_SYMBOL_GPL(crypto_shash_export); static int __crypto_shash_import(struct shash_desc *desc, const void *in, int (*import)(struct shash_desc *desc, const void *in)) { struct crypto_shash *tfm = desc->tfm; unsigned int descsize, plen, ss; u8 *buf = shash_desc_ctx(desc); if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; ss = crypto_shash_statesize(tfm); if (crypto_shash_block_only(tfm)) { plen = crypto_shash_blocksize(tfm) + 1; ss -= plen; descsize = crypto_shash_descsize(tfm); buf[descsize - 1] = 0; } if (!import) { memcpy(buf, in, ss); return 0; } return import(desc, in); } int crypto_shash_import_core(struct shash_desc *desc, const void *in) { return __crypto_shash_import(desc, in, crypto_shash_alg(desc->tfm)->import_core); } EXPORT_SYMBOL_GPL(crypto_shash_import_core); int crypto_shash_import(struct shash_desc *desc, const void *in) { struct crypto_shash *tfm = desc->tfm; int err; err = __crypto_shash_import(desc, in, crypto_shash_alg(tfm)->import); if (crypto_shash_block_only(tfm)) { unsigned int plen = crypto_shash_blocksize(tfm) + 1; unsigned int descsize = crypto_shash_descsize(tfm); unsigned int ss = crypto_shash_statesize(tfm); u8 *buf = shash_desc_ctx(desc); memcpy(buf + descsize - plen, in + ss - plen, plen); if (buf[descsize - 1] >= plen) err = -EOVERFLOW; } return err; } EXPORT_SYMBOL_GPL(crypto_shash_import); static void crypto_shash_exit_tfm(struct crypto_tfm *tfm) { struct crypto_shash *hash = __crypto_shash_cast(tfm); struct shash_alg *alg = crypto_shash_alg(hash); alg->exit_tfm(hash); } static int crypto_shash_init_tfm(struct crypto_tfm *tfm) { struct crypto_shash *hash = __crypto_shash_cast(tfm); struct shash_alg *alg = crypto_shash_alg(hash); shash_set_needkey(hash, alg); if (alg->exit_tfm) tfm->exit = crypto_shash_exit_tfm; if (!alg->init_tfm) return 0; return alg->init_tfm(hash); } static void crypto_shash_free_instance(struct crypto_instance *inst) { struct shash_instance *shash = shash_instance(inst); shash->free(shash); } static int __maybe_unused crypto_shash_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_hash rhash; struct shash_alg *salg = __crypto_shash_alg(alg); memset(&rhash, 0, sizeof(rhash)); strscpy(rhash.type, "shash", sizeof(rhash.type)); rhash.blocksize = alg->cra_blocksize; rhash.digestsize = salg->digestsize; return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash); } static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg) { struct shash_alg *salg = __crypto_shash_alg(alg); seq_printf(m, "type : shash\n"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "digestsize : %u\n", salg->digestsize); } const struct crypto_type crypto_shash_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_shash_init_tfm, .free = crypto_shash_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_shash_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_shash_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_SHASH, .tfmsize = offsetof(struct crypto_shash, base), .algsize = offsetof(struct shash_alg, base), }; int crypto_grab_shash(struct crypto_shash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_shash_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_shash); struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_shash); int crypto_has_shash(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_shash); struct crypto_shash *crypto_clone_shash(struct crypto_shash *hash) { struct crypto_tfm *tfm = crypto_shash_tfm(hash); struct shash_alg *alg = crypto_shash_alg(hash); struct crypto_shash *nhash; int err; if (!crypto_shash_alg_has_setkey(alg)) { tfm = crypto_tfm_get(tfm); if (IS_ERR(tfm)) return ERR_CAST(tfm); return hash; } if (!alg->clone_tfm && (alg->init_tfm || alg->base.cra_init)) return ERR_PTR(-ENOSYS); nhash = crypto_clone_tfm(&crypto_shash_type, tfm); if (IS_ERR(nhash)) return nhash; if (alg->clone_tfm) { err = alg->clone_tfm(nhash, hash); if (err) { crypto_free_shash(nhash); return ERR_PTR(err); } } if (alg->exit_tfm) crypto_shash_tfm(nhash)->exit = crypto_shash_exit_tfm; return nhash; } EXPORT_SYMBOL_GPL(crypto_clone_shash); int hash_prepare_alg(struct hash_alg_common *alg) { struct crypto_alg *base = &alg->base; if (alg->digestsize > HASH_MAX_DIGESTSIZE) return -EINVAL; /* alignmask is not useful for hashes, so it is not supported. */ if (base->cra_alignmask) return -EINVAL; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; return 0; } static int shash_default_export_core(struct shash_desc *desc, void *out) { return -ENOSYS; } static int shash_default_import_core(struct shash_desc *desc, const void *in) { return -ENOSYS; } static int shash_prepare_alg(struct shash_alg *alg) { struct crypto_alg *base = &alg->halg.base; int err; if ((alg->export && !alg->import) || (alg->import && !alg->export)) return -EINVAL; err = hash_prepare_alg(&alg->halg); if (err) return err; base->cra_type = &crypto_shash_type; base->cra_flags |= CRYPTO_ALG_TYPE_SHASH; base->cra_flags |= CRYPTO_ALG_REQ_VIRT; /* * Handle missing optional functions. For each one we can either * install a default here, or we can leave the pointer as NULL and check * the pointer for NULL in crypto_shash_*(), avoiding an indirect call * when the default behavior is desired. For ->finup and ->digest we * install defaults, since for optimal performance algorithms should * implement these anyway. On the other hand, for ->import and * ->export the common case and best performance comes from the simple * memcpy of the shash_desc_ctx, so when those pointers are NULL we * leave them NULL and provide the memcpy with no indirect call. */ if (!alg->finup) alg->finup = shash_default_finup; if (!alg->digest) alg->digest = shash_default_digest; if (!alg->export && !alg->halg.statesize) alg->halg.statesize = alg->descsize; if (!alg->setkey) alg->setkey = shash_no_setkey; if (base->cra_flags & CRYPTO_AHASH_ALG_BLOCK_ONLY) { BUILD_BUG_ON(MAX_ALGAPI_BLOCKSIZE >= 256); alg->descsize += base->cra_blocksize + 1; alg->statesize += base->cra_blocksize + 1; alg->export_core = alg->export; alg->import_core = alg->import; } else if (!alg->export_core || !alg->import_core) { alg->export_core = shash_default_export_core; alg->import_core = shash_default_import_core; base->cra_flags |= CRYPTO_AHASH_ALG_NO_EXPORT_CORE; } if (alg->descsize > HASH_MAX_DESCSIZE) return -EINVAL; if (alg->statesize > HASH_MAX_STATESIZE) return -EINVAL; base->cra_reqsize = sizeof(struct shash_desc) + alg->descsize; return 0; } int crypto_register_shash(struct shash_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = shash_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_shash); void crypto_unregister_shash(struct shash_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_shash); int crypto_register_shashes(struct shash_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_shash(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_shash(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_shashes); void crypto_unregister_shashes(struct shash_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_shash(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_shashes); int shash_register_instance(struct crypto_template *tmpl, struct shash_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = shash_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, shash_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(shash_register_instance); void shash_free_singlespawn_instance(struct shash_instance *inst) { crypto_drop_spawn(shash_instance_ctx(inst)); kfree(inst); } EXPORT_SYMBOL_GPL(shash_free_singlespawn_instance); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Synchronous cryptographic hash type");
619 1 619 619 619 3601 3602 617 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4584 4577 2840 2840 1702 222 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 // SPDX-License-Identifier: GPL-2.0-only #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <net/busy_poll.h> #include <net/net_namespace.h> #include <net/netdev_queues.h> #include <net/netdev_rx_queue.h> #include <net/sock.h> #include <net/xdp.h> #include <net/xdp_sock.h> #include <net/page_pool/memory_provider.h> #include "dev.h" #include "devmem.h" #include "netdev-genl-gen.h" struct netdev_nl_dump_ctx { unsigned long ifindex; unsigned int rxq_idx; unsigned int txq_idx; unsigned int napi_id; }; static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) { NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx); return (struct netdev_nl_dump_ctx *)cb->ctx; } static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { u64 xsk_features = 0; u64 xdp_rx_meta = 0; void *hdr; netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! */ hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; #define XDP_METADATA_KFUNC(_, flag, __, xmo) \ if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \ xdp_rx_meta |= flag; XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC if (netdev->xsk_tx_metadata_ops) { if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time) xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO; } if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, netdev->xdp_features, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, xdp_rx_meta, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, xsk_features, NETDEV_A_DEV_PAD)) goto err_cancel_msg; if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, netdev->xdp_zc_max_segs)) goto err_cancel_msg; } genlmsg_end(rsp, hdr); return 0; err_cancel_msg: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static void netdev_genl_dev_notify(struct net_device *netdev, int cmd) { struct genl_info info; struct sk_buff *ntf; if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev), NETDEV_NLGRP_MGMT)) return; genl_info_init_ntf(&info, &netdev_nl_family, cmd); ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; if (netdev_nl_dev_fill(netdev, ntf, &info)) { nlmsg_free(ntf); return; } genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf, 0, NETDEV_NLGRP_MGMT, GFP_KERNEL); } int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net_device *netdev; struct sk_buff *rsp; u32 ifindex; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); if (!netdev) { err = -ENODEV; goto err_free_msg; } err = netdev_nl_dev_fill(netdev, rsp, info); netdev_unlock(netdev); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); int err; for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) return err; } return 0; } static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { unsigned long irq_suspend_timeout; unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; void *hdr; pid_t pid; if (!napi->dev->up) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) goto nla_put_failure; if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) goto nla_put_failure; if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) goto nla_put_failure; if (nla_put_uint(rsp, NETDEV_A_NAPI_THREADED, napi_get_threaded(napi))) goto nla_put_failure; if (napi->thread) { pid = task_pid_nr(napi->thread); if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) goto nla_put_failure; } napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi); if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS, napi_defer_hard_irqs)) goto nla_put_failure; irq_suspend_timeout = napi_get_irq_suspend_timeout(napi); if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, irq_suspend_timeout)) goto nla_put_failure; gro_flush_timeout = napi_get_gro_flush_timeout(napi); if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, gro_flush_timeout)) goto nla_put_failure; genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) { struct napi_struct *napi; struct sk_buff *rsp; u32 napi_id; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) return -EINVAL; napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_fill_one(rsp, napi, info); netdev_unlock(napi->dev); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); err = -ENOENT; } if (err) { goto err_free_msg; } else if (!rsp->len) { err = -ENOENT; goto err_free_msg; } return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { struct napi_struct *napi; unsigned int prev_id; int err = 0; if (!netdev->up) return err; prev_id = UINT_MAX; list_for_each_entry(napi, &netdev->napi_list, dev_list) { if (!napi_id_valid(napi->napi_id)) continue; /* Dump continuation below depends on the list being sorted */ WARN_ON_ONCE(napi->napi_id >= prev_id); prev_id = napi->napi_id; if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; err = netdev_nl_napi_fill_one(rsp, napi, info); if (err) return err; ctx->napi_id = napi->napi_id; } return err; } int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_NAPI_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); if (ifindex) { netdev = netdev_get_by_index_lock(net, ifindex); if (netdev) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); netdev_unlock(netdev); } else { err = -ENODEV; } } else { for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->napi_id = 0; } } return err; } static int netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) { u64 irq_suspend_timeout = 0; u64 gro_flush_timeout = 0; u8 threaded = 0; u32 defer = 0; if (info->attrs[NETDEV_A_NAPI_THREADED]) { int ret; threaded = nla_get_uint(info->attrs[NETDEV_A_NAPI_THREADED]); ret = napi_set_threaded(napi, threaded); if (ret) return ret; } if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); napi_set_defer_hard_irqs(napi, defer); } if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) { irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]); napi_set_irq_suspend_timeout(napi, irq_suspend_timeout); } if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); napi_set_gro_flush_timeout(napi, gro_flush_timeout); } return 0; } int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) { struct napi_struct *napi; unsigned int napi_id; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) return -EINVAL; napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_set_config(napi, info); netdev_unlock(napi->dev); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); err = -ENOENT; } return err; } static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi) { if (napi && napi_id_valid(napi->napi_id)) return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id); return 0; } static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { struct pp_memory_provider_params *params; struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) || nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) || nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex)) goto nla_put_failure; switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; params = &rxq->mp_params; if (params->mp_ops && params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) goto nla_put_failure; #ifdef CONFIG_XDP_SOCKETS if (rxq->pool) if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) goto nla_put_failure; #endif break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); if (nla_put_napi_id(rsp, txq->napi)) goto nla_put_failure; #ifdef CONFIG_XDP_SOCKETS if (txq->pool) if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) goto nla_put_failure; #endif break; } genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id, u32 q_type) { switch (q_type) { case NETDEV_QUEUE_TYPE_RX: if (q_id >= netdev->real_num_rx_queues) return -EINVAL; return 0; case NETDEV_QUEUE_TYPE_TX: if (q_id >= netdev->real_num_tx_queues) return -EINVAL; } return 0; } static int netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { int err; if (!netdev->up) return -ENOENT; err = netdev_nl_queue_validate(netdev, q_idx, q_type); if (err) return err; return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info); } int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) { u32 q_id, q_type, ifindex; struct net_device *netdev; struct sk_buff *rsp; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX)) return -EINVAL; q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]); q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]); ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info), ifindex); if (netdev) { err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); netdev_unlock_ops_compat(netdev); } else { err = -ENODEV; } if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { int err = 0; if (!netdev->up) return err; for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) { err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx, NETDEV_QUEUE_TYPE_RX, info); if (err) return err; } for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) { err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx, NETDEV_QUEUE_TYPE_TX, info); if (err) return err; } return err; } int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); if (ifindex) { netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); if (netdev) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); netdev_unlock_ops_compat(netdev); } else { err = -ENODEV; } } else { for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->rxq_idx = 0; ctx->txq_idx = 0; } } return err; } #define NETDEV_STAT_NOT_SET (~0ULL) static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size) { const u64 *add = _add; u64 *sum = _sum; while (size) { if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET) *sum += *add; sum++; add++; size -= 8; } } static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value) { if (value == NETDEV_STAT_NOT_SET) return 0; return nla_put_uint(rsp, attr_id, value); } static int netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits)) return -EMSGSIZE; return 0; } static int netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake)) return -EMSGSIZE; return 0; } static int netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp, u32 q_type, int i, const struct genl_info *info) { const struct netdev_stat_ops *ops = netdev->stat_ops; struct netdev_queue_stats_rx rx; struct netdev_queue_stats_tx tx; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) || nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) || nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i)) goto nla_put_failure; switch (q_type) { case NETDEV_QUEUE_TYPE_RX: memset(&rx, 0xff, sizeof(rx)); ops->get_queue_stats_rx(netdev, i, &rx); if (!memchr_inv(&rx, 0xff, sizeof(rx))) goto nla_cancel; if (netdev_nl_stats_write_rx(rsp, &rx)) goto nla_put_failure; break; case NETDEV_QUEUE_TYPE_TX: memset(&tx, 0xff, sizeof(tx)); ops->get_queue_stats_tx(netdev, i, &tx); if (!memchr_inv(&tx, 0xff, sizeof(tx))) goto nla_cancel; if (netdev_nl_stats_write_tx(rsp, &tx)) goto nla_put_failure; break; } genlmsg_end(rsp, hdr); return 0; nla_cancel: genlmsg_cancel(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { const struct netdev_stat_ops *ops = netdev->stat_ops; int i, err; if (!(netdev->flags & IFF_UP)) return 0; i = ctx->rxq_idx; while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) { err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX, i, info); if (err) return err; ctx->rxq_idx = ++i; } i = ctx->txq_idx; while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX, i, info); if (err) return err; ctx->txq_idx = ++i; } ctx->rxq_idx = 0; ctx->txq_idx = 0; return 0; } /** * netdev_stat_queue_sum() - add up queue stats from range of queues * @netdev: net_device * @rx_start: index of the first Rx queue to query * @rx_end: index after the last Rx queue (first *not* to query) * @rx_sum: output Rx stats, should be already initialized * @tx_start: index of the first Tx queue to query * @tx_end: index after the last Tx queue (first *not* to query) * @tx_sum: output Tx stats, should be already initialized * * Add stats from [start, end) range of queue IDs to *x_sum structs. * The sum structs must be already initialized. Usually this * helper is invoked from the .get_base_stats callbacks of drivers * to account for stats of disabled queues. In that case the ranges * are usually [netdev->real_num_*x_queues, netdev->num_*x_queues). */ void netdev_stat_queue_sum(struct net_device *netdev, int rx_start, int rx_end, struct netdev_queue_stats_rx *rx_sum, int tx_start, int tx_end, struct netdev_queue_stats_tx *tx_sum) { const struct netdev_stat_ops *ops; struct netdev_queue_stats_rx rx; struct netdev_queue_stats_tx tx; int i; ops = netdev->stat_ops; for (i = rx_start; i < rx_end; i++) { memset(&rx, 0xff, sizeof(rx)); if (ops->get_queue_stats_rx) ops->get_queue_stats_rx(netdev, i, &rx); netdev_nl_stats_add(rx_sum, &rx, sizeof(rx)); } for (i = tx_start; i < tx_end; i++) { memset(&tx, 0xff, sizeof(tx)); if (ops->get_queue_stats_tx) ops->get_queue_stats_tx(netdev, i, &tx); netdev_nl_stats_add(tx_sum, &tx, sizeof(tx)); } } EXPORT_SYMBOL(netdev_stat_queue_sum); static int netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { struct netdev_queue_stats_rx rx_sum; struct netdev_queue_stats_tx tx_sum; void *hdr; /* Netdev can't guarantee any complete counters */ if (!netdev->stat_ops->get_base_stats) return 0; memset(&rx_sum, 0xff, sizeof(rx_sum)); memset(&tx_sum, 0xff, sizeof(tx_sum)); netdev->stat_ops->get_base_stats(netdev, &rx_sum, &tx_sum); /* The op was there, but nothing reported, don't bother */ if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) && !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum))) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex)) goto nla_put_failure; netdev_stat_queue_sum(netdev, 0, netdev->real_num_rx_queues, &rx_sum, 0, netdev->real_num_tx_queues, &tx_sum); if (netdev_nl_stats_write_rx(rsp, &rx_sum) || netdev_nl_stats_write_tx(rsp, &tx_sum)) goto nla_put_failure; genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope, struct sk_buff *skb, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { if (!netdev->stat_ops) return 0; switch (scope) { case 0: return netdev_nl_stats_by_netdev(netdev, skb, info); case NETDEV_QSTATS_SCOPE_QUEUE: return netdev_nl_stats_by_queue(netdev, skb, info, ctx); } return -EINVAL; /* Should not happen, per netlink policy */ } int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; unsigned int ifindex; unsigned int scope; int err = 0; scope = 0; if (info->attrs[NETDEV_A_QSTATS_SCOPE]) scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); ifindex = 0; if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); if (ifindex) { netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); if (!netdev) { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QSTATS_IFINDEX]); return -ENODEV; } if (netdev->stat_ops) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QSTATS_IFINDEX]); err = -EOPNOTSUPP; } netdev_unlock_ops_compat(netdev); return err; } for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); if (err < 0) break; } return err; } static int netdev_nl_read_rxq_bitmap(struct genl_info *info, u32 rxq_bitmap_len, unsigned long *rxq_bitmap) { const int maxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; struct nlattr *attr; int rem, err = 0; u32 rxq_idx; nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, genlmsg_data(info->genlhdr), genlmsg_len(info->genlhdr), rem) { err = nla_parse_nested(tb, maxtype, attr, netdev_queue_id_nl_policy, info->extack); if (err < 0) return err; if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) return -EINVAL; if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); return -EINVAL; } rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); if (rxq_idx >= rxq_bitmap_len) { NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]); return -EINVAL; } bitmap_set(rxq_bitmap, rxq_idx, 1); } return 0; } static struct device * netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap, struct netlink_ext_ack *extack) { struct device *dma_dev = NULL; u32 rxq_idx, prev_rxq_idx; for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { struct device *rxq_dma_dev; rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx); if (dma_dev && rxq_dma_dev != dma_dev) { NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)", rxq_idx, prev_rxq_idx); return ERR_PTR(-EOPNOTSUPP); } dma_dev = rxq_dma_dev; prev_rxq_idx = rxq_idx; } return dma_dev; } int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) { struct net_devmem_dmabuf_binding *binding; u32 ifindex, dmabuf_fd, rxq_idx; struct netdev_nl_sock *priv; struct net_device *netdev; unsigned long *rxq_bitmap; struct device *dma_dev; struct sk_buff *rsp; int err = 0; void *hdr; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); if (IS_ERR(priv)) return PTR_ERR(priv); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; hdr = genlmsg_iput(rsp, info); if (!hdr) { err = -EMSGSIZE; goto err_genlmsg_free; } mutex_lock(&priv->lock); err = 0; netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); if (!netdev) { err = -ENODEV; goto err_unlock_sock; } if (!netif_device_present(netdev)) err = -ENODEV; else if (!netdev_need_ops_lock(netdev)) err = -EOPNOTSUPP; if (err) { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_DEV_IFINDEX]); goto err_unlock; } rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL); if (!rxq_bitmap) { err = -ENOMEM; goto err_unlock; } err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues, rxq_bitmap); if (err) goto err_rxq_bitmap; dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack); if (IS_ERR(dma_dev)) { err = PTR_ERR(dma_dev); goto err_rxq_bitmap; } binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE, dmabuf_fd, priv, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_rxq_bitmap; } for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding, info->extack); if (err) goto err_unbind; } nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); genlmsg_end(rsp, hdr); err = genlmsg_reply(rsp, info); if (err) goto err_unbind; bitmap_free(rxq_bitmap); netdev_unlock(netdev); mutex_unlock(&priv->lock); return 0; err_unbind: net_devmem_unbind_dmabuf(binding); err_rxq_bitmap: bitmap_free(rxq_bitmap); err_unlock: netdev_unlock(netdev); err_unlock_sock: mutex_unlock(&priv->lock); err_genlmsg_free: nlmsg_free(rsp); return err; } int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) { struct net_devmem_dmabuf_binding *binding; struct netdev_nl_sock *priv; struct net_device *netdev; struct device *dma_dev; u32 ifindex, dmabuf_fd; struct sk_buff *rsp; int err = 0; void *hdr; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); if (IS_ERR(priv)) return PTR_ERR(priv); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; hdr = genlmsg_iput(rsp, info); if (!hdr) { err = -EMSGSIZE; goto err_genlmsg_free; } mutex_lock(&priv->lock); netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); if (!netdev) { err = -ENODEV; goto err_unlock_sock; } if (!netif_device_present(netdev)) { err = -ENODEV; goto err_unlock_netdev; } if (!netdev->netmem_tx) { err = -EOPNOTSUPP; NL_SET_ERR_MSG(info->extack, "Driver does not support netmem TX"); goto err_unlock_netdev; } dma_dev = netdev_queue_get_dma_dev(netdev, 0); binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE, dmabuf_fd, priv, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock_netdev; } nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); genlmsg_end(rsp, hdr); netdev_unlock(netdev); mutex_unlock(&priv->lock); return genlmsg_reply(rsp, info); err_unlock_netdev: netdev_unlock(netdev); err_unlock_sock: mutex_unlock(&priv->lock); err_genlmsg_free: nlmsg_free(rsp); return err; } void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { INIT_LIST_HEAD(&priv->bindings); mutex_init(&priv->lock); } void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv) { struct net_devmem_dmabuf_binding *binding; struct net_devmem_dmabuf_binding *temp; netdevice_tracker dev_tracker; struct net_device *dev; mutex_lock(&priv->lock); list_for_each_entry_safe(binding, temp, &priv->bindings, list) { mutex_lock(&binding->lock); dev = binding->dev; if (!dev) { mutex_unlock(&binding->lock); net_devmem_unbind_dmabuf(binding); continue; } netdev_hold(dev, &dev_tracker, GFP_KERNEL); mutex_unlock(&binding->lock); netdev_lock(dev); net_devmem_unbind_dmabuf(binding); netdev_unlock(dev); netdev_put(dev, &dev_tracker); } mutex_unlock(&priv->lock); } static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REGISTER: netdev_lock_ops_to_full(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); netdev_unlock_full_to_ops(netdev); break; case NETDEV_UNREGISTER: netdev_lock(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); netdev_unlock(netdev); break; case NETDEV_XDP_FEAT_CHANGE: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); break; } return NOTIFY_OK; } static struct notifier_block netdev_genl_nb = { .notifier_call = netdev_genl_netdevice_event, }; static int __init netdev_genl_init(void) { int err; err = register_netdevice_notifier(&netdev_genl_nb); if (err) return err; err = genl_register_family(&netdev_nl_family); if (err) goto err_unreg_ntf; return 0; err_unreg_ntf: unregister_netdevice_notifier(&netdev_genl_nb); return err; } subsys_initcall(netdev_genl_init);
3 2656 3 2658 2587 2656 2584 2658 3109 60 55 410 404 1357 874 1358 874 4678 2051 2353 2515 2262 60 60 59 55 60 59 46 60 630 629 991 7 370 341 3550 951 5041 5045 3122 4287 994 420 272 205 420 60 369 370 36 2 356 356 2 356 356 356 148 148 37 1 36 36 36 2236 2563 2233 5036 848 4254 1048 425 9 724 319 319 255 77 1004 1005 1008 909 683 228 229 1045 1047 1050 4069 30 3298 813 3298 717 2 385 1 1 1 388 7 3498 5039 1047 4071 5044 39 5042 5054 5044 5047 393 393 2 3 1 1 1 393 393 4 3 93 1 310 1 2 10 10 10 10 10 10 384 392 391 2 3 390 6 9 385 384 384 46 1 21 24 2 4 2 3 3 2 1 4 32 30 29 31 7 32 33 33 33 33 1 1 7 32 31 9 13 34 7 4 29 20 17 16 1 9 9 20 38 38 37 37 38 36 36 37 7 7 7 7 7 5 540 540 7 68 68 32 46 67 68 31 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 // SPDX-License-Identifier: GPL-2.0 /* * NETLINK Generic Netlink Family * * Authors: Jamal Hadi Salim * Thomas Graf <tgraf@suug.ch> * Johannes Berg <johannes@sipsolutions.net> */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/string_helpers.h> #include <linux/skbuff.h> #include <linux/mutex.h> #include <linux/bitmap.h> #include <linux/rwsem.h> #include <linux/idr.h> #include <net/sock.h> #include <net/genetlink.h> #include "genetlink.h" static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ static DECLARE_RWSEM(cb_lock); atomic_t genl_sk_destructing_cnt = ATOMIC_INIT(0); DECLARE_WAIT_QUEUE_HEAD(genl_sk_destructing_waitq); void genl_lock(void) { mutex_lock(&genl_mutex); } EXPORT_SYMBOL(genl_lock); void genl_unlock(void) { mutex_unlock(&genl_mutex); } EXPORT_SYMBOL(genl_unlock); static void genl_lock_all(void) { down_write(&cb_lock); genl_lock(); } static void genl_unlock_all(void) { genl_unlock(); up_write(&cb_lock); } static void genl_op_lock(const struct genl_family *family) { if (!family->parallel_ops) genl_lock(); } static void genl_op_unlock(const struct genl_family *family) { if (!family->parallel_ops) genl_unlock(); } static DEFINE_IDR(genl_fam_idr); /* * Bitmap of multicast groups that are currently in use. * * To avoid an allocation at boot of just one unsigned long, * declare it global instead. * Bit 0 is marked as already used since group 0 is invalid. * Bit 1 is marked as already used since the drop-monitor code * abuses the API and thinks it can statically use group 1. * That group will typically conflict with other groups that * any proper users use. * Bit 16 is marked as used since it's used for generic netlink * and the code no longer marks pre-reserved IDs as used. * Bit 17 is marked as already used since the VFS quota code * also abused this API and relied on family == group ID, we * cater to that by giving it a static family and group ID. * Bit 18 is marked as already used since the PMCRAID driver * did the same thing as the VFS quota code (maybe copied?) */ static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) | BIT(GENL_ID_VFS_DQUOT) | BIT(GENL_ID_PMCRAID); static unsigned long *mc_groups = &mc_group_start; static unsigned long mc_groups_longs = 1; /* We need the last attribute with non-zero ID therefore a 2-entry array */ static struct nla_policy genl_policy_reject_all[] = { { .type = NLA_REJECT }, { .type = NLA_REJECT }, }; static int genl_ctrl_event(int event, const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id); static void genl_op_fill_in_reject_policy(const struct genl_family *family, struct genl_ops *op) { BUILD_BUG_ON(ARRAY_SIZE(genl_policy_reject_all) - 1 != 1); if (op->policy || op->cmd < family->resv_start_op) return; op->policy = genl_policy_reject_all; op->maxattr = 1; } static void genl_op_fill_in_reject_policy_split(const struct genl_family *family, struct genl_split_ops *op) { if (op->policy) return; op->policy = genl_policy_reject_all; op->maxattr = 1; } static const struct genl_family *genl_family_find_byid(unsigned int id) { return idr_find(&genl_fam_idr, id); } static const struct genl_family *genl_family_find_byname(char *name) { const struct genl_family *family; unsigned int id; idr_for_each_entry(&genl_fam_idr, family, id) if (strcmp(family->name, name) == 0) return family; return NULL; } struct genl_op_iter { const struct genl_family *family; struct genl_split_ops doit; struct genl_split_ops dumpit; int cmd_idx; int entry_idx; u32 cmd; u8 flags; }; static void genl_op_from_full(const struct genl_family *family, unsigned int i, struct genl_ops *op) { *op = family->ops[i]; if (!op->maxattr) op->maxattr = family->maxattr; if (!op->policy) op->policy = family->policy; genl_op_fill_in_reject_policy(family, op); } static int genl_get_cmd_full(u32 cmd, const struct genl_family *family, struct genl_ops *op) { int i; for (i = 0; i < family->n_ops; i++) if (family->ops[i].cmd == cmd) { genl_op_from_full(family, i, op); return 0; } return -ENOENT; } static void genl_op_from_small(const struct genl_family *family, unsigned int i, struct genl_ops *op) { memset(op, 0, sizeof(*op)); op->doit = family->small_ops[i].doit; op->dumpit = family->small_ops[i].dumpit; op->cmd = family->small_ops[i].cmd; op->internal_flags = family->small_ops[i].internal_flags; op->flags = family->small_ops[i].flags; op->validate = family->small_ops[i].validate; op->maxattr = family->maxattr; op->policy = family->policy; genl_op_fill_in_reject_policy(family, op); } static int genl_get_cmd_small(u32 cmd, const struct genl_family *family, struct genl_ops *op) { int i; for (i = 0; i < family->n_small_ops; i++) if (family->small_ops[i].cmd == cmd) { genl_op_from_small(family, i, op); return 0; } return -ENOENT; } static void genl_op_from_split(struct genl_op_iter *iter) { const struct genl_family *family = iter->family; int i, cnt = 0; i = iter->entry_idx - family->n_ops - family->n_small_ops; if (family->split_ops[i + cnt].flags & GENL_CMD_CAP_DO) { iter->doit = family->split_ops[i + cnt]; genl_op_fill_in_reject_policy_split(family, &iter->doit); cnt++; } else { memset(&iter->doit, 0, sizeof(iter->doit)); } if (i + cnt < family->n_split_ops && family->split_ops[i + cnt].flags & GENL_CMD_CAP_DUMP && (!cnt || family->split_ops[i + cnt].cmd == iter->doit.cmd)) { iter->dumpit = family->split_ops[i + cnt]; genl_op_fill_in_reject_policy_split(family, &iter->dumpit); cnt++; } else { memset(&iter->dumpit, 0, sizeof(iter->dumpit)); } WARN_ON(!cnt); iter->entry_idx += cnt; } static int genl_get_cmd_split(u32 cmd, u8 flag, const struct genl_family *family, struct genl_split_ops *op) { int i; for (i = 0; i < family->n_split_ops; i++) if (family->split_ops[i].cmd == cmd && family->split_ops[i].flags & flag) { *op = family->split_ops[i]; return 0; } return -ENOENT; } static int genl_cmd_full_to_split(struct genl_split_ops *op, const struct genl_family *family, const struct genl_ops *full, u8 flags) { if ((flags & GENL_CMD_CAP_DO && !full->doit) || (flags & GENL_CMD_CAP_DUMP && !full->dumpit)) { memset(op, 0, sizeof(*op)); return -ENOENT; } if (flags & GENL_CMD_CAP_DUMP) { op->start = full->start; op->dumpit = full->dumpit; op->done = full->done; } else { op->pre_doit = family->pre_doit; op->doit = full->doit; op->post_doit = family->post_doit; } if (flags & GENL_CMD_CAP_DUMP && full->validate & GENL_DONT_VALIDATE_DUMP) { op->policy = NULL; op->maxattr = 0; } else { op->policy = full->policy; op->maxattr = full->maxattr; } op->cmd = full->cmd; op->internal_flags = full->internal_flags; op->flags = full->flags; op->validate = full->validate; /* Make sure flags include the GENL_CMD_CAP_DO / GENL_CMD_CAP_DUMP */ op->flags |= flags; return 0; } /* Must make sure that op is initialized to 0 on failure */ static int genl_get_cmd(u32 cmd, u8 flags, const struct genl_family *family, struct genl_split_ops *op) { struct genl_ops full; int err; err = genl_get_cmd_full(cmd, family, &full); if (err == -ENOENT) err = genl_get_cmd_small(cmd, family, &full); /* Found one of legacy forms */ if (err == 0) return genl_cmd_full_to_split(op, family, &full, flags); err = genl_get_cmd_split(cmd, flags, family, op); if (err) memset(op, 0, sizeof(*op)); return err; } /* For policy dumping only, get ops of both do and dump. * Fail if both are missing, genl_get_cmd() will zero-init in case of failure. */ static int genl_get_cmd_both(u32 cmd, const struct genl_family *family, struct genl_split_ops *doit, struct genl_split_ops *dumpit) { int err1, err2; err1 = genl_get_cmd(cmd, GENL_CMD_CAP_DO, family, doit); err2 = genl_get_cmd(cmd, GENL_CMD_CAP_DUMP, family, dumpit); return err1 && err2 ? -ENOENT : 0; } static bool genl_op_iter_init(const struct genl_family *family, struct genl_op_iter *iter) { iter->family = family; iter->cmd_idx = 0; iter->entry_idx = 0; iter->flags = 0; return iter->family->n_ops + iter->family->n_small_ops + iter->family->n_split_ops; } static bool genl_op_iter_next(struct genl_op_iter *iter) { const struct genl_family *family = iter->family; bool legacy_op = true; struct genl_ops op; if (iter->entry_idx < family->n_ops) { genl_op_from_full(family, iter->entry_idx, &op); } else if (iter->entry_idx < family->n_ops + family->n_small_ops) { genl_op_from_small(family, iter->entry_idx - family->n_ops, &op); } else if (iter->entry_idx < family->n_ops + family->n_small_ops + family->n_split_ops) { legacy_op = false; /* updates entry_idx */ genl_op_from_split(iter); } else { return false; } iter->cmd_idx++; if (legacy_op) { iter->entry_idx++; genl_cmd_full_to_split(&iter->doit, family, &op, GENL_CMD_CAP_DO); genl_cmd_full_to_split(&iter->dumpit, family, &op, GENL_CMD_CAP_DUMP); } iter->cmd = iter->doit.cmd | iter->dumpit.cmd; iter->flags = iter->doit.flags | iter->dumpit.flags; return true; } static void genl_op_iter_copy(struct genl_op_iter *dst, struct genl_op_iter *src) { *dst = *src; } static unsigned int genl_op_iter_idx(struct genl_op_iter *iter) { return iter->cmd_idx; } static int genl_allocate_reserve_groups(int n_groups, int *first_id) { unsigned long *new_groups; int start = 0; int i; int id; bool fits; do { if (start == 0) id = find_first_zero_bit(mc_groups, mc_groups_longs * BITS_PER_LONG); else id = find_next_zero_bit(mc_groups, mc_groups_longs * BITS_PER_LONG, start); fits = true; for (i = id; i < min_t(int, id + n_groups, mc_groups_longs * BITS_PER_LONG); i++) { if (test_bit(i, mc_groups)) { start = i; fits = false; break; } } if (id + n_groups > mc_groups_longs * BITS_PER_LONG) { unsigned long new_longs = mc_groups_longs + BITS_TO_LONGS(n_groups); size_t nlen = new_longs * sizeof(unsigned long); if (mc_groups == &mc_group_start) { new_groups = kzalloc(nlen, GFP_KERNEL); if (!new_groups) return -ENOMEM; mc_groups = new_groups; *mc_groups = mc_group_start; } else { new_groups = krealloc(mc_groups, nlen, GFP_KERNEL); if (!new_groups) return -ENOMEM; mc_groups = new_groups; for (i = 0; i < BITS_TO_LONGS(n_groups); i++) mc_groups[mc_groups_longs + i] = 0; } mc_groups_longs = new_longs; } } while (!fits); for (i = id; i < id + n_groups; i++) set_bit(i, mc_groups); *first_id = id; return 0; } static struct genl_family genl_ctrl; static int genl_validate_assign_mc_groups(struct genl_family *family) { int first_id; int n_groups = family->n_mcgrps; int err = 0, i; bool groups_allocated = false; if (!n_groups) return 0; for (i = 0; i < n_groups; i++) { const struct genl_multicast_group *grp = &family->mcgrps[i]; if (WARN_ON(grp->name[0] == '\0')) return -EINVAL; if (WARN_ON(!string_is_terminated(grp->name, GENL_NAMSIZ))) return -EINVAL; } /* special-case our own group and hacks */ if (family == &genl_ctrl) { first_id = GENL_ID_CTRL; BUG_ON(n_groups != 1); } else if (strcmp(family->name, "NET_DM") == 0) { first_id = 1; BUG_ON(n_groups != 1); } else if (family->id == GENL_ID_VFS_DQUOT) { first_id = GENL_ID_VFS_DQUOT; BUG_ON(n_groups != 1); } else if (family->id == GENL_ID_PMCRAID) { first_id = GENL_ID_PMCRAID; BUG_ON(n_groups != 1); } else { groups_allocated = true; err = genl_allocate_reserve_groups(n_groups, &first_id); if (err) return err; } family->mcgrp_offset = first_id; /* if still initializing, can't and don't need to realloc bitmaps */ if (!init_net.genl_sock) return 0; if (family->netnsok) { struct net *net; netlink_table_grab(); rcu_read_lock(); for_each_net_rcu(net) { err = __netlink_change_ngroups(net->genl_sock, mc_groups_longs * BITS_PER_LONG); if (err) { /* * No need to roll back, can only fail if * memory allocation fails and then the * number of _possible_ groups has been * increased on some sockets which is ok. */ break; } } rcu_read_unlock(); netlink_table_ungrab(); } else { err = netlink_change_ngroups(init_net.genl_sock, mc_groups_longs * BITS_PER_LONG); } if (groups_allocated && err) { for (i = 0; i < family->n_mcgrps; i++) clear_bit(family->mcgrp_offset + i, mc_groups); } return err; } static void genl_unregister_mc_groups(const struct genl_family *family) { struct net *net; int i; netlink_table_grab(); rcu_read_lock(); for_each_net_rcu(net) { for (i = 0; i < family->n_mcgrps; i++) __netlink_clear_multicast_users( net->genl_sock, family->mcgrp_offset + i); } rcu_read_unlock(); netlink_table_ungrab(); for (i = 0; i < family->n_mcgrps; i++) { int grp_id = family->mcgrp_offset + i; if (grp_id != 1) clear_bit(grp_id, mc_groups); genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, family, &family->mcgrps[i], grp_id); } } static bool genl_split_op_check(const struct genl_split_ops *op) { if (WARN_ON(hweight8(op->flags & (GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP)) != 1)) return true; return false; } static int genl_validate_ops(const struct genl_family *family) { struct genl_op_iter i, j; unsigned int s; if (WARN_ON(family->n_ops && !family->ops) || WARN_ON(family->n_small_ops && !family->small_ops) || WARN_ON(family->n_split_ops && !family->split_ops)) return -EINVAL; for (genl_op_iter_init(family, &i); genl_op_iter_next(&i); ) { if (!(i.flags & (GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP))) return -EINVAL; if (WARN_ON(i.cmd >= family->resv_start_op && (i.doit.validate || i.dumpit.validate))) return -EINVAL; genl_op_iter_copy(&j, &i); while (genl_op_iter_next(&j)) { if (i.cmd == j.cmd) return -EINVAL; } } if (family->n_split_ops) { if (genl_split_op_check(&family->split_ops[0])) return -EINVAL; } for (s = 1; s < family->n_split_ops; s++) { const struct genl_split_ops *a, *b; a = &family->split_ops[s - 1]; b = &family->split_ops[s]; if (genl_split_op_check(b)) return -EINVAL; /* Check sort order */ if (a->cmd < b->cmd) { continue; } else if (a->cmd > b->cmd) { WARN_ON(1); return -EINVAL; } if (a->internal_flags != b->internal_flags || ((a->flags ^ b->flags) & ~(GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP))) { WARN_ON(1); return -EINVAL; } if ((a->flags & GENL_CMD_CAP_DO) && (b->flags & GENL_CMD_CAP_DUMP)) continue; WARN_ON(1); return -EINVAL; } return 0; } static void *genl_sk_priv_alloc(struct genl_family *family) { void *priv; priv = kzalloc(family->sock_priv_size, GFP_KERNEL); if (!priv) return ERR_PTR(-ENOMEM); if (family->sock_priv_init) family->sock_priv_init(priv); return priv; } static void genl_sk_priv_free(const struct genl_family *family, void *priv) { if (family->sock_priv_destroy) family->sock_priv_destroy(priv); kfree(priv); } static int genl_sk_privs_alloc(struct genl_family *family) { if (!family->sock_priv_size) return 0; family->sock_privs = kzalloc(sizeof(*family->sock_privs), GFP_KERNEL); if (!family->sock_privs) return -ENOMEM; xa_init(family->sock_privs); return 0; } static void genl_sk_privs_free(const struct genl_family *family) { unsigned long id; void *priv; if (!family->sock_priv_size) return; xa_for_each(family->sock_privs, id, priv) genl_sk_priv_free(family, priv); xa_destroy(family->sock_privs); kfree(family->sock_privs); } static void genl_sk_priv_free_by_sock(struct genl_family *family, struct sock *sk) { void *priv; if (!family->sock_priv_size) return; priv = xa_erase(family->sock_privs, (unsigned long) sk); if (!priv) return; genl_sk_priv_free(family, priv); } static void genl_release(struct sock *sk, unsigned long *groups) { struct genl_family *family; unsigned int id; down_read(&cb_lock); idr_for_each_entry(&genl_fam_idr, family, id) genl_sk_priv_free_by_sock(family, sk); up_read(&cb_lock); } /** * __genl_sk_priv_get - Get family private pointer for socket, if exists * * @family: family * @sk: socket * * Lookup a private memory for a Generic netlink family and specified socket. * * Caller should make sure this is called in RCU read locked section. * * Return: valid pointer on success, otherwise negative error value * encoded by ERR_PTR(), NULL in case priv does not exist. */ void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk) { if (WARN_ON_ONCE(!family->sock_privs)) return ERR_PTR(-EINVAL); return xa_load(family->sock_privs, (unsigned long) sk); } /** * genl_sk_priv_get - Get family private pointer for socket * * @family: family * @sk: socket * * Lookup a private memory for a Generic netlink family and specified socket. * Allocate the private memory in case it was not already done. * * Return: valid pointer on success, otherwise negative error value * encoded by ERR_PTR(). */ void *genl_sk_priv_get(struct genl_family *family, struct sock *sk) { void *priv, *old_priv; priv = __genl_sk_priv_get(family, sk); if (priv) return priv; /* priv for the family does not exist so far, create it. */ priv = genl_sk_priv_alloc(family); if (IS_ERR(priv)) return ERR_CAST(priv); old_priv = xa_cmpxchg(family->sock_privs, (unsigned long) sk, NULL, priv, GFP_KERNEL); if (old_priv) { genl_sk_priv_free(family, priv); if (xa_is_err(old_priv)) return ERR_PTR(xa_err(old_priv)); /* Race happened, priv for the socket was already inserted. */ return old_priv; } return priv; } /** * genl_register_family - register a generic netlink family * @family: generic netlink family * * Registers the specified family after validating it first. Only one * family may be registered with the same family name or identifier. * * The family's ops, multicast groups and module pointer must already * be assigned. * * Return 0 on success or a negative error code. */ int genl_register_family(struct genl_family *family) { int err, i; int start = GENL_START_ALLOC, end = GENL_MAX_ID; err = genl_validate_ops(family); if (err) return err; genl_lock_all(); if (genl_family_find_byname(family->name)) { err = -EEXIST; goto errout_locked; } err = genl_sk_privs_alloc(family); if (err) goto errout_locked; /* * Sadly, a few cases need to be special-cased * due to them having previously abused the API * and having used their family ID also as their * multicast group ID, so we use reserved IDs * for both to be sure we can do that mapping. */ if (family == &genl_ctrl) { /* and this needs to be special for initial family lookups */ start = end = GENL_ID_CTRL; } else if (strcmp(family->name, "pmcraid") == 0) { start = end = GENL_ID_PMCRAID; } else if (strcmp(family->name, "VFS_DQUOT") == 0) { start = end = GENL_ID_VFS_DQUOT; } family->id = idr_alloc_cyclic(&genl_fam_idr, family, start, end + 1, GFP_KERNEL); if (family->id < 0) { err = family->id; goto errout_sk_privs_free; } err = genl_validate_assign_mc_groups(family); if (err) goto errout_remove; genl_unlock_all(); /* send all events */ genl_ctrl_event(CTRL_CMD_NEWFAMILY, family, NULL, 0); for (i = 0; i < family->n_mcgrps; i++) genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, family, &family->mcgrps[i], family->mcgrp_offset + i); return 0; errout_remove: idr_remove(&genl_fam_idr, family->id); errout_sk_privs_free: genl_sk_privs_free(family); errout_locked: genl_unlock_all(); return err; } EXPORT_SYMBOL(genl_register_family); /** * genl_unregister_family - unregister generic netlink family * @family: generic netlink family * * Unregisters the specified family. * * Returns 0 on success or a negative error code. */ int genl_unregister_family(const struct genl_family *family) { genl_lock_all(); if (!genl_family_find_byid(family->id)) { genl_unlock_all(); return -ENOENT; } genl_unregister_mc_groups(family); idr_remove(&genl_fam_idr, family->id); up_write(&cb_lock); wait_event(genl_sk_destructing_waitq, atomic_read(&genl_sk_destructing_cnt) == 0); genl_sk_privs_free(family); genl_unlock(); genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0); return 0; } EXPORT_SYMBOL(genl_unregister_family); /** * genlmsg_put - Add generic netlink header to netlink message * @skb: socket buffer holding the message * @portid: netlink portid the message is addressed to * @seq: sequence number (usually the one of the sender) * @family: generic netlink family * @flags: netlink message flags * @cmd: generic netlink command * * Returns pointer to user specific header */ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, const struct genl_family *family, int flags, u8 cmd) { struct nlmsghdr *nlh; struct genlmsghdr *hdr; nlh = nlmsg_put(skb, portid, seq, family->id, GENL_HDRLEN + family->hdrsize, flags); if (nlh == NULL) return NULL; hdr = nlmsg_data(nlh); hdr->cmd = cmd; hdr->version = family->version; hdr->reserved = 0; return (char *) hdr + GENL_HDRLEN; } EXPORT_SYMBOL(genlmsg_put); static struct genl_dumpit_info *genl_dumpit_info_alloc(void) { return kmalloc(sizeof(struct genl_dumpit_info), GFP_KERNEL); } static void genl_dumpit_info_free(const struct genl_dumpit_info *info) { kfree(info); } static struct nlattr ** genl_family_rcv_msg_attrs_parse(const struct genl_family *family, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, const struct genl_split_ops *ops, int hdrlen, enum genl_validate_flags no_strict_flag) { enum netlink_validation validate = ops->validate & no_strict_flag ? NL_VALIDATE_LIBERAL : NL_VALIDATE_STRICT; struct nlattr **attrbuf; int err; if (!ops->maxattr) return NULL; attrbuf = kmalloc_array(ops->maxattr + 1, sizeof(struct nlattr *), GFP_KERNEL); if (!attrbuf) return ERR_PTR(-ENOMEM); err = __nlmsg_parse(nlh, hdrlen, attrbuf, ops->maxattr, ops->policy, validate, extack); if (err) { kfree(attrbuf); return ERR_PTR(err); } return attrbuf; } static void genl_family_rcv_msg_attrs_free(struct nlattr **attrbuf) { kfree(attrbuf); } struct genl_start_context { const struct genl_family *family; struct nlmsghdr *nlh; struct netlink_ext_ack *extack; const struct genl_split_ops *ops; int hdrlen; }; static int genl_start(struct netlink_callback *cb) { struct genl_start_context *ctx = cb->data; const struct genl_split_ops *ops; struct genl_dumpit_info *info; struct nlattr **attrs = NULL; int rc = 0; ops = ctx->ops; if (!(ops->validate & GENL_DONT_VALIDATE_DUMP) && ctx->nlh->nlmsg_len < nlmsg_msg_size(ctx->hdrlen)) return -EINVAL; attrs = genl_family_rcv_msg_attrs_parse(ctx->family, ctx->nlh, ctx->extack, ops, ctx->hdrlen, GENL_DONT_VALIDATE_DUMP_STRICT); if (IS_ERR(attrs)) return PTR_ERR(attrs); info = genl_dumpit_info_alloc(); if (!info) { genl_family_rcv_msg_attrs_free(attrs); return -ENOMEM; } info->op = *ops; info->info.family = ctx->family; info->info.snd_seq = cb->nlh->nlmsg_seq; info->info.snd_portid = NETLINK_CB(cb->skb).portid; info->info.nlhdr = cb->nlh; info->info.genlhdr = nlmsg_data(cb->nlh); info->info.attrs = attrs; genl_info_net_set(&info->info, sock_net(cb->skb->sk)); info->info.extack = cb->extack; memset(&info->info.ctx, 0, sizeof(info->info.ctx)); cb->data = info; if (ops->start) { genl_op_lock(ctx->family); rc = ops->start(cb); genl_op_unlock(ctx->family); } if (rc) { genl_family_rcv_msg_attrs_free(info->info.attrs); genl_dumpit_info_free(info); cb->data = NULL; } return rc; } static int genl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct genl_dumpit_info *dump_info = cb->data; const struct genl_split_ops *ops = &dump_info->op; struct genl_info *info = &dump_info->info; int rc; info->extack = cb->extack; genl_op_lock(info->family); rc = ops->dumpit(skb, cb); genl_op_unlock(info->family); return rc; } static int genl_done(struct netlink_callback *cb) { struct genl_dumpit_info *dump_info = cb->data; const struct genl_split_ops *ops = &dump_info->op; struct genl_info *info = &dump_info->info; int rc = 0; info->extack = cb->extack; if (ops->done) { genl_op_lock(info->family); rc = ops->done(cb); genl_op_unlock(info->family); } genl_family_rcv_msg_attrs_free(info->attrs); genl_dumpit_info_free(dump_info); return rc; } static int genl_family_rcv_msg_dumpit(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, const struct genl_split_ops *ops, int hdrlen, struct net *net) { struct genl_start_context ctx; struct netlink_dump_control c = { .module = family->module, .data = &ctx, .start = genl_start, .dump = genl_dumpit, .done = genl_done, .extack = extack, }; int err; ctx.family = family; ctx.nlh = nlh; ctx.extack = extack; ctx.ops = ops; ctx.hdrlen = hdrlen; genl_op_unlock(family); err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); genl_op_lock(family); return err; } static int genl_family_rcv_msg_doit(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, const struct genl_split_ops *ops, int hdrlen, struct net *net) { struct nlattr **attrbuf; struct genl_info info; int err; attrbuf = genl_family_rcv_msg_attrs_parse(family, nlh, extack, ops, hdrlen, GENL_DONT_VALIDATE_STRICT); if (IS_ERR(attrbuf)) return PTR_ERR(attrbuf); info.snd_seq = nlh->nlmsg_seq; info.snd_portid = NETLINK_CB(skb).portid; info.family = family; info.nlhdr = nlh; info.genlhdr = nlmsg_data(nlh); info.attrs = attrbuf; info.extack = extack; genl_info_net_set(&info, net); memset(&info.ctx, 0, sizeof(info.ctx)); if (ops->pre_doit) { err = ops->pre_doit(ops, skb, &info); if (err) goto out; } err = ops->doit(skb, &info); if (ops->post_doit) ops->post_doit(ops, skb, &info); out: genl_family_rcv_msg_attrs_free(attrbuf); return err; } static int genl_header_check(const struct genl_family *family, struct nlmsghdr *nlh, struct genlmsghdr *hdr, struct netlink_ext_ack *extack) { u16 flags; /* Only for commands added after we started validating */ if (hdr->cmd < family->resv_start_op) return 0; if (hdr->reserved) { NL_SET_ERR_MSG(extack, "genlmsghdr.reserved field is not 0"); return -EINVAL; } /* Old netlink flags have pretty loose semantics, allow only the flags * consumed by the core where we can enforce the meaning. */ flags = nlh->nlmsg_flags; if ((flags & NLM_F_DUMP) == NLM_F_DUMP) /* DUMP is 2 bits */ flags &= ~NLM_F_DUMP; if (flags & ~(NLM_F_REQUEST | NLM_F_ACK | NLM_F_ECHO)) { NL_SET_ERR_MSG(extack, "ambiguous or reserved bits set in nlmsg_flags"); return -EINVAL; } return 0; } static int genl_family_rcv_msg(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct genlmsghdr *hdr = nlmsg_data(nlh); struct genl_split_ops op; int hdrlen; u8 flags; /* this family doesn't exist in this netns */ if (!family->netnsok && !net_eq(net, &init_net)) return -ENOENT; hdrlen = GENL_HDRLEN + family->hdrsize; if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; if (genl_header_check(family, nlh, hdr, extack)) return -EINVAL; flags = (nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP ? GENL_CMD_CAP_DUMP : GENL_CMD_CAP_DO; if (genl_get_cmd(hdr->cmd, flags, family, &op)) return -EOPNOTSUPP; if ((op.flags & GENL_ADMIN_PERM) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if ((op.flags & GENL_UNS_ADMIN_PERM) && !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (flags & GENL_CMD_CAP_DUMP) return genl_family_rcv_msg_dumpit(family, skb, nlh, extack, &op, hdrlen, net); else return genl_family_rcv_msg_doit(family, skb, nlh, extack, &op, hdrlen, net); } static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { const struct genl_family *family; int err; family = genl_family_find_byid(nlh->nlmsg_type); if (family == NULL) return -ENOENT; genl_op_lock(family); err = genl_family_rcv_msg(family, skb, nlh, extack); genl_op_unlock(family); return err; } static void genl_rcv(struct sk_buff *skb) { down_read(&cb_lock); netlink_rcv_skb(skb, &genl_rcv_msg); up_read(&cb_lock); } /************************************************************************** * Controller **************************************************************************/ static struct genl_family genl_ctrl; static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { struct genl_op_iter i; void *hdr; hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd); if (hdr == NULL) return -EMSGSIZE; if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) || nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id) || nla_put_u32(skb, CTRL_ATTR_VERSION, family->version) || nla_put_u32(skb, CTRL_ATTR_HDRSIZE, family->hdrsize) || nla_put_u32(skb, CTRL_ATTR_MAXATTR, family->maxattr)) goto nla_put_failure; if (genl_op_iter_init(family, &i)) { struct nlattr *nla_ops; nla_ops = nla_nest_start_noflag(skb, CTRL_ATTR_OPS); if (nla_ops == NULL) goto nla_put_failure; while (genl_op_iter_next(&i)) { struct nlattr *nest; u32 op_flags; op_flags = i.flags; if (i.doit.policy || i.dumpit.policy) op_flags |= GENL_CMD_CAP_HASPOL; nest = nla_nest_start_noflag(skb, genl_op_iter_idx(&i)); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, CTRL_ATTR_OP_ID, i.cmd) || nla_put_u32(skb, CTRL_ATTR_OP_FLAGS, op_flags)) goto nla_put_failure; nla_nest_end(skb, nest); } nla_nest_end(skb, nla_ops); } if (family->n_mcgrps) { struct nlattr *nla_grps; int i; nla_grps = nla_nest_start_noflag(skb, CTRL_ATTR_MCAST_GROUPS); if (nla_grps == NULL) goto nla_put_failure; for (i = 0; i < family->n_mcgrps; i++) { struct nlattr *nest; const struct genl_multicast_group *grp; grp = &family->mcgrps[i]; nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, family->mcgrp_offset + i) || nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME, grp->name)) goto nla_put_failure; nla_nest_end(skb, nest); } nla_nest_end(skb, nla_grps); } genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ctrl_fill_mcgrp_info(const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; struct nlattr *nla_grps; struct nlattr *nest; hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd); if (hdr == NULL) return -1; if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) || nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id)) goto nla_put_failure; nla_grps = nla_nest_start_noflag(skb, CTRL_ATTR_MCAST_GROUPS); if (nla_grps == NULL) goto nla_put_failure; nest = nla_nest_start_noflag(skb, 1); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, grp_id) || nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME, grp->name)) goto nla_put_failure; nla_nest_end(skb, nest); nla_nest_end(skb, nla_grps); genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) { int n = 0; struct genl_family *rt; struct net *net = sock_net(skb->sk); int fams_to_skip = cb->args[0]; unsigned int id; int err = 0; idr_for_each_entry(&genl_fam_idr, rt, id) { if (!rt->netnsok && !net_eq(net, &init_net)) continue; if (n++ < fams_to_skip) continue; err = ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, CTRL_CMD_NEWFAMILY); if (err) { n--; break; } } cb->args[0] = n; return err; } static struct sk_buff *ctrl_build_family_msg(const struct genl_family *family, u32 portid, int seq, u8 cmd) { struct sk_buff *skb; int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb == NULL) return ERR_PTR(-ENOBUFS); err = ctrl_fill_info(family, portid, seq, 0, skb, cmd); if (err < 0) { nlmsg_free(skb); return ERR_PTR(err); } return skb; } static struct sk_buff * ctrl_build_mcgrp_msg(const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id, u32 portid, int seq, u8 cmd) { struct sk_buff *skb; int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb == NULL) return ERR_PTR(-ENOBUFS); err = ctrl_fill_mcgrp_info(family, grp, grp_id, portid, seq, 0, skb, cmd); if (err < 0) { nlmsg_free(skb); return ERR_PTR(err); } return skb; } static const struct nla_policy ctrl_policy_family[] = { [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING, .len = GENL_NAMSIZ - 1 }, }; static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; const struct genl_family *res = NULL; int err = -EINVAL; if (info->attrs[CTRL_ATTR_FAMILY_ID]) { u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]); res = genl_family_find_byid(id); err = -ENOENT; } if (info->attrs[CTRL_ATTR_FAMILY_NAME]) { char *name; name = nla_data(info->attrs[CTRL_ATTR_FAMILY_NAME]); res = genl_family_find_byname(name); #ifdef CONFIG_MODULES if (res == NULL) { genl_unlock(); up_read(&cb_lock); request_module("net-pf-%d-proto-%d-family-%s", PF_NETLINK, NETLINK_GENERIC, name); down_read(&cb_lock); genl_lock(); res = genl_family_find_byname(name); } #endif err = -ENOENT; } if (res == NULL) return err; if (!res->netnsok && !net_eq(genl_info_net(info), &init_net)) { /* family doesn't exist here */ return -ENOENT; } msg = ctrl_build_family_msg(res, info->snd_portid, info->snd_seq, CTRL_CMD_NEWFAMILY); if (IS_ERR(msg)) return PTR_ERR(msg); return genlmsg_reply(msg, info); } static int genl_ctrl_event(int event, const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id) { struct sk_buff *msg; /* genl is still initialising */ if (!init_net.genl_sock) return 0; switch (event) { case CTRL_CMD_NEWFAMILY: case CTRL_CMD_DELFAMILY: WARN_ON(grp); msg = ctrl_build_family_msg(family, 0, 0, event); break; case CTRL_CMD_NEWMCAST_GRP: case CTRL_CMD_DELMCAST_GRP: BUG_ON(!grp); msg = ctrl_build_mcgrp_msg(family, grp, grp_id, 0, 0, event); break; default: return -EINVAL; } if (IS_ERR(msg)) return PTR_ERR(msg); if (!family->netnsok) genlmsg_multicast_netns(&genl_ctrl, &init_net, msg, 0, 0, GFP_KERNEL); else genlmsg_multicast_allns(&genl_ctrl, msg, 0, 0); return 0; } struct ctrl_dump_policy_ctx { struct netlink_policy_dump_state *state; const struct genl_family *rt; struct genl_op_iter *op_iter; u32 op; u16 fam_id; u8 dump_map:1, single_op:1; }; static const struct nla_policy ctrl_policy_policy[] = { [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING, .len = GENL_NAMSIZ - 1 }, [CTRL_ATTR_OP] = { .type = NLA_U32 }, }; static int ctrl_dumppolicy_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; struct nlattr **tb = info->info.attrs; const struct genl_family *rt; struct genl_op_iter i; int err; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); if (!tb[CTRL_ATTR_FAMILY_ID] && !tb[CTRL_ATTR_FAMILY_NAME]) return -EINVAL; if (tb[CTRL_ATTR_FAMILY_ID]) { ctx->fam_id = nla_get_u16(tb[CTRL_ATTR_FAMILY_ID]); } else { rt = genl_family_find_byname( nla_data(tb[CTRL_ATTR_FAMILY_NAME])); if (!rt) return -ENOENT; ctx->fam_id = rt->id; } rt = genl_family_find_byid(ctx->fam_id); if (!rt) return -ENOENT; ctx->rt = rt; if (tb[CTRL_ATTR_OP]) { struct genl_split_ops doit, dump; ctx->single_op = true; ctx->op = nla_get_u32(tb[CTRL_ATTR_OP]); err = genl_get_cmd_both(ctx->op, rt, &doit, &dump); if (err) { NL_SET_BAD_ATTR(cb->extack, tb[CTRL_ATTR_OP]); return err; } if (doit.policy) { err = netlink_policy_dump_add_policy(&ctx->state, doit.policy, doit.maxattr); if (err) goto err_free_state; } if (dump.policy) { err = netlink_policy_dump_add_policy(&ctx->state, dump.policy, dump.maxattr); if (err) goto err_free_state; } if (!ctx->state) return -ENODATA; ctx->dump_map = 1; return 0; } ctx->op_iter = kmalloc(sizeof(*ctx->op_iter), GFP_KERNEL); if (!ctx->op_iter) return -ENOMEM; genl_op_iter_init(rt, ctx->op_iter); ctx->dump_map = genl_op_iter_next(ctx->op_iter); for (genl_op_iter_init(rt, &i); genl_op_iter_next(&i); ) { if (i.doit.policy) { err = netlink_policy_dump_add_policy(&ctx->state, i.doit.policy, i.doit.maxattr); if (err) goto err_free_state; } if (i.dumpit.policy) { err = netlink_policy_dump_add_policy(&ctx->state, i.dumpit.policy, i.dumpit.maxattr); if (err) goto err_free_state; } } if (!ctx->state) { err = -ENODATA; goto err_free_op_iter; } return 0; err_free_state: netlink_policy_dump_free(ctx->state); err_free_op_iter: kfree(ctx->op_iter); return err; } static void *ctrl_dumppolicy_prep(struct sk_buff *skb, struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &genl_ctrl, NLM_F_MULTI, CTRL_CMD_GETPOLICY); if (!hdr) return NULL; if (nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, ctx->fam_id)) return NULL; return hdr; } static int ctrl_dumppolicy_put_op(struct sk_buff *skb, struct netlink_callback *cb, struct genl_split_ops *doit, struct genl_split_ops *dumpit) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; struct nlattr *nest_pol, *nest_op; void *hdr; int idx; /* skip if we have nothing to show */ if (!doit->policy && !dumpit->policy) return 0; hdr = ctrl_dumppolicy_prep(skb, cb); if (!hdr) return -ENOBUFS; nest_pol = nla_nest_start(skb, CTRL_ATTR_OP_POLICY); if (!nest_pol) goto err; nest_op = nla_nest_start(skb, doit->cmd); if (!nest_op) goto err; if (doit->policy) { idx = netlink_policy_dump_get_policy_idx(ctx->state, doit->policy, doit->maxattr); if (nla_put_u32(skb, CTRL_ATTR_POLICY_DO, idx)) goto err; } if (dumpit->policy) { idx = netlink_policy_dump_get_policy_idx(ctx->state, dumpit->policy, dumpit->maxattr); if (nla_put_u32(skb, CTRL_ATTR_POLICY_DUMP, idx)) goto err; } nla_nest_end(skb, nest_op); nla_nest_end(skb, nest_pol); genlmsg_end(skb, hdr); return 0; err: genlmsg_cancel(skb, hdr); return -ENOBUFS; } static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; void *hdr; if (ctx->dump_map) { if (ctx->single_op) { struct genl_split_ops doit, dumpit; if (WARN_ON(genl_get_cmd_both(ctx->op, ctx->rt, &doit, &dumpit))) return -ENOENT; if (ctrl_dumppolicy_put_op(skb, cb, &doit, &dumpit)) return skb->len; /* done with the per-op policy index list */ ctx->dump_map = 0; } while (ctx->dump_map) { if (ctrl_dumppolicy_put_op(skb, cb, &ctx->op_iter->doit, &ctx->op_iter->dumpit)) return skb->len; ctx->dump_map = genl_op_iter_next(ctx->op_iter); } } while (netlink_policy_dump_loop(ctx->state)) { struct nlattr *nest; hdr = ctrl_dumppolicy_prep(skb, cb); if (!hdr) goto nla_put_failure; nest = nla_nest_start(skb, CTRL_ATTR_POLICY); if (!nest) goto nla_put_failure; if (netlink_policy_dump_write(skb, ctx->state)) goto nla_put_failure; nla_nest_end(skb, nest); genlmsg_end(skb, hdr); } return skb->len; nla_put_failure: genlmsg_cancel(skb, hdr); return skb->len; } static int ctrl_dumppolicy_done(struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; kfree(ctx->op_iter); netlink_policy_dump_free(ctx->state); return 0; } static const struct genl_split_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETFAMILY, .validate = GENL_DONT_VALIDATE_STRICT, .policy = ctrl_policy_family, .maxattr = ARRAY_SIZE(ctrl_policy_family) - 1, .doit = ctrl_getfamily, .flags = GENL_CMD_CAP_DO, }, { .cmd = CTRL_CMD_GETFAMILY, .validate = GENL_DONT_VALIDATE_DUMP, .policy = ctrl_policy_family, .maxattr = ARRAY_SIZE(ctrl_policy_family) - 1, .dumpit = ctrl_dumpfamily, .flags = GENL_CMD_CAP_DUMP, }, { .cmd = CTRL_CMD_GETPOLICY, .policy = ctrl_policy_policy, .maxattr = ARRAY_SIZE(ctrl_policy_policy) - 1, .start = ctrl_dumppolicy_start, .dumpit = ctrl_dumppolicy, .done = ctrl_dumppolicy_done, .flags = GENL_CMD_CAP_DUMP, }, }; static const struct genl_multicast_group genl_ctrl_groups[] = { { .name = "notify", }, }; static struct genl_family genl_ctrl __ro_after_init = { .module = THIS_MODULE, .split_ops = genl_ctrl_ops, .n_split_ops = ARRAY_SIZE(genl_ctrl_ops), .resv_start_op = CTRL_CMD_GETPOLICY + 1, .mcgrps = genl_ctrl_groups, .n_mcgrps = ARRAY_SIZE(genl_ctrl_groups), .id = GENL_ID_CTRL, .name = "nlctrl", .version = 0x2, .netnsok = true, }; static int genl_bind(struct net *net, int group) { const struct genl_family *family; unsigned int id; int ret = 0; down_read(&cb_lock); idr_for_each_entry(&genl_fam_idr, family, id) { const struct genl_multicast_group *grp; int i; if (family->n_mcgrps == 0) continue; i = group - family->mcgrp_offset; if (i < 0 || i >= family->n_mcgrps) continue; grp = &family->mcgrps[i]; if ((grp->flags & GENL_MCAST_CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) ret = -EPERM; if ((grp->flags & GENL_MCAST_CAP_SYS_ADMIN) && !ns_capable(net->user_ns, CAP_SYS_ADMIN)) ret = -EPERM; if (ret) break; if (family->bind) family->bind(i); break; } up_read(&cb_lock); return ret; } static void genl_unbind(struct net *net, int group) { const struct genl_family *family; unsigned int id; down_read(&cb_lock); idr_for_each_entry(&genl_fam_idr, family, id) { int i; if (family->n_mcgrps == 0) continue; i = group - family->mcgrp_offset; if (i < 0 || i >= family->n_mcgrps) continue; if (family->unbind) family->unbind(i); break; } up_read(&cb_lock); } static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, .bind = genl_bind, .unbind = genl_unbind, .release = genl_release, }; /* we'll bump the group number right afterwards */ net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, &cfg); if (!net->genl_sock && net_eq(net, &init_net)) panic("GENL: Cannot initialize generic netlink\n"); if (!net->genl_sock) return -ENOMEM; return 0; } static void __net_exit genl_pernet_exit(struct net *net) { netlink_kernel_release(net->genl_sock); net->genl_sock = NULL; } static struct pernet_operations genl_pernet_ops = { .init = genl_pernet_init, .exit = genl_pernet_exit, }; static int __init genl_init(void) { int err; err = genl_register_family(&genl_ctrl); if (err < 0) goto problem; err = register_pernet_subsys(&genl_pernet_ops); if (err) goto problem; return 0; problem: panic("GENL: Cannot register controller: %d\n", err); } core_initcall(genl_init); static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group) { struct sk_buff *tmp; struct net *net, *prev = NULL; bool delivered = false; int err; rcu_read_lock(); for_each_net_rcu(net) { if (prev) { tmp = skb_clone(skb, GFP_ATOMIC); if (!tmp) { err = -ENOMEM; goto error; } err = nlmsg_multicast(prev->genl_sock, tmp, portid, group, GFP_ATOMIC); if (!err) delivered = true; else if (err != -ESRCH) goto error; } prev = net; } err = nlmsg_multicast(prev->genl_sock, skb, portid, group, GFP_ATOMIC); rcu_read_unlock(); if (!err) delivered = true; else if (err != -ESRCH) return err; return delivered ? 0 : -ESRCH; error: rcu_read_unlock(); kfree_skb(skb); return err; } int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return genlmsg_mcast(skb, portid, group); } EXPORT_SYMBOL(genlmsg_multicast_allns); void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags) { struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; if (WARN_ON_ONCE(group >= family->n_mcgrps)) return; group = family->mcgrp_offset + group; nlmsg_notify(sk, skb, info->snd_portid, group, nlmsg_report(info->nlhdr), flags); } EXPORT_SYMBOL(genl_notify);
40 40 1 40 40 40 40 40 1 2 1 5 5 3 2 17 17 24 24 24 24 18 18 18 18 1 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_skbprio.c SKB Priority Queue. * * Authors: Nishanth Devarajan, <ndev2021@gmail.com> * Cody Doucette, <doucette@bu.edu> * original idea by Michel Machado, Cody Doucette, and Qiaobin Fu */ #include <linux/string.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/sch_generic.h> #include <net/inet_ecn.h> /* SKB Priority Queue * ================================= * * Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes * packets according to their skb->priority field. Under congestion, * Skbprio drops already-enqueued lower priority packets to make space * available for higher priority packets; it was conceived as a solution * for denial-of-service defenses that need to route packets with different * priorities as a mean to overcome DoS attacks. */ struct skbprio_sched_data { /* Queue state. */ struct sk_buff_head qdiscs[SKBPRIO_MAX_PRIORITY]; struct gnet_stats_queue qstats[SKBPRIO_MAX_PRIORITY]; u16 highest_prio; u16 lowest_prio; }; static u16 calc_new_high_prio(const struct skbprio_sched_data *q) { int prio; for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) { if (!skb_queue_empty(&q->qdiscs[prio])) return prio; } /* SKB queue is empty, return 0 (default highest priority setting). */ return 0; } static u16 calc_new_low_prio(const struct skbprio_sched_data *q) { int prio; for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) { if (!skb_queue_empty(&q->qdiscs[prio])) return prio; } /* SKB queue is empty, return SKBPRIO_MAX_PRIORITY - 1 * (default lowest priority setting). */ return SKBPRIO_MAX_PRIORITY - 1; } static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1; struct skbprio_sched_data *q = qdisc_priv(sch); struct sk_buff_head *qdisc; struct sk_buff_head *lp_qdisc; struct sk_buff *to_drop; u16 prio, lp; /* Obtain the priority of @skb. */ prio = min(skb->priority, max_priority); qdisc = &q->qdiscs[prio]; /* sch->limit can change under us from skbprio_change() */ if (sch->q.qlen < READ_ONCE(sch->limit)) { __skb_queue_tail(qdisc, skb); qdisc_qstats_backlog_inc(sch, skb); q->qstats[prio].backlog += qdisc_pkt_len(skb); /* Check to update highest and lowest priorities. */ if (prio > q->highest_prio) q->highest_prio = prio; if (prio < q->lowest_prio) q->lowest_prio = prio; sch->q.qlen++; return NET_XMIT_SUCCESS; } /* If this packet has the lowest priority, drop it. */ lp = q->lowest_prio; if (prio <= lp) { q->qstats[prio].drops++; q->qstats[prio].overlimits++; return qdisc_drop(skb, sch, to_free); } __skb_queue_tail(qdisc, skb); qdisc_qstats_backlog_inc(sch, skb); q->qstats[prio].backlog += qdisc_pkt_len(skb); /* Drop the packet at the tail of the lowest priority qdisc. */ lp_qdisc = &q->qdiscs[lp]; to_drop = __skb_dequeue_tail(lp_qdisc); BUG_ON(!to_drop); qdisc_qstats_backlog_dec(sch, to_drop); qdisc_drop(to_drop, sch, to_free); q->qstats[lp].backlog -= qdisc_pkt_len(to_drop); q->qstats[lp].drops++; q->qstats[lp].overlimits++; /* Check to update highest and lowest priorities. */ if (skb_queue_empty(lp_qdisc)) { if (q->lowest_prio == q->highest_prio) { q->lowest_prio = prio; q->highest_prio = prio; } else { q->lowest_prio = calc_new_low_prio(q); } } if (prio > q->highest_prio) q->highest_prio = prio; return NET_XMIT_CN; } static struct sk_buff *skbprio_dequeue(struct Qdisc *sch) { struct skbprio_sched_data *q = qdisc_priv(sch); struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio]; struct sk_buff *skb = __skb_dequeue(hpq); if (unlikely(!skb)) return NULL; sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); q->qstats[q->highest_prio].backlog -= qdisc_pkt_len(skb); /* Update highest priority field. */ if (skb_queue_empty(hpq)) { if (q->lowest_prio == q->highest_prio) { q->highest_prio = 0; q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; } else { q->highest_prio = calc_new_high_prio(q); } } return skb; } static int skbprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct tc_skbprio_qopt *ctl = nla_data(opt); if (opt->nla_len != nla_attr_size(sizeof(*ctl))) return -EINVAL; WRITE_ONCE(sch->limit, ctl->limit); return 0; } static int skbprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct skbprio_sched_data *q = qdisc_priv(sch); int prio; /* Initialise all queues, one for each possible priority. */ for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) __skb_queue_head_init(&q->qdiscs[prio]); memset(&q->qstats, 0, sizeof(q->qstats)); q->highest_prio = 0; q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; sch->limit = 64; if (!opt) return 0; return skbprio_change(sch, opt, extack); } static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tc_skbprio_qopt opt; opt.limit = READ_ONCE(sch->limit); if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) return -1; return skb->len; } static void skbprio_reset(struct Qdisc *sch) { struct skbprio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) __skb_queue_purge(&q->qdiscs[prio]); memset(&q->qstats, 0, sizeof(q->qstats)); q->highest_prio = 0; q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; } static void skbprio_destroy(struct Qdisc *sch) { struct skbprio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) __skb_queue_purge(&q->qdiscs[prio]); } static struct Qdisc *skbprio_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; } static unsigned long skbprio_find(struct Qdisc *sch, u32 classid) { return 0; } static int skbprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { tcm->tcm_handle |= TC_H_MIN(cl); return 0; } static int skbprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct skbprio_sched_data *q = qdisc_priv(sch); if (gnet_stats_copy_queue(d, NULL, &q->qstats[cl - 1], q->qstats[cl - 1].qlen) < 0) return -1; return 0; } static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { unsigned int i; if (arg->stop) return; for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) { if (!tc_qdisc_stats_dump(sch, i + 1, arg)) break; } } static const struct Qdisc_class_ops skbprio_class_ops = { .leaf = skbprio_leaf, .find = skbprio_find, .dump = skbprio_dump_class, .dump_stats = skbprio_dump_class_stats, .walk = skbprio_walk, }; static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = { .cl_ops = &skbprio_class_ops, .id = "skbprio", .priv_size = sizeof(struct skbprio_sched_data), .enqueue = skbprio_enqueue, .dequeue = skbprio_dequeue, .peek = qdisc_peek_dequeued, .init = skbprio_init, .reset = skbprio_reset, .change = skbprio_change, .dump = skbprio_dump, .destroy = skbprio_destroy, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("skbprio"); static int __init skbprio_module_init(void) { return register_qdisc(&skbprio_qdisc_ops); } static void __exit skbprio_module_exit(void) { unregister_qdisc(&skbprio_qdisc_ops); } module_init(skbprio_module_init) module_exit(skbprio_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SKB priority based scheduling qdisc");
438 515 1 4 1 1 2 7 502 23 34 220 217 1 2 3 2 2 215 12 86 195 3 1 3 187 10 189 190 186 188 1 1 178 2 175 8 1 2 8 2 177 4 1 3 2 172 3 174 170 166 169 166 164 164 163 162 160 157 152 151 145 146 141 138 2 4 136 134 133 131 202 2 139 62 638 703 702 1 703 118 2 1 2 2 3 1 1 705 16 702 2 2 705 703 705 2 3 4 6 2 8 3 5 3 3 3 5 2 1 2 1 686 104 85 34 3 3 13 5 10 1 13 2 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 // SPDX-License-Identifier: GPL-2.0-or-later /* * drivers/net/bond/bond_netlink.c - Netlink interface for bonding * Copyright (c) 2013 Jiri Pirko <jiri@resnulli.us> * Copyright (c) 2013 Scott Feldman <sfeldma@cumulusnetworks.com> */ #include <linux/module.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_link.h> #include <linux/if_ether.h> #include <net/netlink.h> #include <net/rtnetlink.h> #include <net/bonding.h> #include <net/ipv6.h> static size_t bond_get_slave_size(const struct net_device *bond_dev, const struct net_device *slave_dev) { return nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_STATE */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_MII_STATUS */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_SLAVE_LINK_FAILURE_COUNT */ nla_total_size(MAX_ADDR_LEN) + /* IFLA_BOND_SLAVE_PERM_HWADDR */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_QUEUE_ID */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_AGGREGATOR_ID */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE */ nla_total_size(sizeof(s32)) + /* IFLA_BOND_SLAVE_PRIO */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_ACTOR_PORT_PRIO */ 0; } static int bond_fill_slave_info(struct sk_buff *skb, const struct net_device *bond_dev, const struct net_device *slave_dev) { struct slave *slave = bond_slave_get_rtnl(slave_dev); if (nla_put_u8(skb, IFLA_BOND_SLAVE_STATE, bond_slave_state(slave))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_SLAVE_MII_STATUS, slave->link)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_SLAVE_LINK_FAILURE_COUNT, slave->link_failure_count)) goto nla_put_failure; if (nla_put(skb, IFLA_BOND_SLAVE_PERM_HWADDR, slave_dev->addr_len, slave->perm_hwaddr)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_SLAVE_QUEUE_ID, READ_ONCE(slave->queue_id))) goto nla_put_failure; if (nla_put_s32(skb, IFLA_BOND_SLAVE_PRIO, slave->prio)) goto nla_put_failure; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { const struct aggregator *agg; const struct port *ad_port; ad_port = &SLAVE_AD_INFO(slave)->port; agg = SLAVE_AD_INFO(slave)->port.aggregator; if (agg) { if (nla_put_u16(skb, IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, agg->aggregator_identifier)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, ad_port->actor_oper_port_state)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, ad_port->partner_oper.port_state)) goto nla_put_failure; } if (nla_put_u16(skb, IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, SLAVE_AD_INFO(slave)->port_priority)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } /* Limit the max delay range to 300s */ static const struct netlink_range_validation delay_range = { .max = 300000, }; static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { [IFLA_BOND_MODE] = { .type = NLA_U8 }, [IFLA_BOND_ACTIVE_SLAVE] = { .type = NLA_U32 }, [IFLA_BOND_MIIMON] = { .type = NLA_U32 }, [IFLA_BOND_UPDELAY] = { .type = NLA_U32 }, [IFLA_BOND_DOWNDELAY] = { .type = NLA_U32 }, [IFLA_BOND_USE_CARRIER] = { .type = NLA_U8 }, [IFLA_BOND_ARP_INTERVAL] = { .type = NLA_U32 }, [IFLA_BOND_ARP_IP_TARGET] = { .type = NLA_NESTED }, [IFLA_BOND_ARP_VALIDATE] = { .type = NLA_U32 }, [IFLA_BOND_ARP_ALL_TARGETS] = { .type = NLA_U32 }, [IFLA_BOND_PRIMARY] = { .type = NLA_U32 }, [IFLA_BOND_PRIMARY_RESELECT] = { .type = NLA_U8 }, [IFLA_BOND_FAIL_OVER_MAC] = { .type = NLA_U8 }, [IFLA_BOND_XMIT_HASH_POLICY] = { .type = NLA_U8 }, [IFLA_BOND_RESEND_IGMP] = { .type = NLA_U32 }, [IFLA_BOND_NUM_PEER_NOTIF] = { .type = NLA_U8 }, [IFLA_BOND_ALL_SLAVES_ACTIVE] = { .type = NLA_U8 }, [IFLA_BOND_MIN_LINKS] = { .type = NLA_U32 }, [IFLA_BOND_LP_INTERVAL] = { .type = NLA_U32 }, [IFLA_BOND_PACKETS_PER_SLAVE] = { .type = NLA_U32 }, [IFLA_BOND_AD_LACP_ACTIVE] = { .type = NLA_U8 }, [IFLA_BOND_AD_LACP_RATE] = { .type = NLA_U8 }, [IFLA_BOND_AD_SELECT] = { .type = NLA_U8 }, [IFLA_BOND_AD_INFO] = { .type = NLA_NESTED }, [IFLA_BOND_AD_ACTOR_SYS_PRIO] = { .type = NLA_U16 }, [IFLA_BOND_AD_USER_PORT_KEY] = { .type = NLA_U16 }, [IFLA_BOND_AD_ACTOR_SYSTEM] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IFLA_BOND_TLB_DYNAMIC_LB] = { .type = NLA_U8 }, [IFLA_BOND_PEER_NOTIF_DELAY] = NLA_POLICY_FULL_RANGE(NLA_U32, &delay_range), [IFLA_BOND_MISSED_MAX] = { .type = NLA_U8 }, [IFLA_BOND_NS_IP6_TARGET] = { .type = NLA_NESTED }, [IFLA_BOND_COUPLED_CONTROL] = { .type = NLA_U8 }, [IFLA_BOND_BROADCAST_NEIGH] = { .type = NLA_U8 }, }; static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = { [IFLA_BOND_SLAVE_QUEUE_ID] = { .type = NLA_U16 }, [IFLA_BOND_SLAVE_PRIO] = { .type = NLA_S32 }, [IFLA_BOND_SLAVE_ACTOR_PORT_PRIO] = { .type = NLA_U16 }, }; static int bond_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } return 0; } static int bond_slave_changelink(struct net_device *bond_dev, struct net_device *slave_dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(bond_dev); struct bond_opt_value newval; int err; if (!data) return 0; if (data[IFLA_BOND_SLAVE_QUEUE_ID]) { u16 queue_id = nla_get_u16(data[IFLA_BOND_SLAVE_QUEUE_ID]); char queue_id_str[IFNAMSIZ + 7]; /* queue_id option setting expects slave_name:queue_id */ snprintf(queue_id_str, sizeof(queue_id_str), "%s:%u\n", slave_dev->name, queue_id); bond_opt_initstr(&newval, queue_id_str); err = __bond_opt_set(bond, BOND_OPT_QUEUE_ID, &newval, data[IFLA_BOND_SLAVE_QUEUE_ID], extack); if (err) return err; } if (data[IFLA_BOND_SLAVE_PRIO]) { int prio = nla_get_s32(data[IFLA_BOND_SLAVE_PRIO]); bond_opt_slave_initval(&newval, &slave_dev, prio); err = __bond_opt_set(bond, BOND_OPT_PRIO, &newval, data[IFLA_BOND_SLAVE_PRIO], extack); if (err) return err; } if (data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO]) { u16 ad_prio = nla_get_u16(data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO]); bond_opt_slave_initval(&newval, &slave_dev, ad_prio); err = __bond_opt_set(bond, BOND_OPT_ACTOR_PORT_PRIO, &newval, data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO], extack); if (err) return err; } return 0; } static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(bond_dev); struct bond_opt_value newval; int miimon = 0; int err; if (!data) return 0; if (data[IFLA_BOND_MODE]) { int mode = nla_get_u8(data[IFLA_BOND_MODE]); bond_opt_initval(&newval, mode); err = __bond_opt_set(bond, BOND_OPT_MODE, &newval, data[IFLA_BOND_MODE], extack); if (err) return err; } if (data[IFLA_BOND_ACTIVE_SLAVE]) { int ifindex = nla_get_u32(data[IFLA_BOND_ACTIVE_SLAVE]); struct net_device *slave_dev; char *active_slave = ""; if (ifindex != 0) { slave_dev = __dev_get_by_index(dev_net(bond_dev), ifindex); if (!slave_dev) return -ENODEV; active_slave = slave_dev->name; } bond_opt_initstr(&newval, active_slave); err = __bond_opt_set(bond, BOND_OPT_ACTIVE_SLAVE, &newval, data[IFLA_BOND_ACTIVE_SLAVE], extack); if (err) return err; } if (data[IFLA_BOND_MIIMON]) { miimon = nla_get_u32(data[IFLA_BOND_MIIMON]); bond_opt_initval(&newval, miimon); err = __bond_opt_set(bond, BOND_OPT_MIIMON, &newval, data[IFLA_BOND_MIIMON], extack); if (err) return err; } if (data[IFLA_BOND_UPDELAY]) { int updelay = nla_get_u32(data[IFLA_BOND_UPDELAY]); bond_opt_initval(&newval, updelay); err = __bond_opt_set(bond, BOND_OPT_UPDELAY, &newval, data[IFLA_BOND_UPDELAY], extack); if (err) return err; } if (data[IFLA_BOND_DOWNDELAY]) { int downdelay = nla_get_u32(data[IFLA_BOND_DOWNDELAY]); bond_opt_initval(&newval, downdelay); err = __bond_opt_set(bond, BOND_OPT_DOWNDELAY, &newval, data[IFLA_BOND_DOWNDELAY], extack); if (err) return err; } if (data[IFLA_BOND_PEER_NOTIF_DELAY]) { int delay = nla_get_u32(data[IFLA_BOND_PEER_NOTIF_DELAY]); bond_opt_initval(&newval, delay); err = __bond_opt_set(bond, BOND_OPT_PEER_NOTIF_DELAY, &newval, data[IFLA_BOND_PEER_NOTIF_DELAY], extack); if (err) return err; } if (data[IFLA_BOND_USE_CARRIER]) { if (nla_get_u8(data[IFLA_BOND_USE_CARRIER]) != 1) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_BOND_USE_CARRIER], "option obsolete, use_carrier cannot be disabled"); return -EINVAL; } } if (data[IFLA_BOND_ARP_INTERVAL]) { int arp_interval = nla_get_u32(data[IFLA_BOND_ARP_INTERVAL]); if (arp_interval && miimon) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_BOND_ARP_INTERVAL], "ARP monitoring cannot be used with MII monitoring"); return -EINVAL; } bond_opt_initval(&newval, arp_interval); err = __bond_opt_set(bond, BOND_OPT_ARP_INTERVAL, &newval, data[IFLA_BOND_ARP_INTERVAL], extack); if (err) return err; } if (data[IFLA_BOND_ARP_IP_TARGET]) { struct nlattr *attr; int i = 0, rem; bond_option_arp_ip_targets_clear(bond); nla_for_each_nested(attr, data[IFLA_BOND_ARP_IP_TARGET], rem) { __be32 target; if (nla_len(attr) < sizeof(target)) return -EINVAL; target = nla_get_be32(attr); bond_opt_initval(&newval, (__force u64)target); err = __bond_opt_set(bond, BOND_OPT_ARP_TARGETS, &newval, data[IFLA_BOND_ARP_IP_TARGET], extack); if (err) break; i++; } if (i == 0 && bond->params.arp_interval) netdev_warn(bond->dev, "Removing last arp target with arp_interval on\n"); if (err) return err; } #if IS_ENABLED(CONFIG_IPV6) if (data[IFLA_BOND_NS_IP6_TARGET]) { struct nlattr *attr; int i = 0, rem; bond_option_ns_ip6_targets_clear(bond); nla_for_each_nested(attr, data[IFLA_BOND_NS_IP6_TARGET], rem) { struct in6_addr addr6; if (nla_len(attr) < sizeof(addr6)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address"); return -EINVAL; } addr6 = nla_get_in6_addr(attr); bond_opt_initextra(&newval, &addr6, sizeof(addr6)); err = __bond_opt_set(bond, BOND_OPT_NS_TARGETS, &newval, data[IFLA_BOND_NS_IP6_TARGET], extack); if (err) break; i++; } if (i == 0 && bond->params.arp_interval) netdev_warn(bond->dev, "Removing last ns target with arp_interval on\n"); if (err) return err; } #endif if (data[IFLA_BOND_ARP_VALIDATE]) { int arp_validate = nla_get_u32(data[IFLA_BOND_ARP_VALIDATE]); if (arp_validate && miimon) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_BOND_ARP_INTERVAL], "ARP validating cannot be used with MII monitoring"); return -EINVAL; } bond_opt_initval(&newval, arp_validate); err = __bond_opt_set(bond, BOND_OPT_ARP_VALIDATE, &newval, data[IFLA_BOND_ARP_VALIDATE], extack); if (err) return err; } if (data[IFLA_BOND_ARP_ALL_TARGETS]) { int arp_all_targets = nla_get_u32(data[IFLA_BOND_ARP_ALL_TARGETS]); bond_opt_initval(&newval, arp_all_targets); err = __bond_opt_set(bond, BOND_OPT_ARP_ALL_TARGETS, &newval, data[IFLA_BOND_ARP_ALL_TARGETS], extack); if (err) return err; } if (data[IFLA_BOND_PRIMARY]) { int ifindex = nla_get_u32(data[IFLA_BOND_PRIMARY]); struct net_device *dev; char *primary = ""; dev = __dev_get_by_index(dev_net(bond_dev), ifindex); if (dev) primary = dev->name; bond_opt_initstr(&newval, primary); err = __bond_opt_set(bond, BOND_OPT_PRIMARY, &newval, data[IFLA_BOND_PRIMARY], extack); if (err) return err; } if (data[IFLA_BOND_PRIMARY_RESELECT]) { int primary_reselect = nla_get_u8(data[IFLA_BOND_PRIMARY_RESELECT]); bond_opt_initval(&newval, primary_reselect); err = __bond_opt_set(bond, BOND_OPT_PRIMARY_RESELECT, &newval, data[IFLA_BOND_PRIMARY_RESELECT], extack); if (err) return err; } if (data[IFLA_BOND_FAIL_OVER_MAC]) { int fail_over_mac = nla_get_u8(data[IFLA_BOND_FAIL_OVER_MAC]); bond_opt_initval(&newval, fail_over_mac); err = __bond_opt_set(bond, BOND_OPT_FAIL_OVER_MAC, &newval, data[IFLA_BOND_FAIL_OVER_MAC], extack); if (err) return err; } if (data[IFLA_BOND_XMIT_HASH_POLICY]) { int xmit_hash_policy = nla_get_u8(data[IFLA_BOND_XMIT_HASH_POLICY]); bond_opt_initval(&newval, xmit_hash_policy); err = __bond_opt_set(bond, BOND_OPT_XMIT_HASH, &newval, data[IFLA_BOND_XMIT_HASH_POLICY], extack); if (err) return err; } if (data[IFLA_BOND_RESEND_IGMP]) { int resend_igmp = nla_get_u32(data[IFLA_BOND_RESEND_IGMP]); bond_opt_initval(&newval, resend_igmp); err = __bond_opt_set(bond, BOND_OPT_RESEND_IGMP, &newval, data[IFLA_BOND_RESEND_IGMP], extack); if (err) return err; } if (data[IFLA_BOND_NUM_PEER_NOTIF]) { int num_peer_notif = nla_get_u8(data[IFLA_BOND_NUM_PEER_NOTIF]); bond_opt_initval(&newval, num_peer_notif); err = __bond_opt_set(bond, BOND_OPT_NUM_PEER_NOTIF, &newval, data[IFLA_BOND_NUM_PEER_NOTIF], extack); if (err) return err; } if (data[IFLA_BOND_ALL_SLAVES_ACTIVE]) { int all_slaves_active = nla_get_u8(data[IFLA_BOND_ALL_SLAVES_ACTIVE]); bond_opt_initval(&newval, all_slaves_active); err = __bond_opt_set(bond, BOND_OPT_ALL_SLAVES_ACTIVE, &newval, data[IFLA_BOND_ALL_SLAVES_ACTIVE], extack); if (err) return err; } if (data[IFLA_BOND_MIN_LINKS]) { int min_links = nla_get_u32(data[IFLA_BOND_MIN_LINKS]); bond_opt_initval(&newval, min_links); err = __bond_opt_set(bond, BOND_OPT_MINLINKS, &newval, data[IFLA_BOND_MIN_LINKS], extack); if (err) return err; } if (data[IFLA_BOND_LP_INTERVAL]) { int lp_interval = nla_get_u32(data[IFLA_BOND_LP_INTERVAL]); bond_opt_initval(&newval, lp_interval); err = __bond_opt_set(bond, BOND_OPT_LP_INTERVAL, &newval, data[IFLA_BOND_LP_INTERVAL], extack); if (err) return err; } if (data[IFLA_BOND_PACKETS_PER_SLAVE]) { int packets_per_slave = nla_get_u32(data[IFLA_BOND_PACKETS_PER_SLAVE]); bond_opt_initval(&newval, packets_per_slave); err = __bond_opt_set(bond, BOND_OPT_PACKETS_PER_SLAVE, &newval, data[IFLA_BOND_PACKETS_PER_SLAVE], extack); if (err) return err; } if (data[IFLA_BOND_AD_LACP_ACTIVE]) { int lacp_active = nla_get_u8(data[IFLA_BOND_AD_LACP_ACTIVE]); bond_opt_initval(&newval, lacp_active); err = __bond_opt_set(bond, BOND_OPT_LACP_ACTIVE, &newval, data[IFLA_BOND_AD_LACP_ACTIVE], extack); if (err) return err; } if (data[IFLA_BOND_AD_LACP_RATE]) { int lacp_rate = nla_get_u8(data[IFLA_BOND_AD_LACP_RATE]); bond_opt_initval(&newval, lacp_rate); err = __bond_opt_set(bond, BOND_OPT_LACP_RATE, &newval, data[IFLA_BOND_AD_LACP_RATE], extack); if (err) return err; } if (data[IFLA_BOND_AD_SELECT]) { int ad_select = nla_get_u8(data[IFLA_BOND_AD_SELECT]); bond_opt_initval(&newval, ad_select); err = __bond_opt_set(bond, BOND_OPT_AD_SELECT, &newval, data[IFLA_BOND_AD_SELECT], extack); if (err) return err; } if (data[IFLA_BOND_AD_ACTOR_SYS_PRIO]) { int actor_sys_prio = nla_get_u16(data[IFLA_BOND_AD_ACTOR_SYS_PRIO]); bond_opt_initval(&newval, actor_sys_prio); err = __bond_opt_set(bond, BOND_OPT_AD_ACTOR_SYS_PRIO, &newval, data[IFLA_BOND_AD_ACTOR_SYS_PRIO], extack); if (err) return err; } if (data[IFLA_BOND_AD_USER_PORT_KEY]) { int port_key = nla_get_u16(data[IFLA_BOND_AD_USER_PORT_KEY]); bond_opt_initval(&newval, port_key); err = __bond_opt_set(bond, BOND_OPT_AD_USER_PORT_KEY, &newval, data[IFLA_BOND_AD_USER_PORT_KEY], extack); if (err) return err; } if (data[IFLA_BOND_AD_ACTOR_SYSTEM]) { if (nla_len(data[IFLA_BOND_AD_ACTOR_SYSTEM]) != ETH_ALEN) return -EINVAL; bond_opt_initval(&newval, nla_get_u64(data[IFLA_BOND_AD_ACTOR_SYSTEM])); err = __bond_opt_set(bond, BOND_OPT_AD_ACTOR_SYSTEM, &newval, data[IFLA_BOND_AD_ACTOR_SYSTEM], extack); if (err) return err; } if (data[IFLA_BOND_TLB_DYNAMIC_LB]) { int dynamic_lb = nla_get_u8(data[IFLA_BOND_TLB_DYNAMIC_LB]); bond_opt_initval(&newval, dynamic_lb); err = __bond_opt_set(bond, BOND_OPT_TLB_DYNAMIC_LB, &newval, data[IFLA_BOND_TLB_DYNAMIC_LB], extack); if (err) return err; } if (data[IFLA_BOND_MISSED_MAX]) { int missed_max = nla_get_u8(data[IFLA_BOND_MISSED_MAX]); bond_opt_initval(&newval, missed_max); err = __bond_opt_set(bond, BOND_OPT_MISSED_MAX, &newval, data[IFLA_BOND_MISSED_MAX], extack); if (err) return err; } if (data[IFLA_BOND_COUPLED_CONTROL]) { int coupled_control = nla_get_u8(data[IFLA_BOND_COUPLED_CONTROL]); bond_opt_initval(&newval, coupled_control); err = __bond_opt_set(bond, BOND_OPT_COUPLED_CONTROL, &newval, data[IFLA_BOND_COUPLED_CONTROL], extack); if (err) return err; } if (data[IFLA_BOND_BROADCAST_NEIGH]) { int broadcast_neigh = nla_get_u8(data[IFLA_BOND_BROADCAST_NEIGH]); bond_opt_initval(&newval, broadcast_neigh); err = __bond_opt_set(bond, BOND_OPT_BROADCAST_NEIGH, &newval, data[IFLA_BOND_BROADCAST_NEIGH], extack); if (err) return err; } return 0; } static int bond_newlink(struct net_device *bond_dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(bond_dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; int err; err = register_netdevice(bond_dev); if (err) return err; netif_carrier_off(bond_dev); bond_work_init_all(bond); err = bond_changelink(bond_dev, tb, data, extack); if (err) { bond_work_cancel_all(bond); unregister_netdevice(bond_dev); } return err; } static size_t bond_get_size(const struct net_device *bond_dev) { return nla_total_size(sizeof(u8)) + /* IFLA_BOND_MODE */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_ACTIVE_SLAVE */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_MIIMON */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_UPDELAY */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_DOWNDELAY */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_USE_CARRIER */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_ARP_INTERVAL */ /* IFLA_BOND_ARP_IP_TARGET */ nla_total_size(sizeof(struct nlattr)) + nla_total_size(sizeof(u32)) * BOND_MAX_ARP_TARGETS + nla_total_size(sizeof(u32)) + /* IFLA_BOND_ARP_VALIDATE */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_ARP_ALL_TARGETS */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_PRIMARY */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_PRIMARY_RESELECT */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_FAIL_OVER_MAC */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_XMIT_HASH_POLICY */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_RESEND_IGMP */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_NUM_PEER_NOTIF */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_ALL_SLAVES_ACTIVE */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_MIN_LINKS */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_LP_INTERVAL */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_PACKETS_PER_SLAVE */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_AD_LACP_ACTIVE */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_AD_LACP_RATE */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_AD_SELECT */ nla_total_size(sizeof(struct nlattr)) + /* IFLA_BOND_AD_INFO */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_INFO_AGGREGATOR */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_INFO_NUM_PORTS */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_INFO_ACTOR_KEY */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_INFO_PARTNER_KEY*/ nla_total_size(ETH_ALEN) + /* IFLA_BOND_AD_INFO_PARTNER_MAC*/ nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_ACTOR_SYS_PRIO */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_USER_PORT_KEY */ nla_total_size(ETH_ALEN) + /* IFLA_BOND_AD_ACTOR_SYSTEM */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_TLB_DYNAMIC_LB */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_PEER_NOTIF_DELAY */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_MISSED_MAX */ /* IFLA_BOND_NS_IP6_TARGET */ nla_total_size(sizeof(struct nlattr)) + nla_total_size(sizeof(struct in6_addr)) * BOND_MAX_NS_TARGETS + nla_total_size(sizeof(u8)) + /* IFLA_BOND_COUPLED_CONTROL */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_BROADCAST_NEIGH */ 0; } static int bond_option_active_slave_get_ifindex(struct bonding *bond) { const struct net_device *slave; int ifindex; rcu_read_lock(); slave = bond_option_active_slave_get_rcu(bond); ifindex = slave ? slave->ifindex : 0; rcu_read_unlock(); return ifindex; } static int bond_fill_info(struct sk_buff *skb, const struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); unsigned int packets_per_slave; int ifindex, i, targets_added; struct nlattr *targets; struct slave *primary; if (nla_put_u8(skb, IFLA_BOND_MODE, BOND_MODE(bond))) goto nla_put_failure; ifindex = bond_option_active_slave_get_ifindex(bond); if (ifindex && nla_put_u32(skb, IFLA_BOND_ACTIVE_SLAVE, ifindex)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_MIIMON, bond->params.miimon)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_UPDELAY, bond->params.updelay * bond->params.miimon)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_DOWNDELAY, bond->params.downdelay * bond->params.miimon)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_PEER_NOTIF_DELAY, bond->params.peer_notif_delay * bond->params.miimon)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_USE_CARRIER, 1)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_ARP_INTERVAL, bond->params.arp_interval)) goto nla_put_failure; targets = nla_nest_start_noflag(skb, IFLA_BOND_ARP_IP_TARGET); if (!targets) goto nla_put_failure; targets_added = 0; for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) { if (bond->params.arp_targets[i]) { if (nla_put_be32(skb, i, bond->params.arp_targets[i])) goto nla_put_failure; targets_added = 1; } } if (targets_added) nla_nest_end(skb, targets); else nla_nest_cancel(skb, targets); if (nla_put_u32(skb, IFLA_BOND_ARP_VALIDATE, bond->params.arp_validate)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_ARP_ALL_TARGETS, bond->params.arp_all_targets)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) targets = nla_nest_start(skb, IFLA_BOND_NS_IP6_TARGET); if (!targets) goto nla_put_failure; targets_added = 0; for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { if (!ipv6_addr_any(&bond->params.ns_targets[i])) { if (nla_put_in6_addr(skb, i, &bond->params.ns_targets[i])) goto nla_put_failure; targets_added = 1; } } if (targets_added) nla_nest_end(skb, targets); else nla_nest_cancel(skb, targets); #endif primary = rtnl_dereference(bond->primary_slave); if (primary && nla_put_u32(skb, IFLA_BOND_PRIMARY, primary->dev->ifindex)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_PRIMARY_RESELECT, bond->params.primary_reselect)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_FAIL_OVER_MAC, bond->params.fail_over_mac)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_XMIT_HASH_POLICY, bond->params.xmit_policy)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_RESEND_IGMP, bond->params.resend_igmp)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_NUM_PEER_NOTIF, bond->params.num_peer_notif)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_ALL_SLAVES_ACTIVE, bond->params.all_slaves_active)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_MIN_LINKS, bond->params.min_links)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_LP_INTERVAL, bond->params.lp_interval)) goto nla_put_failure; packets_per_slave = bond->params.packets_per_slave; if (nla_put_u32(skb, IFLA_BOND_PACKETS_PER_SLAVE, packets_per_slave)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_AD_LACP_ACTIVE, bond->params.lacp_active)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_AD_LACP_RATE, bond->params.lacp_fast)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_AD_SELECT, bond->params.ad_select)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_TLB_DYNAMIC_LB, bond->params.tlb_dynamic_lb)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_MISSED_MAX, bond->params.missed_max)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_COUPLED_CONTROL, bond->params.coupled_control)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_BROADCAST_NEIGH, bond->params.broadcast_neighbor)) goto nla_put_failure; if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info info; if (capable(CAP_NET_ADMIN)) { if (nla_put_u16(skb, IFLA_BOND_AD_ACTOR_SYS_PRIO, bond->params.ad_actor_sys_prio)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_AD_USER_PORT_KEY, bond->params.ad_user_port_key)) goto nla_put_failure; if (nla_put(skb, IFLA_BOND_AD_ACTOR_SYSTEM, ETH_ALEN, &bond->params.ad_actor_system)) goto nla_put_failure; } if (!bond_3ad_get_active_agg_info(bond, &info)) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, IFLA_BOND_AD_INFO); if (!nest) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_AD_INFO_AGGREGATOR, info.aggregator_id)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_AD_INFO_NUM_PORTS, info.ports)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_AD_INFO_ACTOR_KEY, info.actor_key)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_AD_INFO_PARTNER_KEY, info.partner_key)) goto nla_put_failure; if (nla_put(skb, IFLA_BOND_AD_INFO_PARTNER_MAC, sizeof(info.partner_system), &info.partner_system)) goto nla_put_failure; nla_nest_end(skb, nest); } } return 0; nla_put_failure: return -EMSGSIZE; } static size_t bond_get_linkxstats_size(const struct net_device *dev, int attr) { switch (attr) { case IFLA_STATS_LINK_XSTATS: case IFLA_STATS_LINK_XSTATS_SLAVE: break; default: return 0; } return bond_3ad_stats_size() + nla_total_size(0); } static int bond_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, int *prividx, int attr) { struct nlattr *nla __maybe_unused; struct slave *slave = NULL; struct nlattr *nest, *nest2; struct bonding *bond; switch (attr) { case IFLA_STATS_LINK_XSTATS: bond = netdev_priv(dev); break; case IFLA_STATS_LINK_XSTATS_SLAVE: slave = bond_slave_get_rtnl(dev); if (!slave) return 0; bond = slave->bond; break; default: return -EINVAL; } nest = nla_nest_start_noflag(skb, LINK_XSTATS_TYPE_BOND); if (!nest) return -EMSGSIZE; if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct bond_3ad_stats *stats; if (slave) stats = &SLAVE_AD_INFO(slave)->stats; else stats = &BOND_AD_INFO(bond).stats; nest2 = nla_nest_start_noflag(skb, BOND_XSTATS_3AD); if (!nest2) { nla_nest_end(skb, nest); return -EMSGSIZE; } if (bond_3ad_stats_fill(skb, stats)) { nla_nest_cancel(skb, nest2); nla_nest_end(skb, nest); return -EMSGSIZE; } nla_nest_end(skb, nest2); } nla_nest_end(skb, nest); return 0; } struct rtnl_link_ops bond_link_ops __read_mostly = { .kind = "bond", .priv_size = sizeof(struct bonding), .setup = bond_setup, .maxtype = IFLA_BOND_MAX, .policy = bond_policy, .validate = bond_validate, .newlink = bond_newlink, .changelink = bond_changelink, .get_size = bond_get_size, .fill_info = bond_fill_info, .get_num_tx_queues = bond_get_num_tx_queues, .get_num_rx_queues = bond_get_num_tx_queues, /* Use the same number as for TX queues */ .fill_linkxstats = bond_fill_linkxstats, .get_linkxstats_size = bond_get_linkxstats_size, .slave_maxtype = IFLA_BOND_SLAVE_MAX, .slave_policy = bond_slave_policy, .slave_changelink = bond_slave_changelink, .get_slave_size = bond_get_slave_size, .fill_slave_info = bond_fill_slave_info, }; int __init bond_netlink_init(void) { return rtnl_link_register(&bond_link_ops); } void bond_netlink_fini(void) { rtnl_link_unregister(&bond_link_ops); } MODULE_ALIAS_RTNL_LINK("bond");
453 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_DRIVER_H #define _LINUX_TTY_DRIVER_H #include <linux/export.h> #include <linux/fs.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/cdev.h> #include <linux/uaccess.h> #include <linux/termios.h> #include <linux/seq_file.h> struct tty_struct; struct tty_driver; struct serial_icounter_struct; struct serial_struct; /** * enum tty_driver_flag -- TTY Driver Flags * * These are flags passed to tty_alloc_driver(). * * @TTY_DRIVER_INSTALLED: * Whether this driver was succesfully installed. This is a tty internal * flag. Do not touch. * * @TTY_DRIVER_RESET_TERMIOS: * Requests the tty layer to reset the termios setting when the last * process has closed the device. Used for PTYs, in particular. * * @TTY_DRIVER_REAL_RAW: * Indicates that the driver will guarantee not to set any special * character handling flags if this is set for the tty: * * ``(IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || !INPCK)`` * * That is, if there is no reason for the driver to * send notifications of parity and break characters up to the line * driver, it won't do so. This allows the line driver to optimize for * this case if this flag is set. (Note that there is also a promise, if * the above case is true, not to signal overruns, either.) * * @TTY_DRIVER_DYNAMIC_DEV: * The individual tty devices need to be registered with a call to * tty_register_device() when the device is found in the system and * unregistered with a call to tty_unregister_device() so the devices will * be show up properly in sysfs. If not set, all &tty_driver.num entries * will be created by the tty core in sysfs when tty_register_driver() is * called. This is to be used by drivers that have tty devices that can * appear and disappear while the main tty driver is registered with the * tty core. * * @TTY_DRIVER_DEVPTS_MEM: * Don't use the standard arrays (&tty_driver.ttys and * &tty_driver.termios), instead use dynamic memory keyed through the * devpts filesystem. This is only applicable to the PTY driver. * * @TTY_DRIVER_HARDWARE_BREAK: * Hardware handles break signals. Pass the requested timeout to the * &tty_operations.break_ctl instead of using a simple on/off interface. * * @TTY_DRIVER_DYNAMIC_ALLOC: * Do not allocate structures which are needed per line for this driver * (&tty_driver.ports) as it would waste memory. The driver will take * care. This is only applicable to the PTY driver. * * @TTY_DRIVER_UNNUMBERED_NODE: * Do not create numbered ``/dev`` nodes. For example, create * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a * driver for a single tty device is being allocated. */ enum tty_driver_flag { TTY_DRIVER_INSTALLED = BIT(0), TTY_DRIVER_RESET_TERMIOS = BIT(1), TTY_DRIVER_REAL_RAW = BIT(2), TTY_DRIVER_DYNAMIC_DEV = BIT(3), TTY_DRIVER_DEVPTS_MEM = BIT(4), TTY_DRIVER_HARDWARE_BREAK = BIT(5), TTY_DRIVER_DYNAMIC_ALLOC = BIT(6), TTY_DRIVER_UNNUMBERED_NODE = BIT(7), }; enum tty_driver_type { TTY_DRIVER_TYPE_SYSTEM, TTY_DRIVER_TYPE_CONSOLE, TTY_DRIVER_TYPE_SERIAL, TTY_DRIVER_TYPE_PTY, TTY_DRIVER_TYPE_SCC, TTY_DRIVER_TYPE_SYSCONS, }; enum tty_driver_subtype { SYSTEM_TYPE_TTY = 1, SYSTEM_TYPE_CONSOLE, SYSTEM_TYPE_SYSCONS, SYSTEM_TYPE_SYSPTMX, PTY_TYPE_MASTER = 1, PTY_TYPE_SLAVE, SERIAL_TYPE_NORMAL = 1, }; /** * struct tty_operations -- interface between driver and tty * * @lookup: ``struct tty_struct *()(struct tty_driver *self, struct file *, * int idx)`` * * Return the tty device corresponding to @idx, %NULL if there is not * one currently in use and an %ERR_PTR value on error. Called under * %tty_mutex (for now!) * * Optional method. Default behaviour is to use the @self->ttys array. * * @install: ``int ()(struct tty_driver *self, struct tty_struct *tty)`` * * Install a new @tty into the @self's internal tables. Used in * conjunction with @lookup and @remove methods. * * Optional method. Default behaviour is to use the @self->ttys array. * * @remove: ``void ()(struct tty_driver *self, struct tty_struct *tty)`` * * Remove a closed @tty from the @self's internal tables. Used in * conjunction with @lookup and @remove methods. * * Optional method. Default behaviour is to use the @self->ttys array. * * @open: ``int ()(struct tty_struct *tty, struct file *)`` * * This routine is called when a particular @tty device is opened. This * routine is mandatory; if this routine is not filled in, the attempted * open will fail with %ENODEV. * * Required method. Called with tty lock held. May sleep. * * @close: ``void ()(struct tty_struct *tty, struct file *)`` * * This routine is called when a particular @tty device is closed. At the * point of return from this call the driver must make no further ldisc * calls of any kind. * * Remark: called even if the corresponding @open() failed. * * Required method. Called with tty lock held. May sleep. * * @shutdown: ``void ()(struct tty_struct *tty)`` * * This routine is called under the tty lock when a particular @tty device * is closed for the last time. It executes before the @tty resources * are freed so may execute while another function holds a @tty kref. * * @cleanup: ``void ()(struct tty_struct *tty)`` * * This routine is called asynchronously when a particular @tty device * is closed for the last time freeing up the resources. This is * actually the second part of shutdown for routines that might sleep. * * @write: ``ssize_t ()(struct tty_struct *tty, const u8 *buf, size_t count)`` * * This routine is called by the kernel to write a series (@count) of * characters (@buf) to the @tty device. The characters may come from * user space or kernel space. This routine will return the * number of characters actually accepted for writing. * * May occur in parallel in special cases. Because this includes panic * paths drivers generally shouldn't try and do clever locking here. * * Optional: Required for writable devices. May not sleep. * * @put_char: ``int ()(struct tty_struct *tty, u8 ch)`` * * This routine is called by the kernel to write a single character @ch to * the @tty device. If the kernel uses this routine, it must call the * @flush_chars() routine (if defined) when it is done stuffing characters * into the driver. If there is no room in the queue, the character is * ignored. * * Optional: Kernel will use the @write method if not provided. Do not * call this function directly, call tty_put_char(). * * @flush_chars: ``void ()(struct tty_struct *tty)`` * * This routine is called by the kernel after it has written a * series of characters to the tty device using @put_char(). * * Optional. Do not call this function directly, call * tty_driver_flush_chars(). * * @write_room: ``unsigned int ()(struct tty_struct *tty)`` * * This routine returns the numbers of characters the @tty driver * will accept for queuing to be written. This number is subject * to change as output buffers get emptied, or if the output flow * control is acted. * * The ldisc is responsible for being intelligent about multi-threading of * write_room/write calls * * Required if @write method is provided else not needed. Do not call this * function directly, call tty_write_room() * * @chars_in_buffer: ``unsigned int ()(struct tty_struct *tty)`` * * This routine returns the number of characters in the device private * output queue. Used in tty_wait_until_sent() and for poll() * implementation. * * Optional: if not provided, it is assumed there is no queue on the * device. Do not call this function directly, call tty_chars_in_buffer(). * * @ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd, * unsigned long arg)`` * * This routine allows the @tty driver to implement device-specific * ioctls. If the ioctl number passed in @cmd is not recognized by the * driver, it should return %ENOIOCTLCMD. * * Optional. * * @compat_ioctl: ``long ()(struct tty_struct *tty, unsigned int cmd, * unsigned long arg)`` * * Implement ioctl processing for 32 bit process on 64 bit system. * * Optional. * * @set_termios: ``void ()(struct tty_struct *tty, const struct ktermios *old)`` * * This routine allows the @tty driver to be notified when device's * termios settings have changed. New settings are in @tty->termios. * Previous settings are passed in the @old argument. * * The API is defined such that the driver should return the actual modes * selected. This means that the driver is responsible for modifying any * bits in @tty->termios it cannot fulfill to indicate the actual modes * being used. * * Optional. Called under the @tty->termios_rwsem. May sleep. * * @ldisc_ok: ``int ()(struct tty_struct *tty, int ldisc)`` * * This routine allows the @tty driver to decide if it can deal * with a particular @ldisc. * * Optional. Called under the @tty->ldisc_sem and @tty->termios_rwsem. * * @set_ldisc: ``void ()(struct tty_struct *tty)`` * * This routine allows the @tty driver to be notified when the device's * line discipline is being changed. At the point this is done the * discipline is not yet usable. * * Optional. Called under the @tty->ldisc_sem and @tty->termios_rwsem. * * @throttle: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that input buffers for the line * discipline are close to full, and it should somehow signal that no more * characters should be sent to the @tty. * * Serialization including with @unthrottle() is the job of the ldisc * layer. * * Optional: Always invoke via tty_throttle_safe(). Called under the * @tty->termios_rwsem. * * @unthrottle: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it should signal that * characters can now be sent to the @tty without fear of overrunning the * input buffers of the line disciplines. * * Optional. Always invoke via tty_unthrottle(). Called under the * @tty->termios_rwsem. * * @stop: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it should stop outputting * characters to the tty device. * * Called with @tty->flow.lock held. Serialized with @start() method. * * Optional. Always invoke via stop_tty(). * * @start: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it resumed sending * characters to the @tty device. * * Called with @tty->flow.lock held. Serialized with stop() method. * * Optional. Always invoke via start_tty(). * * @hangup: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it should hang up the @tty * device. * * Optional. Called with tty lock held. * * @break_ctl: ``int ()(struct tty_struct *tty, int state)`` * * This optional routine requests the @tty driver to turn on or off BREAK * status on the RS-232 port. If @state is -1, then the BREAK status * should be turned on; if @state is 0, then BREAK should be turned off. * * If this routine is implemented, the high-level tty driver will handle * the following ioctls: %TCSBRK, %TCSBRKP, %TIOCSBRK, %TIOCCBRK. * * If the driver sets %TTY_DRIVER_HARDWARE_BREAK in tty_alloc_driver(), * then the interface will also be called with actual times and the * hardware is expected to do the delay work itself. 0 and -1 are still * used for on/off. * * Optional: Required for %TCSBRK/%BRKP/etc. handling. May sleep. * * @flush_buffer: ``void ()(struct tty_struct *tty)`` * * This routine discards device private output buffer. Invoked on close, * hangup, to implement %TCOFLUSH ioctl and similar. * * Optional: if not provided, it is assumed there is no queue on the * device. Do not call this function directly, call * tty_driver_flush_buffer(). * * @wait_until_sent: ``void ()(struct tty_struct *tty, int timeout)`` * * This routine waits until the device has written out all of the * characters in its transmitter FIFO. Or until @timeout (in jiffies) is * reached. * * Optional: If not provided, the device is assumed to have no FIFO. * Usually correct to invoke via tty_wait_until_sent(). May sleep. * * @send_xchar: ``void ()(struct tty_struct *tty, u8 ch)`` * * This routine is used to send a high-priority XON/XOFF character (@ch) * to the @tty device. * * Optional: If not provided, then the @write method is called under * the @tty->atomic_write_lock to keep it serialized with the ldisc. * * @tiocmget: ``int ()(struct tty_struct *tty)`` * * This routine is used to obtain the modem status bits from the @tty * driver. * * Optional: If not provided, then %ENOTTY is returned from the %TIOCMGET * ioctl. Do not call this function directly, call tty_tiocmget(). * * @tiocmset: ``int ()(struct tty_struct *tty, * unsigned int set, unsigned int clear)`` * * This routine is used to set the modem status bits to the @tty driver. * First, @clear bits should be cleared, then @set bits set. * * Optional: If not provided, then %ENOTTY is returned from the %TIOCMSET * ioctl. Do not call this function directly, call tty_tiocmset(). * * @resize: ``int ()(struct tty_struct *tty, struct winsize *ws)`` * * Called when a termios request is issued which changes the requested * terminal geometry to @ws. * * Optional: the default action is to update the termios structure * without error. This is usually the correct behaviour. Drivers should * not force errors here if they are not resizable objects (e.g. a serial * line). See tty_do_resize() if you need to wrap the standard method * in your own logic -- the usual case. * * @get_icount: ``int ()(struct tty_struct *tty, * struct serial_icounter *icount)`` * * Called when the @tty device receives a %TIOCGICOUNT ioctl. Passed a * kernel structure @icount to complete. * * Optional: called only if provided, otherwise %ENOTTY will be returned. * * @get_serial: ``int ()(struct tty_struct *tty, struct serial_struct *p)`` * * Called when the @tty device receives a %TIOCGSERIAL ioctl. Passed a * kernel structure @p (&struct serial_struct) to complete. * * Optional: called only if provided, otherwise %ENOTTY will be returned. * Do not call this function directly, call tty_tiocgserial(). * * @set_serial: ``int ()(struct tty_struct *tty, struct serial_struct *p)`` * * Called when the @tty device receives a %TIOCSSERIAL ioctl. Passed a * kernel structure @p (&struct serial_struct) to set the values from. * * Optional: called only if provided, otherwise %ENOTTY will be returned. * Do not call this function directly, call tty_tiocsserial(). * * @show_fdinfo: ``void ()(struct tty_struct *tty, struct seq_file *m)`` * * Called when the @tty device file descriptor receives a fdinfo request * from VFS (to show in /proc/<pid>/fdinfo/). @m should be filled with * information. * * Optional: called only if provided, otherwise nothing is written to @m. * Do not call this function directly, call tty_show_fdinfo(). * * @poll_init: ``int ()(struct tty_driver *driver, int line, char *options)`` * * kgdboc support (Documentation/process/debugging/kgdb.rst). This routine is * called to initialize the HW for later use by calling @poll_get_char or * @poll_put_char. * * Optional: called only if provided, otherwise skipped as a non-polling * driver. * * @poll_get_char: ``int ()(struct tty_driver *driver, int line)`` * * kgdboc support (see @poll_init). @driver should read a character from a * tty identified by @line and return it. * * Optional: called only if @poll_init provided. * * @poll_put_char: ``void ()(struct tty_driver *driver, int line, char ch)`` * * kgdboc support (see @poll_init). @driver should write character @ch to * a tty identified by @line. * * Optional: called only if @poll_init provided. * * @proc_show: ``int ()(struct seq_file *m, void *driver)`` * * Driver @driver (cast to &struct tty_driver) can show additional info in * /proc/tty/driver/<driver_name>. It is enough to fill in the information * into @m. * * Optional: called only if provided, otherwise no /proc entry created. * * This structure defines the interface between the low-level tty driver and * the tty routines. These routines can be defined. Unless noted otherwise, * they are optional, and can be filled in with a %NULL pointer. */ struct tty_operations { struct tty_struct * (*lookup)(struct tty_driver *driver, struct file *filp, int idx); int (*install)(struct tty_driver *driver, struct tty_struct *tty); void (*remove)(struct tty_driver *driver, struct tty_struct *tty); int (*open)(struct tty_struct * tty, struct file * filp); void (*close)(struct tty_struct * tty, struct file * filp); void (*shutdown)(struct tty_struct *tty); void (*cleanup)(struct tty_struct *tty); ssize_t (*write)(struct tty_struct *tty, const u8 *buf, size_t count); int (*put_char)(struct tty_struct *tty, u8 ch); void (*flush_chars)(struct tty_struct *tty); unsigned int (*write_room)(struct tty_struct *tty); unsigned int (*chars_in_buffer)(struct tty_struct *tty); int (*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); long (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); void (*set_termios)(struct tty_struct *tty, const struct ktermios *old); void (*throttle)(struct tty_struct * tty); void (*unthrottle)(struct tty_struct * tty); void (*stop)(struct tty_struct *tty); void (*start)(struct tty_struct *tty); void (*hangup)(struct tty_struct *tty); int (*break_ctl)(struct tty_struct *tty, int state); void (*flush_buffer)(struct tty_struct *tty); int (*ldisc_ok)(struct tty_struct *tty, int ldisc); void (*set_ldisc)(struct tty_struct *tty); void (*wait_until_sent)(struct tty_struct *tty, int timeout); void (*send_xchar)(struct tty_struct *tty, u8 ch); int (*tiocmget)(struct tty_struct *tty); int (*tiocmset)(struct tty_struct *tty, unsigned int set, unsigned int clear); int (*resize)(struct tty_struct *tty, struct winsize *ws); int (*get_icount)(struct tty_struct *tty, struct serial_icounter_struct *icount); int (*get_serial)(struct tty_struct *tty, struct serial_struct *p); int (*set_serial)(struct tty_struct *tty, struct serial_struct *p); void (*show_fdinfo)(struct tty_struct *tty, struct seq_file *m); #ifdef CONFIG_CONSOLE_POLL int (*poll_init)(struct tty_driver *driver, int line, char *options); int (*poll_get_char)(struct tty_driver *driver, int line); void (*poll_put_char)(struct tty_driver *driver, int line, char ch); #endif int (*proc_show)(struct seq_file *m, void *driver); } __randomize_layout; /** * struct tty_driver -- driver for TTY devices * * @kref: reference counting. Reaching zero frees all the internals and the * driver. * @cdevs: allocated/registered character /dev devices * @owner: modules owning this driver. Used drivers cannot be rmmod'ed. * Automatically set by tty_alloc_driver(). * @driver_name: name of the driver used in /proc/tty * @name: used for constructing /dev node name * @name_base: used as a number base for constructing /dev node name * @major: major /dev device number (zero for autoassignment) * @minor_start: the first minor /dev device number * @num: number of devices allocated * @type: type of tty driver (enum tty_driver_type) * @subtype: subtype of tty driver (enum tty_driver_subtype) * @init_termios: termios to set to each tty initially (e.g. %tty_std_termios) * @flags: tty driver flags (%TTY_DRIVER_) * @proc_entry: proc fs entry, used internally * @other: driver of the linked tty; only used for the PTY driver * @ttys: array of active &struct tty_struct, set by tty_standard_install() * @ports: array of &struct tty_port; can be set during initialization by * tty_port_link_device() and similar * @termios: storage for termios at each TTY close for the next open * @driver_state: pointer to driver's arbitrary data * @ops: driver hooks for TTYs. Set them using tty_set_operations(). Use &struct * tty_port helpers in them as much as possible. * @tty_drivers: used internally to link tty_drivers together * * The usual handling of &struct tty_driver is to allocate it by * tty_alloc_driver(), set up all the necessary members, and register it by * tty_register_driver(). At last, the driver is torn down by calling * tty_unregister_driver() followed by tty_driver_kref_put(). * * The fields required to be set before calling tty_register_driver() include * @driver_name, @name, @type, @subtype, @init_termios, and @ops. */ struct tty_driver { struct kref kref; struct cdev **cdevs; struct module *owner; const char *driver_name; const char *name; int name_base; int major; int minor_start; unsigned int num; enum tty_driver_type type; enum tty_driver_subtype subtype; struct ktermios init_termios; unsigned long flags; struct proc_dir_entry *proc_entry; struct tty_driver *other; /* * Pointer to the tty data structures */ struct tty_struct **ttys; struct tty_port **ports; struct ktermios **termios; void *driver_state; /* * Driver methods */ const struct tty_operations *ops; struct list_head tty_drivers; } __randomize_layout; extern struct list_head tty_drivers; struct tty_driver *__tty_alloc_driver(unsigned int lines, struct module *owner, unsigned long flags); struct tty_driver *tty_find_polling_driver(char *name, int *line); void tty_driver_kref_put(struct tty_driver *driver); /** * tty_alloc_driver - allocate tty driver * @lines: count of lines this driver can handle at most * @flags: some of enum tty_driver_flag, will be set in driver->flags * * Returns: struct tty_driver or a PTR-encoded error (use IS_ERR() and friends). */ #define tty_alloc_driver(lines, flags) \ __tty_alloc_driver(lines, THIS_MODULE, flags) static inline struct tty_driver *tty_driver_kref_get(struct tty_driver *d) { kref_get(&d->kref); return d; } static inline void tty_set_operations(struct tty_driver *driver, const struct tty_operations *op) { driver->ops = op; } int tty_register_driver(struct tty_driver *driver); void tty_unregister_driver(struct tty_driver *driver); struct device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev); struct device *tty_register_device_attr(struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); void tty_unregister_device(struct tty_driver *driver, unsigned index); #ifdef CONFIG_PROC_FS void proc_tty_register_driver(struct tty_driver *); void proc_tty_unregister_driver(struct tty_driver *); #else static inline void proc_tty_register_driver(struct tty_driver *d) {} static inline void proc_tty_unregister_driver(struct tty_driver *d) {} #endif #endif /* #ifdef _LINUX_TTY_DRIVER_H */
11861 2 233 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fs.h> #define DEVCG_ACC_MKNOD 1 #define DEVCG_ACC_READ 2 #define DEVCG_ACC_WRITE 4 #define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE) #define DEVCG_DEV_BLOCK 1 #define DEVCG_DEV_CHAR 2 #define DEVCG_DEV_ALL 4 /* this represents all devices */ #if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) int devcgroup_check_permission(short type, u32 major, u32 minor, short access); static inline int devcgroup_inode_permission(struct inode *inode, int mask) { short type, access = 0; if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))) return 0; if (likely(!inode->i_rdev)) return 0; if (S_ISBLK(inode->i_mode)) type = DEVCG_DEV_BLOCK; else /* S_ISCHR by the test above */ type = DEVCG_DEV_CHAR; if (mask & MAY_WRITE) access |= DEVCG_ACC_WRITE; if (mask & MAY_READ) access |= DEVCG_ACC_READ; return devcgroup_check_permission(type, imajor(inode), iminor(inode), access); } static inline int devcgroup_inode_mknod(int mode, dev_t dev) { short type; if (!S_ISBLK(mode) && !S_ISCHR(mode)) return 0; if (S_ISCHR(mode) && dev == WHITEOUT_DEV) return 0; if (S_ISBLK(mode)) type = DEVCG_DEV_BLOCK; else type = DEVCG_DEV_CHAR; return devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), DEVCG_ACC_MKNOD); } #else static inline int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { return 0; } static inline int devcgroup_inode_permission(struct inode *inode, int mask) { return 0; } static inline int devcgroup_inode_mknod(int mode, dev_t dev) { return 0; } #endif
57 57 25 53 53 53 398 398 398 389 389 391 390 391 391 320 64 370 93 65 245 108 378 29 391 391 354 17 2 17 1 1 1 113 113 258 88 94 255 110 113 113 4 110 110 4 106 34 110 4 4 110 113 1 1 65 5 2 64 64 388 292 167 390 325 156 27 390 3 388 4 389 11 121 295 390 388 4 151 341 389 390 1 67 267 2 1 110 110 349 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 391 267 1 176 253 173 61 34 67 39 51 61 62 87 341 341 339 341 354 10 2 2 354 1 354 355 13 354 354 354 29 39 341 340 65 325 29 43 340 341 341 117 246 64 1 4 157 157 65 93 157 157 157 5 157 67 93 151 41 41 157 398 335 355 157 157 157 150 41 341 341 341 398 60 60 398 354 341 355 355 45 355 38 341 340 1 341 398 398 398 398 84 84 397 397 398 341 23 355 354 279 144 320 398 384 383 384 349 349 349 1 2 42 52 52 42 9 43 43 52 60 54 57 1 8 5 52 52 1 51 57 40 32 30 14 1 20 21 40 74 41 40 40 133 1 2 6 8 8 143 142 2 60 78 143 143 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 // SPDX-License-Identifier: GPL-2.0 /* * Memory Migration functionality - linux/mm/migrate.c * * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter * * Page migration was first developed in the context of the memory hotplug * project. The main authors of the migration code are: * * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> * Hirokazu Takahashi <taka@valinux.co.jp> * Dave Hansen <haveblue@us.ibm.com> * Christoph Lameter */ #include <linux/migrate.h> #include <linux/export.h> #include <linux/swap.h> #include <linux/leafops.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/mm_inline.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/writeback.h> #include <linux/mempolicy.h> #include <linux/vmalloc.h> #include <linux/security.h> #include <linux/backing-dev.h> #include <linux/compaction.h> #include <linux/syscalls.h> #include <linux/compat.h> #include <linux/hugetlb.h> #include <linux/gfp.h> #include <linux/page_idle.h> #include <linux/page_owner.h> #include <linux/sched/mm.h> #include <linux/ptrace.h> #include <linux/memory.h> #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> #include <linux/pagewalk.h> #include <asm/tlbflush.h> #include <trace/events/migrate.h> #include "internal.h" #include "swap.h" static const struct movable_operations *offline_movable_ops; static const struct movable_operations *zsmalloc_movable_ops; int set_movable_ops(const struct movable_operations *ops, enum pagetype type) { /* * We only allow for selected types and don't handle concurrent * registration attempts yet. */ switch (type) { case PGTY_offline: if (offline_movable_ops && ops) return -EBUSY; offline_movable_ops = ops; break; case PGTY_zsmalloc: if (zsmalloc_movable_ops && ops) return -EBUSY; zsmalloc_movable_ops = ops; break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL_GPL(set_movable_ops); static const struct movable_operations *page_movable_ops(struct page *page) { VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page); /* * If we enable page migration for a page of a certain type by marking * it as movable, the page type must be sticky until the page gets freed * back to the buddy. */ if (PageOffline(page)) /* Only balloon compaction sets PageOffline pages movable. */ return offline_movable_ops; if (PageZsmalloc(page)) return zsmalloc_movable_ops; return NULL; } /** * isolate_movable_ops_page - isolate a movable_ops page for migration * @page: The page. * @mode: The isolation mode. * * Try to isolate a movable_ops page for migration. Will fail if the page is * not a movable_ops page, if the page is already isolated for migration * or if the page was just was released by its owner. * * Once isolated, the page cannot get freed until it is either putback * or migrated. * * Returns true if isolation succeeded, otherwise false. */ bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode) { /* * TODO: these pages will not be folios in the future. All * folio dependencies will have to be removed. */ struct folio *folio = folio_get_nontail_page(page); const struct movable_operations *mops; /* * Avoid burning cycles with pages that are yet under __free_pages(), * or just got freed under us. * * In case we 'win' a race for a movable page being freed under us and * raise its refcount preventing __free_pages() from doing its job * the put_page() at the end of this block will take care of * release this page, thus avoiding a nasty leakage. */ if (!folio) goto out; /* * Check for movable_ops pages before taking the page lock because * we use non-atomic bitops on newly allocated page flags so * unconditionally grabbing the lock ruins page's owner side. * * Note that once a page has movable_ops, it will stay that way * until the page was freed. */ if (unlikely(!page_has_movable_ops(page))) goto out_putfolio; /* * As movable pages are not isolated from LRU lists, concurrent * compaction threads can race against page migration functions * as well as race against the releasing a page. * * In order to avoid having an already isolated movable page * being (wrongly) re-isolated while it is under migration, * or to avoid attempting to isolate pages being released, * lets be sure we have the page lock * before proceeding with the movable page isolation steps. */ if (unlikely(!folio_trylock(folio))) goto out_putfolio; VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page); if (PageMovableOpsIsolated(page)) goto out_no_isolated; mops = page_movable_ops(page); if (WARN_ON_ONCE(!mops)) goto out_no_isolated; if (!mops->isolate_page(page, mode)) goto out_no_isolated; /* Driver shouldn't use the isolated flag */ VM_WARN_ON_ONCE_PAGE(PageMovableOpsIsolated(page), page); SetPageMovableOpsIsolated(page); folio_unlock(folio); return true; out_no_isolated: folio_unlock(folio); out_putfolio: folio_put(folio); out: return false; } /** * putback_movable_ops_page - putback an isolated movable_ops page * @page: The isolated page. * * Putback an isolated movable_ops page. * * After the page was putback, it might get freed instantly. */ static void putback_movable_ops_page(struct page *page) { /* * TODO: these pages will not be folios in the future. All * folio dependencies will have to be removed. */ struct folio *folio = page_folio(page); VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page); VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(page), page); folio_lock(folio); page_movable_ops(page)->putback_page(page); ClearPageMovableOpsIsolated(page); folio_unlock(folio); folio_put(folio); } /** * migrate_movable_ops_page - migrate an isolated movable_ops page * @dst: The destination page. * @src: The source page. * @mode: The migration mode. * * Migrate an isolated movable_ops page. * * If the src page was already released by its owner, the src page is * un-isolated (putback) and migration succeeds; the migration core will be the * owner of both pages. * * If the src page was not released by its owner and the migration was * successful, the owner of the src page and the dst page are swapped and * the src page is un-isolated. * * If migration fails, the ownership stays unmodified and the src page * remains isolated: migration may be retried later or the page can be putback. * * TODO: migration core will treat both pages as folios and lock them before * this call to unlock them after this call. Further, the folio refcounts on * src and dst are also released by migration core. These pages will not be * folios in the future, so that must be reworked. * * Returns 0 on success, otherwise a negative error code. */ static int migrate_movable_ops_page(struct page *dst, struct page *src, enum migrate_mode mode) { int rc; VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src); VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src); rc = page_movable_ops(src)->migrate_page(dst, src, mode); if (!rc) ClearPageMovableOpsIsolated(src); return rc; } /* * Put previously isolated pages back onto the appropriate lists * from where they were once taken off for compaction/migration. * * This function shall be used whenever the isolated pageset has been * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() * and folio_isolate_hugetlb(). */ void putback_movable_pages(struct list_head *l) { struct folio *folio; struct folio *folio2; list_for_each_entry_safe(folio, folio2, l, lru) { if (unlikely(folio_test_hugetlb(folio))) { folio_putback_hugetlb(folio); continue; } list_del(&folio->lru); if (unlikely(page_has_movable_ops(&folio->page))) { putback_movable_ops_page(&folio->page); } else { node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), -folio_nr_pages(folio)); folio_putback_lru(folio); } } } /* Must be called with an elevated refcount on the non-hugetlb folio */ bool isolate_folio_to_list(struct folio *folio, struct list_head *list) { if (folio_test_hugetlb(folio)) return folio_isolate_hugetlb(folio, list); if (page_has_movable_ops(&folio->page)) { if (!isolate_movable_ops_page(&folio->page, ISOLATE_UNEVICTABLE)) return false; } else { if (!folio_isolate_lru(folio)) return false; node_stat_add_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio)); } list_add(&folio->lru, list); return true; } static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, struct folio *folio, pte_t old_pte, unsigned long idx) { struct page *page = folio_page(folio, idx); pte_t newpte; if (PageCompound(page) || PageHWPoison(page)) return false; VM_BUG_ON_PAGE(!PageAnon(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(pte_present(old_pte), page); VM_WARN_ON_ONCE_FOLIO(folio_is_device_private(folio), folio); if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) || mm_forbids_zeropage(pvmw->vma->vm_mm)) return false; /* * The pmd entry mapping the old thp was flushed and the pte mapping * this subpage has been non present. If the subpage is only zero-filled * then map it to the shared zeropage. */ if (!pages_identical(page, ZERO_PAGE(0))) return false; newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address), pvmw->vma->vm_page_prot)); if (pte_swp_soft_dirty(old_pte)) newpte = pte_mksoft_dirty(newpte); if (pte_swp_uffd_wp(old_pte)) newpte = pte_mkuffd_wp(newpte); set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte); dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio)); return true; } struct rmap_walk_arg { struct folio *folio; bool map_unused_to_zeropage; }; /* * Restore a potential migration pte to a working pte entry */ static bool remove_migration_pte(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg) { struct rmap_walk_arg *rmap_walk_arg = arg; DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION); while (page_vma_mapped_walk(&pvmw)) { rmap_t rmap_flags = RMAP_NONE; pte_t old_pte; pte_t pte; softleaf_t entry; struct page *new; unsigned long idx = 0; /* pgoff is invalid for ksm pages, but they are never large */ if (folio_test_large(folio) && !folio_test_hugetlb(folio)) idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff; new = folio_page(folio, idx); #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION /* PMD-mapped THP migration entry */ if (!pvmw.pte) { VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || !folio_test_pmd_mappable(folio), folio); remove_migration_pmd(&pvmw, new); continue; } #endif old_pte = ptep_get(pvmw.pte); if (rmap_walk_arg->map_unused_to_zeropage && try_to_map_unused_to_zeropage(&pvmw, folio, old_pte, idx)) continue; folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); entry = softleaf_from_pte(old_pte); if (!softleaf_is_migration_young(entry)) pte = pte_mkold(pte); if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry)) pte = pte_mkdirty(pte); if (pte_swp_soft_dirty(old_pte)) pte = pte_mksoft_dirty(pte); else pte = pte_clear_soft_dirty(pte); if (softleaf_is_migration_write(entry)) pte = pte_mkwrite(pte, vma); else if (pte_swp_uffd_wp(old_pte)) pte = pte_mkuffd_wp(pte); if (folio_test_anon(folio) && !softleaf_is_migration_read(entry)) rmap_flags |= RMAP_EXCLUSIVE; if (unlikely(is_device_private_page(new))) { if (pte_write(pte)) entry = make_writable_device_private_entry( page_to_pfn(new)); else entry = make_readable_device_private_entry( page_to_pfn(new)); pte = softleaf_to_pte(entry); if (pte_swp_soft_dirty(old_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(old_pte)) pte = pte_swp_mkuffd_wp(pte); } #ifdef CONFIG_HUGETLB_PAGE if (folio_test_hugetlb(folio)) { struct hstate *h = hstate_vma(vma); unsigned int shift = huge_page_shift(h); unsigned long psize = huge_page_size(h); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (folio_test_anon(folio)) hugetlb_add_anon_rmap(folio, vma, pvmw.address, rmap_flags); else hugetlb_add_file_rmap(folio); set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte, psize); } else #endif { if (folio_test_anon(folio)) folio_add_anon_rmap_pte(folio, new, vma, pvmw.address, rmap_flags); else folio_add_file_rmap_pte(folio, new, vma); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (READ_ONCE(vma->vm_flags) & VM_LOCKED) mlock_drain_local(); trace_remove_migration_pte(pvmw.address, pte_val(pte), compound_order(new)); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); } return true; } /* * Get rid of all migration entries and replace them by * references to the indicated page. */ void remove_migration_ptes(struct folio *src, struct folio *dst, int flags) { struct rmap_walk_arg rmap_walk_arg = { .folio = src, .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE, }; struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = &rmap_walk_arg, }; VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src); if (flags & RMP_LOCKED) rmap_walk_locked(dst, &rwc); else rmap_walk(dst, &rwc); } /* * Something used the pte of a page under migration. We need to * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. */ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { spinlock_t *ptl; pte_t *ptep; pte_t pte; softleaf_t entry; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return; pte = ptep_get(ptep); pte_unmap(ptep); if (pte_none(pte) || pte_present(pte)) goto out; entry = softleaf_from_pte(pte); if (!softleaf_is_migration(entry)) goto out; migration_entry_wait_on_locked(entry, ptl); return; out: spin_unlock(ptl); } #ifdef CONFIG_HUGETLB_PAGE /* * The vma read lock must be held upon entry. Holding that lock prevents either * the pte or the ptl from being freed. * * This function will release the vma lock before returning. */ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep); softleaf_t entry; pte_t pte; hugetlb_vma_assert_locked(vma); spin_lock(ptl); pte = huge_ptep_get(vma->vm_mm, addr, ptep); if (huge_pte_none(pte)) goto fail; entry = softleaf_from_pte(pte); if (softleaf_is_migration(entry)) { /* * If migration entry existed, safe to release vma lock * here because the pgtable page won't be freed without the * pgtable lock released. See comment right above pgtable * lock release in migration_entry_wait_on_locked(). */ hugetlb_vma_unlock_read(vma); migration_entry_wait_on_locked(entry, ptl); return; } fail: spin_unlock(ptl); hugetlb_vma_unlock_read(vma); } #endif #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; ptl = pmd_lock(mm, pmd); if (!pmd_is_migration_entry(*pmd)) goto unlock; migration_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl); return; unlock: spin_unlock(ptl); } #endif /* * Replace the folio in the mapping. * * The number of remaining references must be: * 1 for anonymous folios without a mapping * 2 for folios with a mapping * 3 for folios with a mapping and the private flag set. */ static int __folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int expected_count) { XA_STATE(xas, &mapping->i_pages, folio->index); struct swap_cluster_info *ci = NULL; struct zone *oldzone, *newzone; int dirty; long nr = folio_nr_pages(folio); if (!mapping) { /* Take off deferred split queue while frozen and memcg set */ if (folio_test_large(folio) && folio_test_large_rmappable(folio)) { if (!folio_ref_freeze(folio, expected_count)) return -EAGAIN; folio_unqueue_deferred_split(folio); folio_ref_unfreeze(folio, expected_count); } /* No turning back from here */ newfolio->index = folio->index; newfolio->mapping = folio->mapping; if (folio_test_anon(folio) && folio_test_large(folio)) mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); return 0; } oldzone = folio_zone(folio); newzone = folio_zone(newfolio); if (folio_test_swapcache(folio)) ci = swap_cluster_get_and_lock_irq(folio); else xas_lock_irq(&xas); if (!folio_ref_freeze(folio, expected_count)) { if (ci) swap_cluster_unlock_irq(ci); else xas_unlock_irq(&xas); return -EAGAIN; } /* Take off deferred split queue while frozen and memcg set */ folio_unqueue_deferred_split(folio); /* * Now we know that no one else is looking at the folio: * no turning back from here. */ newfolio->index = folio->index; newfolio->mapping = folio->mapping; if (folio_test_anon(folio) && folio_test_large(folio)) mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); folio_ref_add(newfolio, nr); /* add cache reference */ if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); if (folio_test_swapcache(folio)) { folio_set_swapcache(newfolio); newfolio->private = folio_get_private(folio); } /* Move dirty while folio refs frozen and newfolio not yet exposed */ dirty = folio_test_dirty(folio); if (dirty) { folio_clear_dirty(folio); folio_set_dirty(newfolio); } if (folio_test_swapcache(folio)) __swap_cache_replace_folio(ci, folio, newfolio); else xas_store(&xas, newfolio); /* * Drop cache reference from old folio by unfreezing * to one less reference. * We know this isn't the last reference. */ folio_ref_unfreeze(folio, expected_count - nr); /* Leave irq disabled to prevent preemption while updating stats */ if (ci) swap_cluster_unlock(ci); else xas_unlock(&xas); /* * If moved to a different zone then also account * the folio for that zone. Other VM counters will be * taken care of when we establish references to the * new folio and drop references to the old folio. * * Note that anonymous folios are accounted for * via NR_FILE_PAGES and NR_ANON_MAPPED if they * are mapped to swap space. */ if (newzone != oldzone) { struct lruvec *old_lruvec, *new_lruvec; struct mem_cgroup *memcg; memcg = folio_memcg(folio); old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) { mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); mod_lruvec_state(new_lruvec, NR_SHMEM, nr); if (folio_test_pmd_mappable(folio)) { mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr); mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr); } } #ifdef CONFIG_SWAP if (folio_test_swapcache(folio)) { mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); } #endif if (dirty && mapping_can_writeback(mapping)) { mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr); mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr); } } local_irq_enable(); return 0; } int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count) { int expected_count = folio_expected_ref_count(folio) + extra_count + 1; if (folio_ref_count(folio) != expected_count) return -EAGAIN; return __folio_migrate_mapping(mapping, newfolio, folio, expected_count); } EXPORT_SYMBOL(folio_migrate_mapping); /* * The expected number of remaining references is the same as that * of folio_migrate_mapping(). */ int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) { XA_STATE(xas, &mapping->i_pages, src->index); int rc, expected_count = folio_expected_ref_count(src) + 1; if (folio_ref_count(src) != expected_count) return -EAGAIN; rc = folio_mc_copy(dst, src); if (unlikely(rc)) return rc; xas_lock_irq(&xas); if (!folio_ref_freeze(src, expected_count)) { xas_unlock_irq(&xas); return -EAGAIN; } dst->index = src->index; dst->mapping = src->mapping; folio_ref_add(dst, folio_nr_pages(dst)); xas_store(&xas, dst); folio_ref_unfreeze(src, expected_count - folio_nr_pages(src)); xas_unlock_irq(&xas); return 0; } /* * Copy the flags and some other ancillary information */ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) { int cpupid; if (folio_test_referenced(folio)) folio_set_referenced(newfolio); if (folio_test_uptodate(folio)) folio_mark_uptodate(newfolio); if (folio_test_clear_active(folio)) { VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio); folio_set_active(newfolio); } else if (folio_test_clear_unevictable(folio)) folio_set_unevictable(newfolio); if (folio_test_workingset(folio)) folio_set_workingset(newfolio); if (folio_test_checked(folio)) folio_set_checked(newfolio); /* * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via * migration entries. We can still have PG_anon_exclusive set on an * effectively unmapped and unreferenced first sub-pages of an * anonymous THP: we can simply copy it here via PG_mappedtodisk. */ if (folio_test_mappedtodisk(folio)) folio_set_mappedtodisk(newfolio); /* Move dirty on pages not done by folio_migrate_mapping() */ if (folio_test_dirty(folio)) folio_set_dirty(newfolio); if (folio_test_young(folio)) folio_set_young(newfolio); if (folio_test_idle(folio)) folio_set_idle(newfolio); folio_migrate_refs(newfolio, folio); /* * Copy NUMA information to the new page, to prevent over-eager * future migrations of this same page. */ cpupid = folio_xchg_last_cpupid(folio, -1); /* * For memory tiering mode, when migrate between slow and fast * memory node, reset cpupid, because that is used to record * page access time in slow memory node. */ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) { bool f_toptier = node_is_toptier(folio_nid(folio)); bool t_toptier = node_is_toptier(folio_nid(newfolio)); if (f_toptier != t_toptier) cpupid = -1; } folio_xchg_last_cpupid(newfolio, cpupid); folio_migrate_ksm(newfolio, folio); /* * Please do not reorder this without considering how mm/ksm.c's * ksm_get_folio() depends upon ksm_migrate_page() and the * swapcache flag. */ if (folio_test_swapcache(folio)) folio_clear_swapcache(folio); folio_clear_private(folio); /* page->private contains hugetlb specific flags */ if (!folio_test_hugetlb(folio)) folio->private = NULL; /* * If any waiters have accumulated on the new page then * wake them up. */ if (folio_test_writeback(newfolio)) folio_end_writeback(newfolio); /* * PG_readahead shares the same bit with PG_reclaim. The above * end_page_writeback() may clear PG_readahead mistakenly, so set the * bit after that. */ if (folio_test_readahead(folio)) folio_set_readahead(newfolio); folio_copy_owner(newfolio, folio); pgalloc_tag_swap(newfolio, folio); mem_cgroup_migrate(folio, newfolio); } EXPORT_SYMBOL(folio_migrate_flags); /************************************************************ * Migration functions ***********************************************************/ static int __migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, void *src_private, enum migrate_mode mode) { int rc, expected_count = folio_expected_ref_count(src) + 1; /* Check whether src does not have extra refs before we do more work */ if (folio_ref_count(src) != expected_count) return -EAGAIN; rc = folio_mc_copy(dst, src); if (unlikely(rc)) return rc; rc = __folio_migrate_mapping(mapping, dst, src, expected_count); if (rc) return rc; if (src_private) folio_attach_private(dst, folio_detach_private(src)); folio_migrate_flags(dst, src); return 0; } /** * migrate_folio() - Simple folio migration. * @mapping: The address_space containing the folio. * @dst: The folio to migrate the data to. * @src: The folio containing the current data. * @mode: How to migrate the page. * * Common logic to directly migrate a single LRU folio suitable for * folios that do not have private data. * * Folios are locked upon entry and exit. */ int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ return __migrate_folio(mapping, dst, src, NULL, mode); } EXPORT_SYMBOL(migrate_folio); #ifdef CONFIG_BUFFER_HEAD /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, enum migrate_mode mode) { struct buffer_head *bh = head; struct buffer_head *failed_bh; do { if (!trylock_buffer(bh)) { if (mode == MIGRATE_ASYNC) goto unlock; if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh)) goto unlock; lock_buffer(bh); } bh = bh->b_this_page; } while (bh != head); return true; unlock: /* We failed to lock the buffer and cannot stall. */ failed_bh = bh; bh = head; while (bh != failed_bh) { unlock_buffer(bh); bh = bh->b_this_page; } return false; } static int __buffer_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode, bool check_refs) { struct buffer_head *bh, *head; int rc; int expected_count; head = folio_buffers(src); if (!head) return migrate_folio(mapping, dst, src, mode); /* Check whether page does not have extra refs before we do more work */ expected_count = folio_expected_ref_count(src) + 1; if (folio_ref_count(src) != expected_count) return -EAGAIN; if (!buffer_migrate_lock_buffers(head, mode)) return -EAGAIN; if (check_refs) { bool busy, migrating; bool invalidated = false; migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state); VM_WARN_ON_ONCE(migrating); recheck_buffers: busy = false; spin_lock(&mapping->i_private_lock); bh = head; do { if (atomic_read(&bh->b_count)) { busy = true; break; } bh = bh->b_this_page; } while (bh != head); spin_unlock(&mapping->i_private_lock); if (busy) { if (invalidated) { rc = -EAGAIN; goto unlock_buffers; } invalidate_bh_lrus(); invalidated = true; goto recheck_buffers; } } rc = filemap_migrate_folio(mapping, dst, src, mode); if (rc) goto unlock_buffers; bh = head; do { folio_set_bh(bh, dst, bh_offset(bh)); bh = bh->b_this_page; } while (bh != head); unlock_buffers: if (check_refs) clear_bit_unlock(BH_Migrate, &head->b_state); bh = head; do { unlock_buffer(bh); bh = bh->b_this_page; } while (bh != head); return rc; } /** * buffer_migrate_folio() - Migration function for folios with buffers. * @mapping: The address space containing @src. * @dst: The folio to migrate to. * @src: The folio to migrate from. * @mode: How to migrate the folio. * * This function can only be used if the underlying filesystem guarantees * that no other references to @src exist. For example attached buffer * heads are accessed only under the folio lock. If your filesystem cannot * provide this guarantee, buffer_migrate_folio_norefs() may be more * appropriate. * * Return: 0 on success or a negative errno on failure. */ int buffer_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return __buffer_migrate_folio(mapping, dst, src, mode, false); } EXPORT_SYMBOL(buffer_migrate_folio); /** * buffer_migrate_folio_norefs() - Migration function for folios with buffers. * @mapping: The address space containing @src. * @dst: The folio to migrate to. * @src: The folio to migrate from. * @mode: How to migrate the folio. * * Like buffer_migrate_folio() except that this variant is more careful * and checks that there are also no buffer head references. This function * is the right one for mappings where buffer heads are directly looked * up and referenced (such as block device mappings). * * Return: 0 on success or a negative errno on failure. */ int buffer_migrate_folio_norefs(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return __buffer_migrate_folio(mapping, dst, src, mode, true); } EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs); #endif /* CONFIG_BUFFER_HEAD */ int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return __migrate_folio(mapping, dst, src, folio_get_private(src), mode); } EXPORT_SYMBOL_GPL(filemap_migrate_folio); /* * Default handling if a filesystem does not provide a migration function. */ static int fallback_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { WARN_ONCE(mapping->a_ops->writepages, "%ps does not implement migrate_folio\n", mapping->a_ops); if (folio_test_dirty(src)) return -EBUSY; /* * Filesystem may have private data at folio->private that we * can't migrate automatically. */ if (!filemap_release_folio(src, GFP_KERNEL)) return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; return migrate_folio(mapping, dst, src, mode); } /* * Move a src folio to a newly allocated dst folio. * * The src and dst folios are locked and the src folios was unmapped from * the page tables. * * On success, the src folio was replaced by the dst folio. * * Return value: * < 0 - error code * 0 - success */ static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) { struct address_space *mapping = folio_mapping(src); int rc = -EAGAIN; VM_BUG_ON_FOLIO(!folio_test_locked(src), src); VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); if (!mapping) rc = migrate_folio(mapping, dst, src, mode); else if (mapping_inaccessible(mapping)) rc = -EOPNOTSUPP; else if (mapping->a_ops->migrate_folio) /* * Most folios have a mapping and most filesystems * provide a migrate_folio callback. Anonymous folios * are part of swap space which also has its own * migrate_folio callback. This is the most common path * for page migration. */ rc = mapping->a_ops->migrate_folio(mapping, dst, src, mode); else rc = fallback_migrate_folio(mapping, dst, src, mode); if (!rc) { /* * For pagecache folios, src->mapping must be cleared before src * is freed. Anonymous folios must stay anonymous until freed. */ if (!folio_test_anon(src)) src->mapping = NULL; if (likely(!folio_is_zone_device(dst))) flush_dcache_folio(dst); } return rc; } /* * To record some information during migration, we use unused private * field of struct folio of the newly allocated destination folio. * This is safe because nobody is using it except us. */ enum { PAGE_WAS_MAPPED = BIT(0), PAGE_WAS_MLOCKED = BIT(1), PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED, }; static void __migrate_folio_record(struct folio *dst, int old_page_state, struct anon_vma *anon_vma) { dst->private = (void *)anon_vma + old_page_state; } static void __migrate_folio_extract(struct folio *dst, int *old_page_state, struct anon_vma **anon_vmap) { unsigned long private = (unsigned long)dst->private; *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES); *old_page_state = private & PAGE_OLD_STATES; dst->private = NULL; } /* Restore the source folio to the original state upon failure */ static void migrate_folio_undo_src(struct folio *src, int page_was_mapped, struct anon_vma *anon_vma, bool locked, struct list_head *ret) { if (page_was_mapped) remove_migration_ptes(src, src, 0); /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); if (locked) folio_unlock(src); if (ret) list_move_tail(&src->lru, ret); } /* Restore the destination folio to the original state upon failure */ static void migrate_folio_undo_dst(struct folio *dst, bool locked, free_folio_t put_new_folio, unsigned long private) { if (locked) folio_unlock(dst); if (put_new_folio) put_new_folio(dst, private); else folio_put(dst); } /* Cleanup src folio upon migration success */ static void migrate_folio_done(struct folio *src, enum migrate_reason reason) { if (likely(!page_has_movable_ops(&src->page)) && reason != MR_DEMOTION) mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + folio_is_file_lru(src), -folio_nr_pages(src)); if (reason != MR_MEMORY_FAILURE) /* We release the page in page_handle_poison. */ folio_put(src); } /* Obtain the lock on page, remove all ptes. */ static int migrate_folio_unmap(new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio **dstp, enum migrate_mode mode, struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; int old_page_state = 0; struct anon_vma *anon_vma = NULL; bool locked = false; bool dst_locked = false; dst = get_new_folio(src, private); if (!dst) return -ENOMEM; *dstp = dst; dst->private = NULL; if (!folio_trylock(src)) { if (mode == MIGRATE_ASYNC) goto out; /* * It's not safe for direct compaction to call lock_page. * For example, during page readahead pages are added locked * to the LRU. Later, when the IO completes the pages are * marked uptodate and unlocked. However, the queueing * could be merging multiple pages for one bio (e.g. * mpage_readahead). If an allocation happens for the * second or third page, the process can end up locking * the same page twice and deadlocking. Rather than * trying to be clever about what pages can be locked, * avoid the use of lock_page for direct compaction * altogether. */ if (current->flags & PF_MEMALLOC) goto out; /* * In "light" mode, we can wait for transient locks (eg * inserting a page into the page table), but it's not * worth waiting for I/O. */ if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src)) goto out; folio_lock(src); } locked = true; if (folio_test_mlocked(src)) old_page_state |= PAGE_WAS_MLOCKED; if (folio_test_writeback(src)) { /* * Only in the case of a full synchronous migration is it * necessary to wait for PageWriteback. In the async case, * the retry loop is too short and in the sync-light case, * the overhead of stalling is too much */ switch (mode) { case MIGRATE_SYNC: break; default: rc = -EBUSY; goto out; } folio_wait_writeback(src); } /* * By try_to_migrate(), src->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrate a page. * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. * * Only folio_get_anon_vma() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. * But if we cannot get anon_vma, then we won't need it anyway, * because that implies that the anon page is no longer mapped * (and cannot be remapped so long as we hold the page lock). */ if (folio_test_anon(src) && !folio_test_ksm(src)) anon_vma = folio_get_anon_vma(src); /* * Block others from accessing the new page when we get around to * establishing additional references. We are usually the only one * holding a reference to dst at this point. We used to have a BUG * here if folio_trylock(dst) fails, but would like to allow for * cases where there might be a race with the previous use of dst. * This is much like races on refcount of oldpage: just don't BUG(). */ if (unlikely(!folio_trylock(dst))) goto out; dst_locked = true; if (unlikely(page_has_movable_ops(&src->page))) { __migrate_folio_record(dst, old_page_state, anon_vma); return 0; } /* * Corner case handling: * 1. When a new swap-cache page is read into, it is added to the LRU * and treated as swapcache but it has no rmap yet. * Calling try_to_unmap() against a src->mapping==NULL page will * trigger a BUG. So handle it here. * 2. An orphaned page (see truncate_cleanup_page) might have * fs-private metadata. The page can be picked up due to memory * offlining. Everywhere else except page reclaim, the page is * invisible to the vm, so the page can not be migrated. So try to * free the metadata, so the page can be freed. */ if (!src->mapping) { if (folio_test_private(src)) { try_to_free_buffers(src); goto out; } } else if (folio_mapped(src)) { /* Establish migration ptes */ VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0); old_page_state |= PAGE_WAS_MAPPED; } if (!folio_mapped(src)) { __migrate_folio_record(dst, old_page_state, anon_vma); return 0; } out: /* * A folio that has not been unmapped will be restored to * right list unless we want to retry. */ if (rc == -EAGAIN) ret = NULL; migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, anon_vma, locked, ret); migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private); return rc; } /* Migrate the folio to the newly allocated folio in dst. */ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio *dst, enum migrate_mode mode, enum migrate_reason reason, struct list_head *ret) { int rc; int old_page_state = 0; struct anon_vma *anon_vma = NULL; struct list_head *prev; __migrate_folio_extract(dst, &old_page_state, &anon_vma); prev = dst->lru.prev; list_del(&dst->lru); if (unlikely(page_has_movable_ops(&src->page))) { rc = migrate_movable_ops_page(&dst->page, &src->page, mode); if (rc) goto out; goto out_unlock_both; } rc = move_to_new_folio(dst, src, mode); if (rc) goto out; /* * When successful, push dst to LRU immediately: so that if it * turns out to be an mlocked page, remove_migration_ptes() will * automatically build up the correct dst->mlock_count for it. * * We would like to do something similar for the old page, when * unsuccessful, and other cases when a page has been temporarily * isolated from the unevictable LRU: but this case is the easiest. */ folio_add_lru(dst); if (old_page_state & PAGE_WAS_MLOCKED) lru_add_drain(); if (old_page_state & PAGE_WAS_MAPPED) remove_migration_ptes(src, dst, 0); out_unlock_both: folio_unlock(dst); folio_set_owner_migrate_reason(dst, reason); /* * If migration is successful, decrease refcount of dst, * which will not free the page because new page owner increased * refcounter. */ folio_put(dst); /* * A folio that has been migrated has all references removed * and will be freed. */ list_del(&src->lru); /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); folio_unlock(src); migrate_folio_done(src, reason); return rc; out: /* * A folio that has not been migrated will be restored to * right list unless we want to retry. */ if (rc == -EAGAIN) { list_add(&dst->lru, prev); __migrate_folio_record(dst, old_page_state, anon_vma); return rc; } migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, anon_vma, true, ret); migrate_folio_undo_dst(dst, true, put_new_folio, private); return rc; } /* * Counterpart of unmap_and_move_page() for hugepage migration. * * This function doesn't wait the completion of hugepage I/O * because there is no race between I/O and migration for hugepage. * Note that currently hugepage I/O occurs only in direct I/O * where no lock is held and PG_writeback is irrelevant, * and writeback status of all subpages are counted in the reference * count of the head page (i.e. if all subpages of a 2MB hugepage are * under direct I/O, the reference of the head page is 512 and a bit more.) * This means that when we try to migrate hugepage whose subpages are * doing direct I/O, some references remain after try_to_unmap() and * hugepage migration fails without data corruption. * * There is also no race when direct I/O is issued on the page under migration, * because then pte is replaced with migration swap entry and direct I/O code * will wait in the page fault for migration to complete. */ static int unmap_and_move_huge_page(new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, struct folio *src, int force, enum migrate_mode mode, int reason, struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ folio_putback_hugetlb(src); return 0; } dst = get_new_folio(src, private); if (!dst) return -ENOMEM; if (!folio_trylock(src)) { if (!force) goto out; switch (mode) { case MIGRATE_SYNC: break; default: goto out; } folio_lock(src); } /* * Check for pages which are in the process of being freed. Without * folio_mapping() set, hugetlbfs specific move page routine will not * be called and we could leak usage counts for subpools. */ if (hugetlb_folio_subpool(src) && !folio_mapping(src)) { rc = -EBUSY; goto out_unlock; } if (folio_test_anon(src)) anon_vma = folio_get_anon_vma(src); if (unlikely(!folio_trylock(dst))) goto put_anon; if (folio_mapped(src)) { enum ttu_flags ttu = 0; if (!folio_test_anon(src)) { /* * In shared mappings, try_to_unmap could potentially * call huge_pmd_unshare. Because of this, take * semaphore in write mode here and set TTU_RMAP_LOCKED * to let lower levels know we have taken the lock. */ mapping = hugetlb_folio_mapping_lock_write(src); if (unlikely(!mapping)) goto unlock_put_anon; ttu = TTU_RMAP_LOCKED; } try_to_migrate(src, ttu); page_was_mapped = 1; if (ttu & TTU_RMAP_LOCKED) i_mmap_unlock_write(mapping); } if (!folio_mapped(src)) rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) remove_migration_ptes(src, !rc ? dst : src, 0); unlock_put_anon: folio_unlock(dst); put_anon: if (anon_vma) put_anon_vma(anon_vma); if (!rc) { move_hugetlb_state(src, dst, reason); put_new_folio = NULL; } out_unlock: folio_unlock(src); out: if (!rc) folio_putback_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); /* * If migration was not successful and there's a freeing callback, * return the folio to that special allocator. Otherwise, simply drop * our additional reference. */ if (put_new_folio) put_new_folio(dst, private); else folio_put(dst); return rc; } static inline int try_split_folio(struct folio *folio, struct list_head *split_folios, enum migrate_mode mode) { int rc; if (mode == MIGRATE_ASYNC) { if (!folio_trylock(folio)) return -EAGAIN; } else { folio_lock(folio); } rc = split_folio_to_list(folio, split_folios); folio_unlock(folio); if (!rc) list_move_tail(&folio->lru, split_folios); return rc; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR #else #define NR_MAX_BATCHED_MIGRATION 512 #endif #define NR_MAX_MIGRATE_PAGES_RETRY 10 #define NR_MAX_MIGRATE_ASYNC_RETRY 3 #define NR_MAX_MIGRATE_SYNC_RETRY \ (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY) struct migrate_pages_stats { int nr_succeeded; /* Normal and large folios migrated successfully, in units of base pages */ int nr_failed_pages; /* Normal and large folios failed to be migrated, in units of base pages. Untried folios aren't counted */ int nr_thp_succeeded; /* THP migrated successfully */ int nr_thp_failed; /* THP failed to be migrated */ int nr_thp_split; /* THP split before migrating */ int nr_split; /* Large folio (include THP) split before migrating */ }; /* * Returns the number of hugetlb folios that were not migrated, or an error code * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable * any more because the list has become empty or no retryable hugetlb folios * exist any more. It is caller's responsibility to call putback_movable_pages() * only if ret != 0. */ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, struct migrate_pages_stats *stats, struct list_head *ret_folios) { int retry = 1; int nr_failed = 0; int nr_retry_pages = 0; int pass = 0; struct folio *folio, *folio2; int rc, nr_pages; for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) { retry = 0; nr_retry_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { if (!folio_test_hugetlb(folio)) continue; nr_pages = folio_nr_pages(folio); cond_resched(); /* * Migratability of hugepages depends on architectures and * their size. This check is necessary because some callers * of hugepage migration like soft offline and memory * hotremove don't walk through page tables or check whether * the hugepage is pmd-based or not before kicking migration. */ if (!hugepage_migration_supported(folio_hstate(folio))) { nr_failed++; stats->nr_failed_pages += nr_pages; list_move_tail(&folio->lru, ret_folios); continue; } rc = unmap_and_move_huge_page(get_new_folio, put_new_folio, private, folio, pass > 2, mode, reason, ret_folios); /* * The rules are: * 0: hugetlb folio will be put back * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list */ switch(rc) { case -ENOMEM: /* * When memory is low, don't bother to try to migrate * other folios, just exit. */ stats->nr_failed_pages += nr_pages + nr_retry_pages; return -ENOMEM; case -EAGAIN: retry++; nr_retry_pages += nr_pages; break; case 0: stats->nr_succeeded += nr_pages; break; default: /* * Permanent failure (-EBUSY, etc.): * unlike -EAGAIN case, the failed folio is * removed from migration folio list and not * retried in the next outer loop. */ nr_failed++; stats->nr_failed_pages += nr_pages; break; } } } /* * nr_failed is number of hugetlb folios failed to be migrated. After * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb * folios as failed. */ nr_failed += retry; stats->nr_failed_pages += nr_retry_pages; return nr_failed; } static void migrate_folios_move(struct list_head *src_folios, struct list_head *dst_folios, free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, struct list_head *ret_folios, struct migrate_pages_stats *stats, int *retry, int *thp_retry, int *nr_failed, int *nr_retry_pages) { struct folio *folio, *folio2, *dst, *dst2; bool is_thp; int nr_pages; int rc; dst = list_first_entry(dst_folios, struct folio, lru); dst2 = list_next_entry(dst, lru); list_for_each_entry_safe(folio, folio2, src_folios, lru) { is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); cond_resched(); rc = migrate_folio_move(put_new_folio, private, folio, dst, mode, reason, ret_folios); /* * The rules are: * 0: folio will be freed * -EAGAIN: stay on the unmap_folios list * Other errno: put on ret_folios list */ switch (rc) { case -EAGAIN: *retry += 1; *thp_retry += is_thp; *nr_retry_pages += nr_pages; break; case 0: stats->nr_succeeded += nr_pages; stats->nr_thp_succeeded += is_thp; break; default: *nr_failed += 1; stats->nr_thp_failed += is_thp; stats->nr_failed_pages += nr_pages; break; } dst = dst2; dst2 = list_next_entry(dst, lru); } } static void migrate_folios_undo(struct list_head *src_folios, struct list_head *dst_folios, free_folio_t put_new_folio, unsigned long private, struct list_head *ret_folios) { struct folio *folio, *folio2, *dst, *dst2; dst = list_first_entry(dst_folios, struct folio, lru); dst2 = list_next_entry(dst, lru); list_for_each_entry_safe(folio, folio2, src_folios, lru) { int old_page_state = 0; struct anon_vma *anon_vma = NULL; __migrate_folio_extract(dst, &old_page_state, &anon_vma); migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, anon_vma, true, ret_folios); list_del(&dst->lru); migrate_folio_undo_dst(dst, true, put_new_folio, private); dst = dst2; dst2 = list_next_entry(dst, lru); } } /* * migrate_pages_batch() first unmaps folios in the from list as many as * possible, then move the unmapped folios. * * We only batch migration if mode == MIGRATE_ASYNC to avoid to wait a * lock or bit when we have locked more than one folio. Which may cause * deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the * length of the from list must be <= 1. */ static int migrate_pages_batch(struct list_head *from, new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, struct list_head *ret_folios, struct list_head *split_folios, struct migrate_pages_stats *stats, int nr_pass) { int retry = 1; int thp_retry = 1; int nr_failed = 0; int nr_retry_pages = 0; int pass = 0; bool is_thp = false; bool is_large = false; struct folio *folio, *folio2, *dst = NULL; int rc, rc_saved = 0, nr_pages; LIST_HEAD(unmap_folios); LIST_HEAD(dst_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC && !list_empty(from) && !list_is_singular(from)); for (pass = 0; pass < nr_pass && retry; pass++) { retry = 0; thp_retry = 0; nr_retry_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { is_large = folio_test_large(folio); is_thp = folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); cond_resched(); /* * The rare folio on the deferred split list should * be split now. It should not count as a failure: * but increment nr_failed because, without doing so, * migrate_pages() may report success with (split but * unmigrated) pages still on its fromlist; whereas it * always reports success when its fromlist is empty. * stats->nr_thp_failed should be increased too, * otherwise stats inconsistency will happen when * migrate_pages_batch is called via migrate_pages() * with MIGRATE_SYNC and MIGRATE_ASYNC. * * Only check it without removing it from the list. * Since the folio can be on deferred_split_scan() * local list and removing it can cause the local list * corruption. Folio split process below can handle it * with the help of folio_ref_freeze(). * * nr_pages > 2 is needed to avoid checking order-1 * page cache folios. They exist, in contrast to * non-existent order-1 anonymous folios, and do not * use _deferred_list. */ if (nr_pages > 2 && !list_empty(&folio->_deferred_list) && folio_test_partially_mapped(folio)) { if (!try_split_folio(folio, split_folios, mode)) { nr_failed++; stats->nr_thp_failed += is_thp; stats->nr_thp_split += is_thp; stats->nr_split++; continue; } } /* * Large folio migration might be unsupported or * the allocation might be failed so we should retry * on the same folio with the large folio split * to normal folios. * * Split folios are put in split_folios, and * we will migrate them after the rest of the * list is processed. */ if (!thp_migration_supported() && is_thp) { nr_failed++; stats->nr_thp_failed++; if (!try_split_folio(folio, split_folios, mode)) { stats->nr_thp_split++; stats->nr_split++; continue; } stats->nr_failed_pages += nr_pages; list_move_tail(&folio->lru, ret_folios); continue; } /* * If we are holding the last folio reference, the folio * was freed from under us, so just drop our reference. */ if (likely(!page_has_movable_ops(&folio->page)) && folio_ref_count(folio) == 1) { folio_clear_active(folio); folio_clear_unevictable(folio); list_del(&folio->lru); migrate_folio_done(folio, reason); stats->nr_succeeded += nr_pages; stats->nr_thp_succeeded += is_thp; continue; } rc = migrate_folio_unmap(get_new_folio, put_new_folio, private, folio, &dst, mode, ret_folios); /* * The rules are: * 0: folio will be put on unmap_folios list, * dst folio put on dst_folios list * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list */ switch(rc) { case -ENOMEM: /* * When memory is low, don't bother to try to migrate * other folios, move unmapped folios, then exit. */ nr_failed++; stats->nr_thp_failed += is_thp; /* Large folio NUMA faulting doesn't split to retry. */ if (is_large && !nosplit) { int ret = try_split_folio(folio, split_folios, mode); if (!ret) { stats->nr_thp_split += is_thp; stats->nr_split++; break; } else if (reason == MR_LONGTERM_PIN && ret == -EAGAIN) { /* * Try again to split large folio to * mitigate the failure of longterm pinning. */ retry++; thp_retry += is_thp; nr_retry_pages += nr_pages; /* Undo duplicated failure counting. */ nr_failed--; stats->nr_thp_failed -= is_thp; break; } } stats->nr_failed_pages += nr_pages + nr_retry_pages; /* nr_failed isn't updated for not used */ stats->nr_thp_failed += thp_retry; rc_saved = rc; if (list_empty(&unmap_folios)) goto out; else goto move; case -EAGAIN: retry++; thp_retry += is_thp; nr_retry_pages += nr_pages; break; case 0: list_move_tail(&folio->lru, &unmap_folios); list_add_tail(&dst->lru, &dst_folios); break; default: /* * Permanent failure (-EBUSY, etc.): * unlike -EAGAIN case, the failed folio is * removed from migration folio list and not * retried in the next outer loop. */ nr_failed++; stats->nr_thp_failed += is_thp; stats->nr_failed_pages += nr_pages; break; } } } nr_failed += retry; stats->nr_thp_failed += thp_retry; stats->nr_failed_pages += nr_retry_pages; move: /* Flush TLBs for all unmapped folios */ try_to_unmap_flush(); retry = 1; for (pass = 0; pass < nr_pass && retry; pass++) { retry = 0; thp_retry = 0; nr_retry_pages = 0; /* Move the unmapped folios */ migrate_folios_move(&unmap_folios, &dst_folios, put_new_folio, private, mode, reason, ret_folios, stats, &retry, &thp_retry, &nr_failed, &nr_retry_pages); } nr_failed += retry; stats->nr_thp_failed += thp_retry; stats->nr_failed_pages += nr_retry_pages; rc = rc_saved ? : nr_failed; out: /* Cleanup remaining folios */ migrate_folios_undo(&unmap_folios, &dst_folios, put_new_folio, private, ret_folios); return rc; } static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, struct list_head *ret_folios, struct list_head *split_folios, struct migrate_pages_stats *stats) { int rc, nr_failed = 0; LIST_HEAD(folios); struct migrate_pages_stats astats; memset(&astats, 0, sizeof(astats)); /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */ rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC, reason, &folios, split_folios, &astats, NR_MAX_MIGRATE_ASYNC_RETRY); stats->nr_succeeded += astats.nr_succeeded; stats->nr_thp_succeeded += astats.nr_thp_succeeded; stats->nr_thp_split += astats.nr_thp_split; stats->nr_split += astats.nr_split; if (rc < 0) { stats->nr_failed_pages += astats.nr_failed_pages; stats->nr_thp_failed += astats.nr_thp_failed; list_splice_tail(&folios, ret_folios); return rc; } stats->nr_thp_failed += astats.nr_thp_split; /* * Do not count rc, as pages will be retried below. * Count nr_split only, since it includes nr_thp_split. */ nr_failed += astats.nr_split; /* * Fall back to migrate all failed folios one by one synchronously. All * failed folios except split THPs will be retried, so their failure * isn't counted */ list_splice_tail_init(&folios, from); while (!list_empty(from)) { list_move(from->next, &folios); rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, private, mode, reason, ret_folios, split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY); list_splice_tail_init(&folios, ret_folios); if (rc < 0) return rc; nr_failed += rc; } return nr_failed; } /* * migrate_pages - migrate the folios specified in a list, to the free folios * supplied as the target for the page migration * * @from: The list of folios to be migrated. * @get_new_folio: The function used to allocate free folios to be used * as the target of the folio migration. * @put_new_folio: The function used to free target folios if migration * fails, or NULL if no special handling is necessary. * @private: Private data to be passed on to get_new_folio() * @mode: The migration mode that specifies the constraints for * folio migration, if any. * @reason: The reason for folio migration. * @ret_succeeded: Set to the number of folios migrated successfully if * the caller passes a non-NULL pointer. * * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios * are movable any more because the list has become empty or no retryable folios * exist any more. It is caller's responsibility to call putback_movable_pages() * only if ret != 0. * * Returns the number of {normal folio, large folio, hugetlb} that were not * migrated, or an error code. The number of large folio splits will be * considered as the number of non-migrated large folio, no matter how many * split folios of the large folio are migrated successfully. */ int migrate_pages(struct list_head *from, new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int rc, rc_gather; int nr_pages; struct folio *folio, *folio2; LIST_HEAD(folios); LIST_HEAD(ret_folios); LIST_HEAD(split_folios); struct migrate_pages_stats stats; trace_mm_migrate_pages_start(mode, reason); memset(&stats, 0, sizeof(stats)); rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private, mode, reason, &stats, &ret_folios); if (rc_gather < 0) goto out; again: nr_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { /* Retried hugetlb folios will be kept in list */ if (folio_test_hugetlb(folio)) { list_move_tail(&folio->lru, &ret_folios); continue; } nr_pages += folio_nr_pages(folio); if (nr_pages >= NR_MAX_BATCHED_MIGRATION) break; } if (nr_pages >= NR_MAX_BATCHED_MIGRATION) list_cut_before(&folios, from, &folio2->lru); else list_splice_init(from, &folios); if (mode == MIGRATE_ASYNC) rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, private, mode, reason, &ret_folios, &split_folios, &stats, NR_MAX_MIGRATE_PAGES_RETRY); else rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio, private, mode, reason, &ret_folios, &split_folios, &stats); list_splice_tail_init(&folios, &ret_folios); if (rc < 0) { rc_gather = rc; list_splice_tail(&split_folios, &ret_folios); goto out; } if (!list_empty(&split_folios)) { /* * Failure isn't counted since all split folios of a large folio * is counted as 1 failure already. And, we only try to migrate * with minimal effort, force MIGRATE_ASYNC mode and retry once. */ migrate_pages_batch(&split_folios, get_new_folio, put_new_folio, private, MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1); list_splice_tail_init(&split_folios, &ret_folios); } rc_gather += rc; if (!list_empty(from)) goto again; out: /* * Put the permanent failure folio back to migration list, they * will be put back to the right list by the caller. */ list_splice(&ret_folios, from); /* * Return 0 in case all split folios of fail-to-migrate large folios * are migrated successfully. */ if (list_empty(from)) rc_gather = 0; count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded); count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded); count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed); count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split); trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages, stats.nr_thp_succeeded, stats.nr_thp_failed, stats.nr_thp_split, stats.nr_split, mode, reason); if (ret_succeeded) *ret_succeeded = stats.nr_succeeded; return rc_gather; } struct folio *alloc_migration_target(struct folio *src, unsigned long private) { struct migration_target_control *mtc; gfp_t gfp_mask; unsigned int order = 0; int nid; enum zone_type zidx; mtc = (struct migration_target_control *)private; gfp_mask = mtc->gfp_mask; nid = mtc->nid; if (nid == NUMA_NO_NODE) nid = folio_nid(src); if (folio_test_hugetlb(src)) { struct hstate *h = folio_hstate(src); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); return alloc_hugetlb_folio_nodemask(h, nid, mtc->nmask, gfp_mask, htlb_allow_alloc_fallback(mtc->reason)); } if (folio_test_large(src)) { /* * clear __GFP_RECLAIM to make the migration callback * consistent with regular THP allocations. */ gfp_mask &= ~__GFP_RECLAIM; gfp_mask |= GFP_TRANSHUGE; order = folio_order(src); } zidx = folio_zonenum(src); if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) gfp_mask |= __GFP_HIGHMEM; return __folio_alloc(gfp_mask, order, nid, mtc->nmask); } #ifdef CONFIG_NUMA static int store_status(int __user *status, int start, int value, int nr) { while (nr-- > 0) { if (put_user(value, status + start)) return -EFAULT; start++; } return 0; } static int do_move_pages_to_node(struct list_head *pagelist, int node) { int err; struct migration_target_control mtc = { .nid = node, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, .reason = MR_SYSCALL, }; err = migrate_pages(pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(pagelist); return err; } static int __add_folio_for_migration(struct folio *folio, int node, struct list_head *pagelist, bool migrate_all) { if (is_zero_folio(folio) || is_huge_zero_folio(folio)) return -EFAULT; if (folio_is_zone_device(folio)) return -ENOENT; if (folio_nid(folio) == node) return 0; if (folio_maybe_mapped_shared(folio) && !migrate_all) return -EACCES; if (folio_test_hugetlb(folio)) { if (folio_isolate_hugetlb(folio, pagelist)) return 1; } else if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, pagelist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); return 1; } return -EBUSY; } /* * Resolves the given address to a struct folio, isolates it from the LRU and * puts it to the given pagelist. * Returns: * errno - if the folio cannot be found/isolated * 0 - when it doesn't have to be migrated because it is already on the * target node * 1 - when it has been queued */ static int add_folio_for_migration(struct mm_struct *mm, const void __user *p, int node, struct list_head *pagelist, bool migrate_all) { struct vm_area_struct *vma; struct folio_walk fw; struct folio *folio; unsigned long addr; int err = -EFAULT; mmap_read_lock(mm); addr = (unsigned long)untagged_addr_remote(mm, p); vma = vma_lookup(mm, addr); if (vma && vma_migratable(vma)) { folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE); if (folio) { err = __add_folio_for_migration(folio, node, pagelist, migrate_all); folio_walk_end(&fw, vma); } else { err = -ENOENT; } } mmap_read_unlock(mm); return err; } static int move_pages_and_store_status(int node, struct list_head *pagelist, int __user *status, int start, int i, unsigned long nr_pages) { int err; if (list_empty(pagelist)) return 0; err = do_move_pages_to_node(pagelist, node); if (err) { /* * Positive err means the number of failed * pages to migrate. Since we are going to * abort and return the number of non-migrated * pages, so need to include the rest of the * nr_pages that have not been attempted as * well. */ if (err > 0) err += nr_pages - i; return err; } return store_status(status, start, node, i - start); } /* * Migrate an array of page address onto an array of nodes and fill * the corresponding array of status. */ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, unsigned long nr_pages, const void __user * __user *pages, const int __user *nodes, int __user *status, int flags) { compat_uptr_t __user *compat_pages = (void __user *)pages; int current_node = NUMA_NO_NODE; LIST_HEAD(pagelist); int start, i; int err = 0, err1; lru_cache_disable(); for (i = start = 0; i < nr_pages; i++) { const void __user *p; int node; err = -EFAULT; if (in_compat_syscall()) { compat_uptr_t cp; if (get_user(cp, compat_pages + i)) goto out_flush; p = compat_ptr(cp); } else { if (get_user(p, pages + i)) goto out_flush; } if (get_user(node, nodes + i)) goto out_flush; err = -ENODEV; if (node < 0 || node >= MAX_NUMNODES) goto out_flush; if (!node_state(node, N_MEMORY)) goto out_flush; err = -EACCES; if (!node_isset(node, task_nodes)) goto out_flush; if (current_node == NUMA_NO_NODE) { current_node = node; start = i; } else if (node != current_node) { err = move_pages_and_store_status(current_node, &pagelist, status, start, i, nr_pages); if (err) goto out; start = i; current_node = node; } /* * Errors in the page lookup or isolation are not fatal and we simply * report them via status */ err = add_folio_for_migration(mm, p, current_node, &pagelist, flags & MPOL_MF_MOVE_ALL); if (err > 0) { /* The page is successfully queued for migration */ continue; } /* * If the page is already on the target node (!err), store the * node, otherwise, store the err. */ err = store_status(status, i, err ? : current_node, 1); if (err) goto out_flush; err = move_pages_and_store_status(current_node, &pagelist, status, start, i, nr_pages); if (err) { /* We have accounted for page i */ if (err > 0) err--; goto out; } current_node = NUMA_NO_NODE; } out_flush: /* Make sure we do not overwrite the existing error */ err1 = move_pages_and_store_status(current_node, &pagelist, status, start, i, nr_pages); if (err >= 0) err = err1; out: lru_cache_enable(); return err; } /* * Determine the nodes of an array of pages and store it in an array of status. */ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, const void __user **pages, int *status) { unsigned long i; mmap_read_lock(mm); for (i = 0; i < nr_pages; i++) { unsigned long addr = (unsigned long)(*pages); struct vm_area_struct *vma; struct folio_walk fw; struct folio *folio; int err = -EFAULT; vma = vma_lookup(mm, addr); if (!vma) goto set_status; folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE); if (folio) { if (is_zero_folio(folio) || is_huge_zero_folio(folio)) err = -EFAULT; else if (folio_is_zone_device(folio)) err = -ENOENT; else err = folio_nid(folio); folio_walk_end(&fw, vma); } else { err = -ENOENT; } set_status: *status = err; pages++; status++; } mmap_read_unlock(mm); } static int get_compat_pages_array(const void __user *chunk_pages[], const void __user * __user *pages, unsigned long chunk_offset, unsigned long chunk_nr) { compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages; compat_uptr_t p; int i; for (i = 0; i < chunk_nr; i++) { if (get_user(p, pages32 + chunk_offset + i)) return -EFAULT; chunk_pages[i] = compat_ptr(p); } return 0; } /* * Determine the nodes of a user array of pages and store it in * a user array of status. */ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, const void __user * __user *pages, int __user *status) { #define DO_PAGES_STAT_CHUNK_NR 16UL const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; int chunk_status[DO_PAGES_STAT_CHUNK_NR]; unsigned long chunk_offset = 0; while (nr_pages) { unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR); if (in_compat_syscall()) { if (get_compat_pages_array(chunk_pages, pages, chunk_offset, chunk_nr)) break; } else { if (copy_from_user(chunk_pages, pages + chunk_offset, chunk_nr * sizeof(*chunk_pages))) break; } do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); if (copy_to_user(status + chunk_offset, chunk_status, chunk_nr * sizeof(*status))) break; chunk_offset += chunk_nr; nr_pages -= chunk_nr; } return nr_pages ? -EFAULT : 0; } static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) { struct task_struct *task; struct mm_struct *mm; /* * There is no need to check if current process has the right to modify * the specified process when they are same. */ if (!pid) { mmget(current->mm); *mem_nodes = cpuset_mems_allowed(current); return current->mm; } task = find_get_task_by_vpid(pid); if (!task) { return ERR_PTR(-ESRCH); } /* * Check if this process has the right to modify the specified * process. Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { mm = ERR_PTR(-EPERM); goto out; } mm = ERR_PTR(security_task_movememory(task)); if (IS_ERR(mm)) goto out; *mem_nodes = cpuset_mems_allowed(task); mm = get_task_mm(task); out: put_task_struct(task); if (!mm) mm = ERR_PTR(-EINVAL); return mm; } /* * Move a list of pages in the address space of the currently executing * process. */ static int kernel_move_pages(pid_t pid, unsigned long nr_pages, const void __user * __user *pages, const int __user *nodes, int __user *status, int flags) { struct mm_struct *mm; int err; nodemask_t task_nodes; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; mm = find_mm_struct(pid, &task_nodes); if (IS_ERR(mm)) return PTR_ERR(mm); if (nodes) err = do_pages_move(mm, task_nodes, nr_pages, pages, nodes, status, flags); else err = do_pages_stat(mm, nr_pages, pages, status); mmput(mm); return err; } SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, const void __user * __user *, pages, const int __user *, nodes, int __user *, status, int, flags) { return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); } #ifdef CONFIG_NUMA_BALANCING /* * Returns true if this is a safe migration target node for misplaced NUMA * pages. Currently it only checks the watermarks which is crude. */ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, unsigned long nr_migrate_pages) { int z; for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z; if (!managed_zone(zone)) continue; /* Avoid waking kswapd by allocating pages_to_migrate pages. */ if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone) + nr_migrate_pages, ZONE_MOVABLE, ALLOC_CMA)) continue; return true; } return false; } static struct folio *alloc_misplaced_dst_folio(struct folio *src, unsigned long data) { int nid = (int) data; int order = folio_order(src); gfp_t gfp = __GFP_THISNODE; if (order > 0) gfp |= GFP_TRANSHUGE_LIGHT; else { gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; gfp &= ~__GFP_RECLAIM; } return __folio_alloc_node(gfp, order, nid); } /* * Prepare for calling migrate_misplaced_folio() by isolating the folio if * permitted. Must be called with the PTL still held. */ int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node) { int nr_pages = folio_nr_pages(folio); pg_data_t *pgdat = NODE_DATA(node); if (folio_is_file_lru(folio)) { /* * Do not migrate file folios that are mapped in multiple * processes with execute permissions as they are probably * shared libraries. * * See folio_maybe_mapped_shared() on possible imprecision * when we cannot easily detect if a folio is shared. */ if ((vma->vm_flags & VM_EXEC) && folio_maybe_mapped_shared(folio)) return -EACCES; /* * Do not migrate dirty folios as not all filesystems can move * dirty folios in MIGRATE_ASYNC mode which is a waste of * cycles. */ if (folio_test_dirty(folio)) return -EAGAIN; } /* Avoid migrating to a node that is nearly full */ if (!migrate_balanced_pgdat(pgdat, nr_pages)) { int z; if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)) return -EAGAIN; for (z = pgdat->nr_zones - 1; z >= 0; z--) { if (managed_zone(pgdat->node_zones + z)) break; } /* * If there are no managed zones, it should not proceed * further. */ if (z < 0) return -EAGAIN; wakeup_kswapd(pgdat->node_zones + z, 0, folio_order(folio), ZONE_MOVABLE); return -EAGAIN; } if (!folio_isolate_lru(folio)) return -EAGAIN; node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), nr_pages); return 0; } /* * Attempt to migrate a misplaced folio to the specified destination * node. Caller is expected to have isolated the folio by calling * migrate_misplaced_folio_prepare(), which will result in an * elevated reference count on the folio. This function will un-isolate the * folio, dereferencing the folio before returning. */ int migrate_misplaced_folio(struct folio *folio, int node) { pg_data_t *pgdat = NODE_DATA(node); int nr_remaining; unsigned int nr_succeeded; LIST_HEAD(migratepages); struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); list_add(&folio->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio, NULL, node, MIGRATE_ASYNC, MR_NUMA_MISPLACED, &nr_succeeded); if (nr_remaining && !list_empty(&migratepages)) putback_movable_pages(&migratepages); if (nr_succeeded) { count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded); count_memcg_events(memcg, NUMA_PAGE_MIGRATE, nr_succeeded); if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && !node_is_toptier(folio_nid(folio)) && node_is_toptier(node)) mod_lruvec_state(lruvec, PGPROMOTE_SUCCESS, nr_succeeded); } mem_cgroup_put(memcg); BUG_ON(!list_empty(&migratepages)); return nr_remaining ? -EAGAIN : 0; } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 /* SPDX-License-Identifier: GPL-2.0 */ /* * ethtool.h: Defines for Linux ethtool. * * Copyright (C) 1998 David S. Miller (davem@redhat.com) * Copyright 2001 Jeff Garzik <jgarzik@pobox.com> * Portions Copyright 2001 Sun Microsystems (thockin@sun.com) * Portions Copyright 2002 Intel (eli.kupermann@intel.com, * christopher.leech@intel.com, * scott.feldman@intel.com) * Portions Copyright (C) Sun Microsystems 2008 */ #ifndef _LINUX_ETHTOOL_H #define _LINUX_ETHTOOL_H #include <linux/bitmap.h> #include <linux/compat.h> #include <linux/if_ether.h> #include <linux/netlink.h> #include <linux/timer_types.h> #include <uapi/linux/ethtool.h> #include <uapi/linux/ethtool_netlink_generated.h> #include <uapi/linux/net_tstamp.h> #define ETHTOOL_MM_MAX_VERIFY_TIME_MS 128 #define ETHTOOL_MM_MAX_VERIFY_RETRIES 3 struct compat_ethtool_rx_flow_spec { u32 flow_type; union ethtool_flow_union h_u; struct ethtool_flow_ext h_ext; union ethtool_flow_union m_u; struct ethtool_flow_ext m_ext; compat_u64 ring_cookie; u32 location; }; struct compat_ethtool_rxnfc { u32 cmd; u32 flow_type; compat_u64 data; struct compat_ethtool_rx_flow_spec fs; u32 rule_cnt; u32 rule_locs[]; }; #include <linux/rculist.h> /** * enum ethtool_phys_id_state - indicator state for physical identification * @ETHTOOL_ID_INACTIVE: Physical ID indicator should be deactivated * @ETHTOOL_ID_ACTIVE: Physical ID indicator should be activated * @ETHTOOL_ID_ON: LED should be turned on (used iff %ETHTOOL_ID_ACTIVE * is not supported) * @ETHTOOL_ID_OFF: LED should be turned off (used iff %ETHTOOL_ID_ACTIVE * is not supported) */ enum ethtool_phys_id_state { ETHTOOL_ID_INACTIVE, ETHTOOL_ID_ACTIVE, ETHTOOL_ID_ON, ETHTOOL_ID_OFF }; enum { ETH_RSS_HASH_TOP_BIT, /* Configurable RSS hash function - Toeplitz */ ETH_RSS_HASH_XOR_BIT, /* Configurable RSS hash function - Xor */ ETH_RSS_HASH_CRC32_BIT, /* Configurable RSS hash function - Crc32 */ /* * Add your fresh new hash function bits above and remember to update * rss_hash_func_strings[] in ethtool.c */ ETH_RSS_HASH_FUNCS_COUNT }; /** * struct kernel_ethtool_ringparam - RX/TX ring configuration * @rx_buf_len: Current length of buffers on the rx ring. * @tcp_data_split: Scatter packet headers and data to separate buffers * @tx_push: The flag of tx push mode * @rx_push: The flag of rx push mode * @cqe_size: Size of TX/RX completion queue event * @tx_push_buf_len: Size of TX push buffer * @tx_push_buf_max_len: Maximum allowed size of TX push buffer * @hds_thresh: Packet size threshold for header data split (HDS) * @hds_thresh_max: Maximum supported setting for @hds_threshold * */ struct kernel_ethtool_ringparam { u32 rx_buf_len; u8 tcp_data_split; u8 tx_push; u8 rx_push; u32 cqe_size; u32 tx_push_buf_len; u32 tx_push_buf_max_len; u32 hds_thresh; u32 hds_thresh_max; }; /** * enum ethtool_supported_ring_param - indicator caps for setting ring params * @ETHTOOL_RING_USE_RX_BUF_LEN: capture for setting rx_buf_len * @ETHTOOL_RING_USE_CQE_SIZE: capture for setting cqe_size * @ETHTOOL_RING_USE_TX_PUSH: capture for setting tx_push * @ETHTOOL_RING_USE_RX_PUSH: capture for setting rx_push * @ETHTOOL_RING_USE_TX_PUSH_BUF_LEN: capture for setting tx_push_buf_len * @ETHTOOL_RING_USE_TCP_DATA_SPLIT: capture for setting tcp_data_split * @ETHTOOL_RING_USE_HDS_THRS: capture for setting header-data-split-thresh */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), ETHTOOL_RING_USE_CQE_SIZE = BIT(1), ETHTOOL_RING_USE_TX_PUSH = BIT(2), ETHTOOL_RING_USE_RX_PUSH = BIT(3), ETHTOOL_RING_USE_TX_PUSH_BUF_LEN = BIT(4), ETHTOOL_RING_USE_TCP_DATA_SPLIT = BIT(5), ETHTOOL_RING_USE_HDS_THRS = BIT(6), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) #define __ETH_RSS_HASH(name) __ETH_RSS_HASH_BIT(ETH_RSS_HASH_##name##_BIT) #define ETH_RSS_HASH_TOP __ETH_RSS_HASH(TOP) #define ETH_RSS_HASH_XOR __ETH_RSS_HASH(XOR) #define ETH_RSS_HASH_CRC32 __ETH_RSS_HASH(CRC32) #define ETH_RSS_HASH_UNKNOWN 0 #define ETH_RSS_HASH_NO_CHANGE 0 struct net_device; struct netlink_ext_ack; /* Link extended state and substate. */ struct ethtool_link_ext_state_info { enum ethtool_link_ext_state link_ext_state; union { enum ethtool_link_ext_substate_autoneg autoneg; enum ethtool_link_ext_substate_link_training link_training; enum ethtool_link_ext_substate_link_logical_mismatch link_logical_mismatch; enum ethtool_link_ext_substate_bad_signal_integrity bad_signal_integrity; enum ethtool_link_ext_substate_cable_issue cable_issue; enum ethtool_link_ext_substate_module module; u32 __link_ext_substate; }; }; struct ethtool_link_ext_stats { /* Custom Linux statistic for PHY level link down events. * In a simpler world it should be equal to netdev->carrier_down_count * unfortunately netdev also counts local reconfigurations which don't * actually take the physical link down, not to mention NC-SI which, * if present, keeps the link up regardless of host state. * This statistic counts when PHY _actually_ went down, or lost link. * * Note that we need u64 for ethtool_stats_init() and comparisons * to ETHTOOL_STAT_NOT_SET, but only u32 is exposed to the user. */ u64 link_down_events; }; /** * ethtool_rxfh_indir_default - get default value for RX flow hash indirection * @index: Index in RX flow hash indirection table * @n_rx_rings: Number of RX rings to use * * This function provides the default policy for RX flow hash indirection. */ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) { return index % n_rx_rings; } /** * struct ethtool_rxfh_context - a custom RSS context configuration * @indir_size: Number of u32 entries in indirection table * @key_size: Size of hash key, in bytes * @priv_size: Size of driver private data, in bytes * @hfunc: RSS hash function identifier. One of the %ETH_RSS_HASH_* * @input_xfrm: Defines how the input data is transformed. Valid values are one * of %RXH_XFRM_*. * @indir_configured: indir has been specified (at create time or subsequently) * @key_configured: hkey has been specified (at create time or subsequently) */ struct ethtool_rxfh_context { u32 indir_size; u32 key_size; u16 priv_size; u8 hfunc; u8 input_xfrm; u8 indir_configured:1; u8 key_configured:1; /* private: driver private data, indirection table, and hash key are * stored sequentially in @data area. Use below helpers to access. */ u32 key_off; u8 data[] __aligned(sizeof(void *)); }; static inline void *ethtool_rxfh_context_priv(struct ethtool_rxfh_context *ctx) { return ctx->data; } static inline u32 *ethtool_rxfh_context_indir(struct ethtool_rxfh_context *ctx) { return (u32 *)(ctx->data + ALIGN(ctx->priv_size, sizeof(u32))); } static inline u8 *ethtool_rxfh_context_key(struct ethtool_rxfh_context *ctx) { return &ctx->data[ctx->key_off]; } void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id); struct link_mode_info { int speed; u8 lanes; u8 duplex; }; extern const struct link_mode_info link_mode_params[]; /* declare a link mode bitmap */ #define __ETHTOOL_DECLARE_LINK_MODE_MASK(name) \ DECLARE_BITMAP(name, __ETHTOOL_LINK_MODE_MASK_NBITS) /* drivers must ignore base.cmd and base.link_mode_masks_nwords * fields, but they are allowed to overwrite them (will be ignored). */ struct ethtool_link_ksettings { struct ethtool_link_settings base; struct { __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising); } link_modes; u32 lanes; }; /** * ethtool_link_ksettings_zero_link_mode - clear link_ksettings link mode mask * @ptr : pointer to struct ethtool_link_ksettings * @name : one of supported/advertising/lp_advertising */ #define ethtool_link_ksettings_zero_link_mode(ptr, name) \ bitmap_zero((ptr)->link_modes.name, __ETHTOOL_LINK_MODE_MASK_NBITS) /** * ethtool_link_ksettings_add_link_mode - set bit in link_ksettings * link mode mask * @ptr : pointer to struct ethtool_link_ksettings * @name : one of supported/advertising/lp_advertising * @mode : one of the ETHTOOL_LINK_MODE_*_BIT * (not atomic, no bound checking) */ #define ethtool_link_ksettings_add_link_mode(ptr, name, mode) \ __set_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) /** * ethtool_link_ksettings_del_link_mode - clear bit in link_ksettings * link mode mask * @ptr : pointer to struct ethtool_link_ksettings * @name : one of supported/advertising/lp_advertising * @mode : one of the ETHTOOL_LINK_MODE_*_BIT * (not atomic, no bound checking) */ #define ethtool_link_ksettings_del_link_mode(ptr, name, mode) \ __clear_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) /** * ethtool_link_ksettings_test_link_mode - test bit in ksettings link mode mask * @ptr : pointer to struct ethtool_link_ksettings * @name : one of supported/advertising/lp_advertising * @mode : one of the ETHTOOL_LINK_MODE_*_BIT * (not atomic, no bound checking) * * Returns: true/false. */ #define ethtool_link_ksettings_test_link_mode(ptr, name, mode) \ test_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) extern int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings); struct ethtool_keee { __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertised); __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertised); u32 tx_lpi_timer; bool tx_lpi_enabled; bool eee_active; bool eee_enabled; }; struct kernel_ethtool_coalesce { u8 use_cqe_mode_tx; u8 use_cqe_mode_rx; u32 tx_aggr_max_bytes; u32 tx_aggr_max_frames; u32 tx_aggr_time_usecs; }; /** * ethtool_intersect_link_masks - Given two link masks, AND them together * @dst: first mask and where result is stored * @src: second mask to intersect with * * Given two link mode masks, AND them together and save the result in dst. */ void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, struct ethtool_link_ksettings *src); void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32); /* return false if src had higher bits set. lower bits always updated. */ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, const unsigned long *src); #define ETHTOOL_COALESCE_RX_USECS BIT(0) #define ETHTOOL_COALESCE_RX_MAX_FRAMES BIT(1) #define ETHTOOL_COALESCE_RX_USECS_IRQ BIT(2) #define ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ BIT(3) #define ETHTOOL_COALESCE_TX_USECS BIT(4) #define ETHTOOL_COALESCE_TX_MAX_FRAMES BIT(5) #define ETHTOOL_COALESCE_TX_USECS_IRQ BIT(6) #define ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ BIT(7) #define ETHTOOL_COALESCE_STATS_BLOCK_USECS BIT(8) #define ETHTOOL_COALESCE_USE_ADAPTIVE_RX BIT(9) #define ETHTOOL_COALESCE_USE_ADAPTIVE_TX BIT(10) #define ETHTOOL_COALESCE_PKT_RATE_LOW BIT(11) #define ETHTOOL_COALESCE_RX_USECS_LOW BIT(12) #define ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW BIT(13) #define ETHTOOL_COALESCE_TX_USECS_LOW BIT(14) #define ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW BIT(15) #define ETHTOOL_COALESCE_PKT_RATE_HIGH BIT(16) #define ETHTOOL_COALESCE_RX_USECS_HIGH BIT(17) #define ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH BIT(18) #define ETHTOOL_COALESCE_TX_USECS_HIGH BIT(19) #define ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH BIT(20) #define ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL BIT(21) #define ETHTOOL_COALESCE_USE_CQE_RX BIT(22) #define ETHTOOL_COALESCE_USE_CQE_TX BIT(23) #define ETHTOOL_COALESCE_TX_AGGR_MAX_BYTES BIT(24) #define ETHTOOL_COALESCE_TX_AGGR_MAX_FRAMES BIT(25) #define ETHTOOL_COALESCE_TX_AGGR_TIME_USECS BIT(26) #define ETHTOOL_COALESCE_RX_PROFILE BIT(27) #define ETHTOOL_COALESCE_TX_PROFILE BIT(28) #define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(28, 0) #define ETHTOOL_COALESCE_USECS \ (ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS) #define ETHTOOL_COALESCE_MAX_FRAMES \ (ETHTOOL_COALESCE_RX_MAX_FRAMES | ETHTOOL_COALESCE_TX_MAX_FRAMES) #define ETHTOOL_COALESCE_USECS_IRQ \ (ETHTOOL_COALESCE_RX_USECS_IRQ | ETHTOOL_COALESCE_TX_USECS_IRQ) #define ETHTOOL_COALESCE_MAX_FRAMES_IRQ \ (ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ | \ ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ) #define ETHTOOL_COALESCE_USE_ADAPTIVE \ (ETHTOOL_COALESCE_USE_ADAPTIVE_RX | ETHTOOL_COALESCE_USE_ADAPTIVE_TX) #define ETHTOOL_COALESCE_USECS_LOW_HIGH \ (ETHTOOL_COALESCE_RX_USECS_LOW | ETHTOOL_COALESCE_TX_USECS_LOW | \ ETHTOOL_COALESCE_RX_USECS_HIGH | ETHTOOL_COALESCE_TX_USECS_HIGH) #define ETHTOOL_COALESCE_MAX_FRAMES_LOW_HIGH \ (ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW | \ ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW | \ ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH | \ ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH) #define ETHTOOL_COALESCE_PKT_RATE_RX_USECS \ (ETHTOOL_COALESCE_USE_ADAPTIVE_RX | \ ETHTOOL_COALESCE_RX_USECS_LOW | ETHTOOL_COALESCE_RX_USECS_HIGH | \ ETHTOOL_COALESCE_PKT_RATE_LOW | ETHTOOL_COALESCE_PKT_RATE_HIGH | \ ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL) #define ETHTOOL_COALESCE_USE_CQE \ (ETHTOOL_COALESCE_USE_CQE_RX | ETHTOOL_COALESCE_USE_CQE_TX) #define ETHTOOL_COALESCE_TX_AGGR \ (ETHTOOL_COALESCE_TX_AGGR_MAX_BYTES | \ ETHTOOL_COALESCE_TX_AGGR_MAX_FRAMES | \ ETHTOOL_COALESCE_TX_AGGR_TIME_USECS) #define ETHTOOL_STAT_NOT_SET (~0ULL) static inline void ethtool_stats_init(u64 *stats, unsigned int n) { while (n--) stats[n] = ETHTOOL_STAT_NOT_SET; } /* Basic IEEE 802.3 MAC statistics (30.3.1.1.*), not otherwise exposed * via a more targeted API. */ struct ethtool_eth_mac_stats { enum ethtool_mac_stats_src src; struct_group(stats, u64 FramesTransmittedOK; u64 SingleCollisionFrames; u64 MultipleCollisionFrames; u64 FramesReceivedOK; u64 FrameCheckSequenceErrors; u64 AlignmentErrors; u64 OctetsTransmittedOK; u64 FramesWithDeferredXmissions; u64 LateCollisions; u64 FramesAbortedDueToXSColls; u64 FramesLostDueToIntMACXmitError; u64 CarrierSenseErrors; u64 OctetsReceivedOK; u64 FramesLostDueToIntMACRcvError; u64 MulticastFramesXmittedOK; u64 BroadcastFramesXmittedOK; u64 FramesWithExcessiveDeferral; u64 MulticastFramesReceivedOK; u64 BroadcastFramesReceivedOK; u64 InRangeLengthErrors; u64 OutOfRangeLengthField; u64 FrameTooLongErrors; ); }; /* Basic IEEE 802.3 PHY statistics (30.3.2.1.*), not otherwise exposed * via a more targeted API. */ struct ethtool_eth_phy_stats { enum ethtool_mac_stats_src src; struct_group(stats, u64 SymbolErrorDuringCarrier; ); }; /** * struct ethtool_phy_stats - PHY-level statistics counters * @rx_packets: Total successfully received frames * @rx_bytes: Total successfully received bytes * @rx_errors: Total received frames with errors (e.g., CRC errors) * @tx_packets: Total successfully transmitted frames * @tx_bytes: Total successfully transmitted bytes * @tx_errors: Total transmitted frames with errors * * This structure provides a standardized interface for reporting * PHY-level statistics counters. It is designed to expose statistics * commonly provided by PHYs but not explicitly defined in the IEEE * 802.3 standard. */ struct ethtool_phy_stats { u64 rx_packets; u64 rx_bytes; u64 rx_errors; u64 tx_packets; u64 tx_bytes; u64 tx_errors; }; /* Basic IEEE 802.3 MAC Ctrl statistics (30.3.3.*), not otherwise exposed * via a more targeted API. */ struct ethtool_eth_ctrl_stats { enum ethtool_mac_stats_src src; struct_group(stats, u64 MACControlFramesTransmitted; u64 MACControlFramesReceived; u64 UnsupportedOpcodesReceived; ); }; /** * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames * @src: input field denoting whether stats should be queried from the eMAC or * pMAC (if the MM layer is supported). To be ignored otherwise. * @tx_pause_frames: transmitted pause frame count. Reported to user space * as %ETHTOOL_A_PAUSE_STAT_TX_FRAMES. * * Equivalent to `30.3.4.2 aPAUSEMACCtrlFramesTransmitted` * from the standard. * * @rx_pause_frames: received pause frame count. Reported to user space * as %ETHTOOL_A_PAUSE_STAT_RX_FRAMES. Equivalent to: * * Equivalent to `30.3.4.3 aPAUSEMACCtrlFramesReceived` * from the standard. */ struct ethtool_pause_stats { enum ethtool_mac_stats_src src; struct_group(stats, u64 tx_pause_frames; u64 rx_pause_frames; ); }; #define ETHTOOL_MAX_LANES 8 /* * IEEE 802.3ck/df defines 16 bins for FEC histogram plus one more for * the end-of-list marker, total 17 items */ #define ETHTOOL_FEC_HIST_MAX 17 /** * struct ethtool_fec_hist_range - error bits range for FEC histogram * statistics * @low: low bound of the bin (inclusive) * @high: high bound of the bin (inclusive) */ struct ethtool_fec_hist_range { u16 low; u16 high; }; struct ethtool_fec_hist { struct ethtool_fec_hist_value { u64 sum; u64 per_lane[ETHTOOL_MAX_LANES]; } values[ETHTOOL_FEC_HIST_MAX]; const struct ethtool_fec_hist_range *ranges; }; /** * struct ethtool_fec_stats - statistics for IEEE 802.3 FEC * @corrected_blocks: number of received blocks corrected by FEC * Reported to user space as %ETHTOOL_A_FEC_STAT_CORRECTED. * * Equivalent to `30.5.1.1.17 aFECCorrectedBlocks` from the standard. * * @uncorrectable_blocks: number of received blocks FEC was not able to correct * Reported to user space as %ETHTOOL_A_FEC_STAT_UNCORR. * * Equivalent to `30.5.1.1.18 aFECUncorrectableBlocks` from the standard. * * @corrected_bits: number of bits corrected by FEC * Similar to @corrected_blocks but counts individual bit changes, * not entire FEC data blocks. This is a non-standard statistic. * Reported to user space as %ETHTOOL_A_FEC_STAT_CORR_BITS. * * For each of the above fields, the two substructure members are: * * - @lanes: per-lane/PCS-instance counts as defined by the standard * - @total: error counts for the entire port, for drivers incapable of reporting * per-lane stats * * Drivers should fill in either only total or per-lane statistics, core * will take care of adding lane values up to produce the total. */ struct ethtool_fec_stats { struct ethtool_fec_stat { u64 total; u64 lanes[ETHTOOL_MAX_LANES]; } corrected_blocks, uncorrectable_blocks, corrected_bits; }; /** * struct ethtool_rmon_hist_range - byte range for histogram statistics * @low: low bound of the bucket (inclusive) * @high: high bound of the bucket (inclusive) */ struct ethtool_rmon_hist_range { u16 low; u16 high; }; #define ETHTOOL_RMON_HIST_MAX 11 /** * struct ethtool_rmon_stats - selected RMON (RFC 2819) statistics * @src: input field denoting whether stats should be queried from the eMAC or * pMAC (if the MM layer is supported). To be ignored otherwise. * @undersize_pkts: Equivalent to `etherStatsUndersizePkts` from the RFC. * @oversize_pkts: Equivalent to `etherStatsOversizePkts` from the RFC. * @fragments: Equivalent to `etherStatsFragments` from the RFC. * @jabbers: Equivalent to `etherStatsJabbers` from the RFC. * @hist: Packet counter for packet length buckets (e.g. * `etherStatsPkts128to255Octets` from the RFC). * @hist_tx: Tx counters in similar form to @hist, not defined in the RFC. * * Selection of RMON (RFC 2819) statistics which are not exposed via different * APIs, primarily the packet-length-based counters. * Unfortunately different designs choose different buckets beyond * the 1024B mark (jumbo frame teritory), so the definition of the bucket * ranges is left to the driver. */ struct ethtool_rmon_stats { enum ethtool_mac_stats_src src; struct_group(stats, u64 undersize_pkts; u64 oversize_pkts; u64 fragments; u64 jabbers; u64 hist[ETHTOOL_RMON_HIST_MAX]; u64 hist_tx[ETHTOOL_RMON_HIST_MAX]; ); }; /** * struct ethtool_ts_stats - HW timestamping statistics * @pkts: Number of packets successfully timestamped by the hardware. * @onestep_pkts_unconfirmed: Number of PTP packets with one-step TX * timestamping that were sent, but for which the * device offers no confirmation whether they made * it onto the wire and the timestamp was inserted * in the originTimestamp or correctionField, or * not. * @lost: Number of hardware timestamping requests where the timestamping * information from the hardware never arrived for submission with * the skb. * @err: Number of arbitrary timestamp generation error events that the * hardware encountered, exclusive of @lost statistics. Cases such * as resource exhaustion, unavailability, firmware errors, and * detected illogical timestamp values not submitted with the skb * are inclusive to this counter. */ struct ethtool_ts_stats { struct_group(tx_stats, u64 pkts; u64 onestep_pkts_unconfirmed; u64 lost; u64 err; ); }; #define ETH_MODULE_EEPROM_PAGE_LEN 128 #define ETH_MODULE_MAX_I2C_ADDRESS 0x7f /** * struct ethtool_module_eeprom - plug-in module EEPROM read / write parameters * @offset: When @offset is 0-127, it is used as an address to the Lower Memory * (@page must be 0). Otherwise, it is used as an address to the * Upper Memory. * @length: Number of bytes to read / write. * @page: Page number. * @bank: Bank number, if supported by EEPROM spec. * @i2c_address: I2C address of a page. Value less than 0x7f expected. Most * EEPROMs use 0x50 or 0x51. * @data: Pointer to buffer with EEPROM data of @length size. */ struct ethtool_module_eeprom { u32 offset; u32 length; u8 page; u8 bank; u8 i2c_address; u8 *data; }; /** * struct ethtool_module_power_mode_params - module power mode parameters * @policy: The power mode policy enforced by the host for the plug-in module. * @mode: The operational power mode of the plug-in module. Should be filled by * device drivers on get operations. */ struct ethtool_module_power_mode_params { enum ethtool_module_power_mode_policy policy; enum ethtool_module_power_mode mode; }; /** * struct ethtool_mm_state - 802.3 MAC merge layer state * @verify_time: * wait time between verification attempts in ms (according to clause * 30.14.1.6 aMACMergeVerifyTime) * @max_verify_time: * maximum accepted value for the @verify_time variable in set requests * @verify_status: * state of the verification state machine of the MM layer (according to * clause 30.14.1.2 aMACMergeStatusVerify) * @tx_enabled: * set if the MM layer is administratively enabled in the TX direction * (according to clause 30.14.1.3 aMACMergeEnableTx) * @tx_active: * set if the MM layer is enabled in the TX direction, which makes FP * possible (according to 30.14.1.5 aMACMergeStatusTx). This should be * true if MM is enabled, and the verification status is either verified, * or disabled. * @pmac_enabled: * set if the preemptible MAC is powered on and is able to receive * preemptible packets and respond to verification frames. * @verify_enabled: * set if the Verify function of the MM layer (which sends SMD-V * verification requests) is administratively enabled (regardless of * whether it is currently in the ETHTOOL_MM_VERIFY_STATUS_DISABLED state * or not), according to clause 30.14.1.4 aMACMergeVerifyDisableTx (but * using positive rather than negative logic). The device should always * respond to received SMD-V requests as long as @pmac_enabled is set. * @tx_min_frag_size: * the minimum size of non-final mPacket fragments that the link partner * supports receiving, expressed in octets. Compared to the definition * from clause 30.14.1.7 aMACMergeAddFragSize which is expressed in the * range 0 to 3 (requiring a translation to the size in octets according * to the formula 64 * (1 + addFragSize) - 4), a value in a continuous and * unbounded range can be specified here. * @rx_min_frag_size: * the minimum size of non-final mPacket fragments that this device * supports receiving, expressed in octets. */ struct ethtool_mm_state { u32 verify_time; u32 max_verify_time; enum ethtool_mm_verify_status verify_status; bool tx_enabled; bool tx_active; bool pmac_enabled; bool verify_enabled; u32 tx_min_frag_size; u32 rx_min_frag_size; }; /** * struct ethtool_mm_cfg - 802.3 MAC merge layer configuration * @verify_time: see struct ethtool_mm_state * @verify_enabled: see struct ethtool_mm_state * @tx_enabled: see struct ethtool_mm_state * @pmac_enabled: see struct ethtool_mm_state * @tx_min_frag_size: see struct ethtool_mm_state */ struct ethtool_mm_cfg { u32 verify_time; bool verify_enabled; bool tx_enabled; bool pmac_enabled; u32 tx_min_frag_size; }; /** * struct ethtool_mm_stats - 802.3 MAC merge layer statistics * @MACMergeFrameAssErrorCount: * received MAC frames with reassembly errors * @MACMergeFrameSmdErrorCount: * received MAC frames/fragments rejected due to unknown or incorrect SMD * @MACMergeFrameAssOkCount: * received MAC frames that were successfully reassembled and passed up * @MACMergeFragCountRx: * number of additional correct SMD-C mPackets received due to preemption * @MACMergeFragCountTx: * number of additional mPackets sent due to preemption * @MACMergeHoldCount: * number of times the MM layer entered the HOLD state, which blocks * transmission of preemptible traffic */ struct ethtool_mm_stats { u64 MACMergeFrameAssErrorCount; u64 MACMergeFrameSmdErrorCount; u64 MACMergeFrameAssOkCount; u64 MACMergeFragCountRx; u64 MACMergeFragCountTx; u64 MACMergeHoldCount; }; enum ethtool_mmsv_event { ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET, ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET, ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET, }; /* MAC Merge verification mPacket type */ enum ethtool_mpacket { ETHTOOL_MPACKET_VERIFY, ETHTOOL_MPACKET_RESPONSE, }; struct ethtool_mmsv; /** * struct ethtool_mmsv_ops - Operations for MAC Merge Software Verification * @configure_tx: Driver callback for the event where the preemptible TX * becomes active or inactive. Preemptible traffic * classes must be committed to hardware only while * preemptible TX is active. * @configure_pmac: Driver callback for the event where the pMAC state * changes as result of an administrative setting * (ethtool) or a call to ethtool_mmsv_link_state_handle(). * @send_mpacket: Driver-provided method for sending a Verify or a Response * mPacket. */ struct ethtool_mmsv_ops { void (*configure_tx)(struct ethtool_mmsv *mmsv, bool tx_active); void (*configure_pmac)(struct ethtool_mmsv *mmsv, bool pmac_enabled); void (*send_mpacket)(struct ethtool_mmsv *mmsv, enum ethtool_mpacket mpacket); }; /** * struct ethtool_mmsv - MAC Merge Software Verification * @ops: operations for MAC Merge Software Verification * @dev: pointer to net_device structure * @lock: serialize access to MAC Merge state between * ethtool requests and link state updates. * @status: current verification FSM state * @verify_timer: timer for verification in local TX direction * @verify_enabled: indicates if verification is enabled * @verify_retries: number of retries for verification * @pmac_enabled: indicates if the preemptible MAC is enabled * @verify_time: time for verification in milliseconds * @tx_enabled: indicates if transmission is enabled */ struct ethtool_mmsv { const struct ethtool_mmsv_ops *ops; struct net_device *dev; spinlock_t lock; enum ethtool_mm_verify_status status; struct timer_list verify_timer; bool verify_enabled; int verify_retries; bool pmac_enabled; u32 verify_time; bool tx_enabled; }; void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv); void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up); void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv, enum ethtool_mmsv_event event); void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_state *state); void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg); void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev, const struct ethtool_mmsv_ops *ops); /** * struct ethtool_rxfh_param - RXFH (RSS) parameters * @hfunc: Defines the current RSS hash function used by HW (or to be set to). * Valid values are one of the %ETH_RSS_HASH_*. * @indir_size: On SET, the array size of the user buffer for the * indirection table, which may be zero, or * %ETH_RXFH_INDIR_NO_CHANGE. On GET (read from the driver), * the array size of the hardware indirection table. * @indir: The indirection table of size @indir_size entries. * @key_size: On SET, the array size of the user buffer for the hash key, * which may be zero. On GET (read from the driver), the size of the * hardware hash key. * @key: The hash key of size @key_size bytes. * @rss_context: RSS context identifier. Context 0 is the default for normal * traffic; other contexts can be referenced as the destination for RX flow * classification rules. On SET, %ETH_RXFH_CONTEXT_ALLOC is used * to allocate a new RSS context; on return this field will * contain the ID of the newly allocated context. * @rss_delete: Set to non-ZERO to remove the @rss_context context. * @input_xfrm: Defines how the input data is transformed. Valid values are one * of %RXH_XFRM_*. */ struct ethtool_rxfh_param { u8 hfunc; u32 indir_size; u32 *indir; u32 key_size; u8 *key; u32 rss_context; u8 rss_delete; u8 input_xfrm; }; /** * struct ethtool_rxfh_fields - Rx Flow Hashing (RXFH) header field config * @data: which header fields are used for hashing, bitmask of RXH_* defines * @flow_type: L2-L4 network traffic flow type * @rss_context: RSS context, will only be used if rxfh_per_ctx_fields is * set in struct ethtool_ops */ struct ethtool_rxfh_fields { u32 data; u32 flow_type; u32 rss_context; }; /** * struct kernel_ethtool_ts_info - kernel copy of struct ethtool_ts_info * @cmd: command number = %ETHTOOL_GET_TS_INFO * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags * @phc_index: device index of the associated PHC, or -1 if there is none * @phc_qualifier: qualifier of the associated PHC * @phc_source: source device of the associated PHC * @phc_phyindex: index of PHY device source of the associated PHC * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values */ struct kernel_ethtool_ts_info { u32 cmd; u32 so_timestamping; int phc_index; enum hwtstamp_provider_qualifier phc_qualifier; enum hwtstamp_source phc_source; int phc_phyindex; u32 tx_types; u32 rx_filters; }; /** * struct ethtool_ops - optional netdev operations * @supported_input_xfrm: supported types of input xfrm from %RXH_XFRM_*. * @cap_link_lanes_supported: indicates if the driver supports lanes * parameter. * @rxfh_per_ctx_fields: device supports selecting different header fields * for Rx hash calculation and RSS for each additional context. * @rxfh_per_ctx_key: device supports setting different RSS key for each * additional context. Netlink API should report hfunc, key, and input_xfrm * for every context, not just context 0. * @cap_rss_rxnfc_adds: device supports nonzero ring_cookie in filters with * %FLOW_RSS flag; the queue ID from the filter is added to the value from * the indirection table to determine the delivery queue. * @rxfh_indir_space: max size of RSS indirection tables, if indirection table * size as returned by @get_rxfh_indir_size may change during lifetime * of the device. Leave as 0 if the table size is constant. * @rxfh_key_space: same as @rxfh_indir_space, but for the key. * @rxfh_priv_size: size of the driver private data area the core should * allocate for an RSS context (in &struct ethtool_rxfh_context). * @rxfh_max_num_contexts: maximum (exclusive) supported RSS context ID. * If this is zero then the core may choose any (nonzero) ID, otherwise * the core will only use IDs strictly less than this value, as the * @rss_context argument to @create_rxfh_context and friends. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. * @supported_hwtstamp_qualifiers: bitfield of supported hwtstamp qualifier. * @get_drvinfo: Report driver/device information. Modern drivers no * longer have to implement this callback. Most fields are * correctly filled in by the core using system information, or * populated using other driver operations. * @get_regs_len: Get buffer length required for @get_regs * @get_regs: Get device registers * @get_wol: Report whether Wake-on-Lan is enabled * @set_wol: Turn Wake-on-Lan on or off. Returns a negative error code * or zero. * @get_msglevel: Report driver message level. This should be the value * of the @msg_enable field used by netif logging functions. * @set_msglevel: Set driver message level * @nway_reset: Restart autonegotiation. Returns a negative error code * or zero. * @get_link: Report whether physical link is up. Will only be called if * the netdev is up. Should usually be set to ethtool_op_get_link(), * which uses netif_carrier_ok(). * @get_link_ext_state: Report link extended state. Should set link_ext_state and * link_ext_substate (link_ext_substate of 0 means link_ext_substate is unknown, * do not attach ext_substate attribute to netlink message). If link_ext_state * and link_ext_substate are unknown, return -ENODATA. If not implemented, * link_ext_state and link_ext_substate will not be sent to userspace. * @get_link_ext_stats: Read extra link-related counters. * @get_eeprom_len: Read range of EEPROM addresses for validation of * @get_eeprom and @set_eeprom requests. * Returns 0 if device does not support EEPROM access. * @get_eeprom: Read data from the device EEPROM. * Should fill in the magic field. Don't need to check len for zero * or wraparound. Fill in the data argument with the eeprom values * from offset to offset + len. Update len to the amount read. * Returns an error or zero. * @set_eeprom: Write data to the device EEPROM. * Should validate the magic field. Don't need to check len for zero * or wraparound. Update len to the amount written. Returns an error * or zero. * @get_coalesce: Get interrupt coalescing parameters. Returns a negative * error code or zero. * @set_coalesce: Set interrupt coalescing parameters. Supported coalescing * types should be set in @supported_coalesce_params. * Returns a negative error code or zero. * @get_ringparam: Report ring sizes * @set_ringparam: Set ring sizes. Returns a negative error code or zero. * @get_pause_stats: Report pause frame statistics. Drivers must not zero * statistics which they don't report. The stats structure is initialized * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. * @get_pauseparam: Report pause parameters * @set_pauseparam: Set pause parameters. Returns a negative error code * or zero. * @self_test: Run specified self-tests * @get_strings: Return a set of strings that describe the requested objects * @set_phys_id: Identify the physical devices, e.g. by flashing an LED * attached to it. The implementation may update the indicator * asynchronously or synchronously, but in either case it must return * quickly. It is initially called with the argument %ETHTOOL_ID_ACTIVE, * and must either activate asynchronous updates and return zero, return * a negative error or return a positive frequency for synchronous * indication (e.g. 1 for one on/off cycle per second). If it returns * a frequency then it will be called again at intervals with the * argument %ETHTOOL_ID_ON or %ETHTOOL_ID_OFF and should set the state of * the indicator accordingly. Finally, it is called with the argument * %ETHTOOL_ID_INACTIVE and must deactivate the indicator. Returns a * negative error code or zero. * @get_ethtool_stats: Return extended statistics about the device. * This is only useful if the device maintains statistics not * included in &struct rtnl_link_stats64. * @begin: Function to be called before any other operation. Returns a * negative error code or zero. * @complete: Function to be called after any other operation except * @begin. Will be called even if the other operation failed. * @get_priv_flags: Report driver-specific feature flags. * @set_priv_flags: Set driver-specific feature flags. Returns a negative * error code or zero. * @get_sset_count: Get number of strings that @get_strings will write. * @get_rxnfc: Get RX flow classification rules. Returns a negative * error code or zero. * @set_rxnfc: Set RX flow classification rules. Returns a negative * error code or zero. * @flash_device: Write a firmware image to device's flash memory. * Returns a negative error code or zero. * @reset: Reset (part of) the device, as specified by a bitmask of * flags from &enum ethtool_reset_flags. Returns a negative * error code or zero. * @get_rx_ring_count: Return the number of RX rings * @get_rxfh_key_size: Get the size of the RX flow hash key. * Returns zero if not supported for this specific device. * @get_rxfh_indir_size: Get the size of the RX flow hash indirection table. * Returns zero if not supported for this specific device. * @get_rxfh: Get the contents of the RX flow hash indirection table, hash key * and/or hash function. * Returns a negative error code or zero. * @set_rxfh: Set the contents of the RX flow hash indirection table, hash * key, and/or hash function. Arguments which are set to %NULL or zero * will remain unchanged. * Returns a negative error code or zero. An error code must be returned * if at least one unsupported change was requested. * @get_rxfh_fields: Get header fields used for flow hashing. * @set_rxfh_fields: Set header fields used for flow hashing. * @create_rxfh_context: Create a new RSS context with the specified RX flow * hash indirection table, hash key, and hash function. * The &struct ethtool_rxfh_context for this context is passed in @ctx; * note that the indir table, hkey and hfunc are not yet populated as * of this call. The driver does not need to update these; the core * will do so if this op succeeds. * However, if @rxfh.indir is set to %NULL, the driver must update the * indir table in @ctx with the (default or inherited) table actually in * use; similarly, if @rxfh.key is %NULL, @rxfh.hfunc is * %ETH_RSS_HASH_NO_CHANGE, or @rxfh.input_xfrm is %RXH_XFRM_NO_CHANGE, * the driver should update the corresponding information in @ctx. * If the driver provides this method, it must also provide * @modify_rxfh_context and @remove_rxfh_context. * Returns a negative error code or zero. * @modify_rxfh_context: Reconfigure the specified RSS context. Allows setting * the contents of the RX flow hash indirection table, hash key, and/or * hash function associated with the given context. * Parameters which are set to %NULL or zero will remain unchanged. * The &struct ethtool_rxfh_context for this context is passed in @ctx; * note that it will still contain the *old* settings. The driver does * not need to update these; the core will do so if this op succeeds. * Returns a negative error code or zero. An error code must be returned * if at least one unsupported change was requested. * @remove_rxfh_context: Remove the specified RSS context. * The &struct ethtool_rxfh_context for this context is passed in @ctx. * Returns a negative error code or zero. * @get_channels: Get number of channels. * @set_channels: Set number of channels. Returns a negative error code or * zero. * @get_dump_flag: Get dump flag indicating current dump length, version, * and flag of the device. * @get_dump_data: Get dump data. * @set_dump: Set dump specific flags to the device. * @get_ts_info: Get the time stamping and PTP hardware clock capabilities. * It may be called with RCU, or rtnl or reference on the device. * Drivers supporting transmit time stamps in software should set this to * ethtool_op_get_ts_info(). * @get_ts_stats: Query the device hardware timestamping statistics. Drivers * must not zero statistics which they don't report. The stats structure * is initialized to ETHTOOL_STAT_NOT_SET indicating driver does not * report statistics. * @get_module_info: Get the size and type of the eeprom contained within * a plug-in module. * @get_module_eeprom: Get the eeprom information from the plug-in module * @get_eee: Get Energy-Efficient (EEE) supported and status. * @set_eee: Set EEE status (enable/disable) as well as LPI timers. * @get_tunable: Read the value of a driver / device tunable. * @set_tunable: Set the value of a driver / device tunable. * @get_per_queue_coalesce: Get interrupt coalescing parameters per queue. * It must check that the given queue number is valid. If neither a RX nor * a TX queue has this number, return -EINVAL. If only a RX queue or a TX * queue has this number, set the inapplicable fields to ~0 and return 0. * Returns a negative error code or zero. * @set_per_queue_coalesce: Set interrupt coalescing parameters per queue. * It must check that the given queue number is valid. If neither a RX nor * a TX queue has this number, return -EINVAL. If only a RX queue or a TX * queue has this number, ignore the inapplicable fields. Supported * coalescing types should be set in @supported_coalesce_params. * Returns a negative error code or zero. * @get_link_ksettings: Get various device settings including Ethernet link * settings. The %cmd and %link_mode_masks_nwords fields should be * ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), * any change to them will be overwritten by kernel. Returns a negative * error code or zero. * @set_link_ksettings: Set various device settings including Ethernet link * settings. The %cmd and %link_mode_masks_nwords fields should be * ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), * any change to them will be overwritten by kernel. Returns a negative * error code or zero. * @get_fec_stats: Report FEC statistics. * Core will sum up per-lane stats to get the total. * Drivers must not zero statistics which they don't report. The stats * structure is initialized to ETHTOOL_STAT_NOT_SET indicating driver does * not report statistics. * @get_fecparam: Get the network device Forward Error Correction parameters. * @set_fecparam: Set the network device Forward Error Correction parameters. * @get_ethtool_phy_stats: Return extended statistics about the PHY device. * This is only useful if the device maintains PHY statistics and * cannot use the standard PHY library helpers. * @get_phy_tunable: Read the value of a PHY tunable. * @set_phy_tunable: Set the value of a PHY tunable. * @get_module_eeprom_by_page: Get a region of plug-in module EEPROM data from * specified page. Returns a negative error code or the amount of bytes * read. * @set_module_eeprom_by_page: Write to a region of plug-in module EEPROM, * from kernel space only. Returns a negative error code or zero. * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics. * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics. * @get_eth_ctrl_stats: Query some of the IEEE 802.3 MAC Ctrl statistics. * @get_rmon_stats: Query some of the RMON (RFC 2819) statistics. * Set %ranges to a pointer to zero-terminated array of byte ranges. * @get_module_power_mode: Get the power mode policy for the plug-in module * used by the network device and its operational power mode, if * plugged-in. * @set_module_power_mode: Set the power mode policy for the plug-in module * used by the network device. * @get_mm: Query the 802.3 MAC Merge layer state. * @set_mm: Set the 802.3 MAC Merge layer parameters. * @get_mm_stats: Query the 802.3 MAC Merge layer statistics. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must * hold the RTNL lock. * * See the structures used by these operations for further documentation. * Note that for all operations using a structure ending with a zero- * length array, the array is allocated separately in the kernel and * is passed to the driver as an additional parameter. * * See &struct net_device and &struct net_device_ops for documentation * of the generic netdev features interface. */ struct ethtool_ops { u32 supported_input_xfrm:8; u32 cap_link_lanes_supported:1; u32 rxfh_per_ctx_fields:1; u32 rxfh_per_ctx_key:1; u32 cap_rss_rxnfc_adds:1; u32 rxfh_indir_space; u16 rxfh_key_space; u16 rxfh_priv_size; u32 rxfh_max_num_contexts; u32 supported_coalesce_params; u32 supported_ring_params; u32 supported_hwtstamp_qualifiers; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); int (*get_regs_len)(struct net_device *); void (*get_regs)(struct net_device *, struct ethtool_regs *, void *); void (*get_wol)(struct net_device *, struct ethtool_wolinfo *); int (*set_wol)(struct net_device *, struct ethtool_wolinfo *); u32 (*get_msglevel)(struct net_device *); void (*set_msglevel)(struct net_device *, u32); int (*nway_reset)(struct net_device *); u32 (*get_link)(struct net_device *); int (*get_link_ext_state)(struct net_device *, struct ethtool_link_ext_state_info *); void (*get_link_ext_stats)(struct net_device *dev, struct ethtool_link_ext_stats *stats); int (*get_eeprom_len)(struct net_device *); int (*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); int (*set_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); int (*get_coalesce)(struct net_device *, struct ethtool_coalesce *, struct kernel_ethtool_coalesce *, struct netlink_ext_ack *); int (*set_coalesce)(struct net_device *, struct ethtool_coalesce *, struct kernel_ethtool_coalesce *, struct netlink_ext_ack *); void (*get_ringparam)(struct net_device *, struct ethtool_ringparam *, struct kernel_ethtool_ringparam *, struct netlink_ext_ack *); int (*set_ringparam)(struct net_device *, struct ethtool_ringparam *, struct kernel_ethtool_ringparam *, struct netlink_ext_ack *); void (*get_pause_stats)(struct net_device *dev, struct ethtool_pause_stats *pause_stats); void (*get_pauseparam)(struct net_device *, struct ethtool_pauseparam*); int (*set_pauseparam)(struct net_device *, struct ethtool_pauseparam*); void (*self_test)(struct net_device *, struct ethtool_test *, u64 *); void (*get_strings)(struct net_device *, u32 stringset, u8 *); int (*set_phys_id)(struct net_device *, enum ethtool_phys_id_state); void (*get_ethtool_stats)(struct net_device *, struct ethtool_stats *, u64 *); int (*begin)(struct net_device *); void (*complete)(struct net_device *); u32 (*get_priv_flags)(struct net_device *); int (*set_priv_flags)(struct net_device *, u32); int (*get_sset_count)(struct net_device *, int); int (*get_rxnfc)(struct net_device *, struct ethtool_rxnfc *, u32 *rule_locs); int (*set_rxnfc)(struct net_device *, struct ethtool_rxnfc *); int (*flash_device)(struct net_device *, struct ethtool_flash *); int (*reset)(struct net_device *, u32 *); u32 (*get_rx_ring_count)(struct net_device *dev); u32 (*get_rxfh_key_size)(struct net_device *); u32 (*get_rxfh_indir_size)(struct net_device *); int (*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *); int (*set_rxfh)(struct net_device *, struct ethtool_rxfh_param *, struct netlink_ext_ack *extack); int (*get_rxfh_fields)(struct net_device *, struct ethtool_rxfh_fields *); int (*set_rxfh_fields)(struct net_device *, const struct ethtool_rxfh_fields *, struct netlink_ext_ack *extack); int (*create_rxfh_context)(struct net_device *, struct ethtool_rxfh_context *ctx, const struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack); int (*modify_rxfh_context)(struct net_device *, struct ethtool_rxfh_context *ctx, const struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack); int (*remove_rxfh_context)(struct net_device *, struct ethtool_rxfh_context *ctx, u32 rss_context, struct netlink_ext_ack *extack); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); int (*get_dump_flag)(struct net_device *, struct ethtool_dump *); int (*get_dump_data)(struct net_device *, struct ethtool_dump *, void *); int (*set_dump)(struct net_device *, struct ethtool_dump *); int (*get_ts_info)(struct net_device *, struct kernel_ethtool_ts_info *); void (*get_ts_stats)(struct net_device *dev, struct ethtool_ts_stats *ts_stats); int (*get_module_info)(struct net_device *, struct ethtool_modinfo *); int (*get_module_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); int (*get_eee)(struct net_device *dev, struct ethtool_keee *eee); int (*set_eee)(struct net_device *dev, struct ethtool_keee *eee); int (*get_tunable)(struct net_device *, const struct ethtool_tunable *, void *); int (*set_tunable)(struct net_device *, const struct ethtool_tunable *, const void *); int (*get_per_queue_coalesce)(struct net_device *, u32, struct ethtool_coalesce *); int (*set_per_queue_coalesce)(struct net_device *, u32, struct ethtool_coalesce *); int (*get_link_ksettings)(struct net_device *, struct ethtool_link_ksettings *); int (*set_link_ksettings)(struct net_device *, const struct ethtool_link_ksettings *); void (*get_fec_stats)(struct net_device *dev, struct ethtool_fec_stats *fec_stats, struct ethtool_fec_hist *hist); int (*get_fecparam)(struct net_device *, struct ethtool_fecparam *); int (*set_fecparam)(struct net_device *, struct ethtool_fecparam *); void (*get_ethtool_phy_stats)(struct net_device *, struct ethtool_stats *, u64 *); int (*get_phy_tunable)(struct net_device *, const struct ethtool_tunable *, void *); int (*set_phy_tunable)(struct net_device *, const struct ethtool_tunable *, const void *); int (*get_module_eeprom_by_page)(struct net_device *dev, const struct ethtool_module_eeprom *page, struct netlink_ext_ack *extack); int (*set_module_eeprom_by_page)(struct net_device *dev, const struct ethtool_module_eeprom *page, struct netlink_ext_ack *extack); void (*get_eth_phy_stats)(struct net_device *dev, struct ethtool_eth_phy_stats *phy_stats); void (*get_eth_mac_stats)(struct net_device *dev, struct ethtool_eth_mac_stats *mac_stats); void (*get_eth_ctrl_stats)(struct net_device *dev, struct ethtool_eth_ctrl_stats *ctrl_stats); void (*get_rmon_stats)(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges); int (*get_module_power_mode)(struct net_device *dev, struct ethtool_module_power_mode_params *params, struct netlink_ext_ack *extack); int (*set_module_power_mode)(struct net_device *dev, const struct ethtool_module_power_mode_params *params, struct netlink_ext_ack *extack); int (*get_mm)(struct net_device *dev, struct ethtool_mm_state *state); int (*set_mm)(struct net_device *dev, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack); void (*get_mm_stats)(struct net_device *dev, struct ethtool_mm_stats *stats); }; int ethtool_check_ops(const struct ethtool_ops *ops); struct ethtool_rx_flow_rule { struct flow_rule *rule; unsigned long priv[]; }; struct ethtool_rx_flow_spec_input { const struct ethtool_rx_flow_spec *fs; u32 rss_ctx; }; struct ethtool_rx_flow_rule * ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input); void ethtool_rx_flow_rule_destroy(struct ethtool_rx_flow_rule *rule); bool ethtool_virtdev_validate_cmd(const struct ethtool_link_ksettings *cmd); int ethtool_virtdev_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd, u32 *dev_speed, u8 *dev_duplex); /** * struct ethtool_netdev_state - per-netdevice state for ethtool features * @rss_ctx: XArray of custom RSS contexts * @rss_lock: Protects entries in @rss_ctx. May be taken from * within RTNL. * @wol_enabled: Wake-on-LAN is enabled * @module_fw_flash_in_progress: Module firmware flashing is in progress. */ struct ethtool_netdev_state { struct xarray rss_ctx; struct mutex rss_lock; unsigned wol_enabled:1; unsigned module_fw_flash_in_progress:1; }; struct phy_device; struct phy_tdr_config; struct phy_plca_cfg; struct phy_plca_status; /** * struct ethtool_phy_ops - Optional PHY device options * @get_sset_count: Get number of strings that @get_strings will write. * @get_strings: Return a set of strings that describe the requested objects * @get_stats: Return extended statistics about the PHY device. * @get_plca_cfg: Return PLCA configuration. * @set_plca_cfg: Set PLCA configuration. * @get_plca_status: Get PLCA configuration. * @start_cable_test: Start a cable test * @start_cable_test_tdr: Start a Time Domain Reflectometry cable test * * All operations are optional (i.e. the function pointer may be set to %NULL) * and callers must take this into account. Callers must hold the RTNL lock. */ struct ethtool_phy_ops { int (*get_sset_count)(struct phy_device *dev); int (*get_strings)(struct phy_device *dev, u8 *data); int (*get_stats)(struct phy_device *dev, struct ethtool_stats *stats, u64 *data); int (*get_plca_cfg)(struct phy_device *dev, struct phy_plca_cfg *plca_cfg); int (*set_plca_cfg)(struct phy_device *dev, const struct phy_plca_cfg *plca_cfg, struct netlink_ext_ack *extack); int (*get_plca_status)(struct phy_device *dev, struct phy_plca_status *plca_st); int (*start_cable_test)(struct phy_device *phydev, struct netlink_ext_ack *extack); int (*start_cable_test_tdr)(struct phy_device *phydev, struct netlink_ext_ack *extack, const struct phy_tdr_config *config); }; /** * ethtool_set_ethtool_phy_ops - Set the ethtool_phy_ops singleton * @ops: Ethtool PHY operations to set */ void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops); /** * ethtool_params_from_link_mode - Derive link parameters from a given link mode * @link_ksettings: Link parameters to be derived from the link mode * @link_mode: Link mode */ void ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings, enum ethtool_link_mode_bit_indices link_mode); /** * ethtool_get_phc_vclocks - Derive phc vclocks information, and caller * is responsible to free memory of vclock_index * @dev: pointer to net_device structure * @vclock_index: pointer to pointer of vclock index * * Return: number of phc vclocks */ int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index); /* Some generic methods drivers may use in their ethtool_ops */ u32 ethtool_op_get_link(struct net_device *dev); int ethtool_op_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *eti); /** * ethtool_mm_frag_size_add_to_min - Translate (standard) additional fragment * size expressed as multiplier into (absolute) minimum fragment size * value expressed in octets * @val_add: Value of addFragSize multiplier */ static inline u32 ethtool_mm_frag_size_add_to_min(u32 val_add) { return (ETH_ZLEN + ETH_FCS_LEN) * (1 + val_add) - ETH_FCS_LEN; } /** * ethtool_mm_frag_size_min_to_add - Translate (absolute) minimum fragment size * expressed in octets into (standard) additional fragment size expressed * as multiplier * @val_min: Value of addFragSize variable in octets * @val_add: Pointer where the standard addFragSize value is to be returned * @extack: Netlink extended ack * * Translate a value in octets to one of 0, 1, 2, 3 according to the reverse * application of the 802.3 formula 64 * (1 + addFragSize) - 4. To be called * by drivers which do not support programming the minimum fragment size to a * continuous range. Returns error on other fragment length values. */ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add, struct netlink_ext_ack *extack) { u32 add_frag_size; for (add_frag_size = 0; add_frag_size < 4; add_frag_size++) { if (ethtool_mm_frag_size_add_to_min(add_frag_size) == val_min) { *val_add = add_frag_size; return 0; } } NL_SET_ERR_MSG_MOD(extack, "minFragSize required to be one of 60, 124, 188 or 252"); return -EINVAL; } /** * ethtool_get_ts_info_by_layer - Obtains time stamping capabilities from the MAC or PHY layer. * @dev: pointer to net_device structure * @info: buffer to hold the result * Returns: zero on success, non-zero otherwise. */ int ethtool_get_ts_info_by_layer(struct net_device *dev, struct kernel_ethtool_ts_info *info); /** * ethtool_sprintf - Write formatted string to ethtool string data * @data: Pointer to a pointer to the start of string to update * @fmt: Format of string to write * * Write formatted string to *data. Update *data to point at start of * next string. */ extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...); /** * ethtool_puts - Write string to ethtool string data * @data: Pointer to a pointer to the start of string to update * @str: String to write * * Write string to *data without a trailing newline. Update *data * to point at start of next string. * * Prefer this function to ethtool_sprintf() when given only * two arguments or if @fmt is just "%s". */ extern void ethtool_puts(u8 **data, const char *str); /** * ethtool_cpy - Write possibly-not-NUL-terminated string to ethtool string data * @data: Pointer to a pointer to the start of string to write into * @str: NUL-byte padded char array of size ETH_GSTRING_LEN to copy from */ #define ethtool_cpy(data, str) do { \ BUILD_BUG_ON(sizeof(str) != ETH_GSTRING_LEN); \ memcpy(*(data), str, ETH_GSTRING_LEN); \ *(data) += ETH_GSTRING_LEN; \ } while (0) /* Link mode to forced speed capabilities maps */ struct ethtool_forced_speed_map { u32 speed; __ETHTOOL_DECLARE_LINK_MODE_MASK(caps); const u32 *cap_arr; u32 arr_size; }; #define ETHTOOL_FORCED_SPEED_MAP(prefix, value) \ { \ .speed = SPEED_##value, \ .cap_arr = prefix##_##value, \ .arr_size = ARRAY_SIZE(prefix##_##value), \ } void ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size); #endif /* _LINUX_ETHTOOL_H */
6 14 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 // SPDX-License-Identifier: GPL-2.0-only /* * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS * * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 * Wensong Zhang <wensong@linuxvirtualserver.org> */ #define pr_fmt(fmt) "IPVS: " fmt #include <linux/in.h> #include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <net/ip_vs.h> /* TODO: struct isakmp_hdr { __u8 icookie[8]; __u8 rcookie[8]; __u8 np; __u8 version; __u8 xchgtype; __u8 flags; __u32 msgid; __u32 length; }; */ #define PORT_ISAKMP 500 static void ah_esp_conn_fill_param_proto(struct netns_ipvs *ipvs, int af, const struct ip_vs_iphdr *iph, struct ip_vs_conn_param *p) { if (likely(!ip_vs_iph_inverse(iph))) ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, &iph->saddr, htons(PORT_ISAKMP), &iph->daddr, htons(PORT_ISAKMP), p); else ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, &iph->daddr, htons(PORT_ISAKMP), &iph->saddr, htons(PORT_ISAKMP), p); } static struct ip_vs_conn * ah_esp_conn_in_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; ah_esp_conn_fill_param_proto(ipvs, af, iph, &p); cp = ip_vs_conn_in_get(&p); if (!cp) { /* * We are not sure if the packet is from our * service, so our conn_schedule hook should return NF_ACCEPT */ IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " "%s%s %s->%s\n", ip_vs_iph_icmp(iph) ? "ICMP+" : "", ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); } return cp; } static struct ip_vs_conn * ah_esp_conn_out_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; ah_esp_conn_fill_param_proto(ipvs, af, iph, &p); cp = ip_vs_conn_out_get(&p); if (!cp) { IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " "%s%s %s->%s\n", ip_vs_iph_icmp(iph) ? "ICMP+" : "", ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); } return cp; } static int ah_esp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { /* * AH/ESP is only related traffic. Pass the packet to IP stack. */ *verdict = NF_ACCEPT; return 0; } #ifdef CONFIG_IP_VS_PROTO_AH struct ip_vs_protocol ip_vs_protocol_ah = { .name = "AH", .protocol = IPPROTO_AH, .num_states = 1, .dont_defrag = 1, .init = NULL, .exit = NULL, .conn_schedule = ah_esp_conn_schedule, .conn_in_get = ah_esp_conn_in_get, .conn_out_get = ah_esp_conn_out_get, .snat_handler = NULL, .dnat_handler = NULL, .state_transition = NULL, .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ }; #endif #ifdef CONFIG_IP_VS_PROTO_ESP struct ip_vs_protocol ip_vs_protocol_esp = { .name = "ESP", .protocol = IPPROTO_ESP, .num_states = 1, .dont_defrag = 1, .init = NULL, .exit = NULL, .conn_schedule = ah_esp_conn_schedule, .conn_in_get = ah_esp_conn_in_get, .conn_out_get = ah_esp_conn_out_get, .snat_handler = NULL, .dnat_handler = NULL, .state_transition = NULL, .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ }; #endif
13 9 8 8 7 7 7 3 5 4 1 1 1 14 14 12 12 11 3 1 4 4 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> #include <net/inet_sock.h> #include <net/raw.h> #include <net/rawv6.h> #ifdef pr_fmt # undef pr_fmt #endif #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt static struct raw_hashinfo * raw_get_hashinfo(const struct inet_diag_req_v2 *r) { if (r->sdiag_family == AF_INET) { return &raw_v4_hashinfo; #if IS_ENABLED(CONFIG_IPV6) } else if (r->sdiag_family == AF_INET6) { return &raw_v6_hashinfo; #endif } else { return ERR_PTR(-EINVAL); } } /* * Due to requirement of not breaking user API we can't simply * rename @pad field in inet_diag_req_v2 structure, instead * use helper to figure it out. */ static bool raw_lookup(struct net *net, const struct sock *sk, const struct inet_diag_req_v2 *req) { struct inet_diag_req_raw *r = (void *)req; if (r->sdiag_family == AF_INET) return raw_v4_match(net, sk, r->sdiag_raw_protocol, r->id.idiag_dst[0], r->id.idiag_src[0], r->id.idiag_if, 0); #if IS_ENABLED(CONFIG_IPV6) else return raw_v6_match(net, sk, r->sdiag_raw_protocol, (const struct in6_addr *)r->id.idiag_src, (const struct in6_addr *)r->id.idiag_dst, r->id.idiag_if, 0); #endif return false; } static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) { struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct hlist_head *hlist; struct sock *sk; int slot; if (IS_ERR(hashinfo)) return ERR_CAST(hashinfo); rcu_read_lock(); for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { hlist = &hashinfo->ht[slot]; sk_for_each_rcu(sk, hlist) { if (raw_lookup(net, sk, r)) { /* * Grab it and keep until we fill * diag message to be reported, so * caller should call sock_put then. */ if (refcount_inc_not_zero(&sk->sk_refcnt)) goto out_unlock; } } } sk = ERR_PTR(-ENOENT); out_unlock: rcu_read_unlock(); return sk; } static int raw_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { struct sk_buff *in_skb = cb->skb; struct sk_buff *rep; struct sock *sk; struct net *net; int err; net = sock_net(in_skb->sk); sk = raw_sock_get(net, r); if (IS_ERR(sk)) return PTR_ERR(sk); rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) { sock_put(sk); return -ENOMEM; } err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); sock_put(sk); if (err < 0) { kfree_skb(rep); return err; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); return err; } static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, bool net_admin) { if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin); } static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); int num, s_num, slot, s_slot; struct hlist_head *hlist; struct sock *sk = NULL; if (IS_ERR(hashinfo)) return; s_slot = cb->args[0]; num = s_num = cb->args[1]; rcu_read_lock(); for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) { num = 0; hlist = &hashinfo->ht[slot]; sk_for_each_rcu(sk, hlist) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; if (sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) goto out_unlock; next: num++; } } out_unlock: rcu_read_unlock(); cb->args[0] = slot; cb->args[1] = num; } static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { r->idiag_rqueue = sk_rmem_alloc_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } #ifdef CONFIG_INET_DIAG_DESTROY static int raw_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *r) { struct net *net = sock_net(in_skb->sk); struct sock *sk; int err; sk = raw_sock_get(net, r); if (IS_ERR(sk)) return PTR_ERR(sk); err = sock_diag_destroy(sk, ECONNABORTED); sock_put(sk); return err; } #endif static const struct inet_diag_handler raw_diag_handler = { .owner = THIS_MODULE, .dump = raw_diag_dump, .dump_one = raw_diag_dump_one, .idiag_get_info = raw_diag_get_info, .idiag_type = IPPROTO_RAW, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = raw_diag_destroy, #endif }; static void __always_unused __check_inet_diag_req_raw(void) { /* * Make sure the two structures are identical, * except the @pad field. */ #define __offset_mismatch(m1, m2) \ (offsetof(struct inet_diag_req_v2, m1) != \ offsetof(struct inet_diag_req_raw, m2)) BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) != sizeof(struct inet_diag_req_raw)); BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family)); BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol)); BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext)); BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol)); BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states)); BUILD_BUG_ON(__offset_mismatch(id, id)); #undef __offset_mismatch } static int __init raw_diag_init(void) { return inet_diag_register(&raw_diag_handler); } static void __exit raw_diag_exit(void) { inet_diag_unregister(&raw_diag_handler); } module_init(raw_diag_init); module_exit(raw_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("RAW socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
465 616 387 564 569 7 63 262 12 18 133 92 25 2 4 19 4 18 3 16 237 119 130 237 8 7 80 201 25 65 4 8 11 21 5 16 2 1 6 3 11 65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 // SPDX-License-Identifier: GPL-2.0-only /* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. */ #include <linux/export.h> #include <net/ipv6.h> /* * find out if nexthdr is a well-known extension header or a protocol */ bool ipv6_ext_hdr(u8 nexthdr) { /* * find out if nexthdr is an extension header or a protocol */ return (nexthdr == NEXTHDR_HOP) || (nexthdr == NEXTHDR_ROUTING) || (nexthdr == NEXTHDR_FRAGMENT) || (nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) || (nexthdr == NEXTHDR_DEST); } EXPORT_SYMBOL(ipv6_ext_hdr); /* * Skip any extension headers. This is used by the ICMP module. * * Note that strictly speaking this conflicts with RFC 2460 4.0: * ...The contents and semantics of each extension header determine whether * or not to proceed to the next header. Therefore, extension headers must * be processed strictly in the order they appear in the packet; a * receiver must not, for example, scan through a packet looking for a * particular kind of extension header and process that header prior to * processing all preceding ones. * * We do exactly this. This is a protocol bug. We can't decide after a * seeing an unknown discard-with-error flavour TLV option if it's a * ICMP error message or not (errors should never be send in reply to * ICMP error messages). * * But I see no other way to do this. This might need to be reexamined * when Linux implements ESP (and maybe AUTH) headers. * --AK * * This function parses (probably truncated) exthdr set "hdr". * "nexthdrp" initially points to some place, * where type of the first header can be found. * * It skips all well-known exthdrs, and returns pointer to the start * of unparsable area i.e. the first header with unknown type. * If it is not NULL *nexthdr is updated by type/protocol of this header. * * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. * - it may return pointer pointing beyond end of packet, * if the last recognized header is truncated in the middle. * - if packet is truncated, so that all parsed headers are skipped, * it returns NULL. * - First fragment header is skipped, not-first ones * are considered as unparsable. * - Reports the offset field of the final fragment header so it is * possible to tell whether this is a first fragment, later fragment, * or not fragmented. * - ESP is unparsable for now and considered like * normal payload protocol. * - Note also special handling of AUTH header. Thanks to IPsec wizards. * * --ANK (980726) */ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, __be16 *frag_offp) { u8 nexthdr = *nexthdrp; *frag_offp = 0; while (ipv6_ext_hdr(nexthdr)) { struct ipv6_opt_hdr _hdr, *hp; int hdrlen; if (nexthdr == NEXTHDR_NONE) return -1; hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -1; if (nexthdr == NEXTHDR_FRAGMENT) { __be16 _frag_off, *fp; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -1; *frag_offp = *fp; if (ntohs(*frag_offp) & ~0x7) break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); nexthdr = hp->nexthdr; start += hdrlen; } *nexthdrp = nexthdr; return start; } EXPORT_SYMBOL(ipv6_skip_exthdr); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) { const unsigned char *nh = skb_network_header(skb); int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); struct ipv6_opt_hdr *hdr; int len; if (offset + 2 > packet_len) goto bad; hdr = (struct ipv6_opt_hdr *)(nh + offset); len = ((hdr->hdrlen + 1) << 3); if (offset + len > packet_len) goto bad; offset += 2; len -= 2; while (len > 0) { int opttype = nh[offset]; int optlen; if (opttype == type) return offset; switch (opttype) { case IPV6_TLV_PAD1: optlen = 1; break; default: if (len < 2) goto bad; optlen = nh[offset + 1] + 2; if (optlen > len) goto bad; break; } offset += optlen; len -= optlen; } /* not_found */ bad: return -1; } EXPORT_SYMBOL_GPL(ipv6_find_tlv); /* * find the offset to specified header or the protocol number of last header * if target < 0. "last header" is transport protocol header, ESP, or * "No next header". * * Note that *offset is used as input/output parameter, and if it is not zero, * then it must be a valid offset to an inner IPv6 header. This can be used * to explore inner IPv6 header, eg. ICMPv6 error messages. * * If target header is found, its offset is set in *offset and return protocol * number. Otherwise, return -1. * * If the first fragment doesn't contain the final protocol header or * NEXTHDR_NONE it is considered invalid. * * Note that non-1st fragment is special case that "the protocol number * of last header" is "next header" field in Fragment header. In this case, * *offset is meaningless and fragment offset is stored in *fragoff if fragoff * isn't NULL. * * if flags is not NULL and it's a fragment, then the frag flag * IP6_FH_F_FRAG will be set. If it's an AH header, the * IP6_FH_F_AUTH flag is set and target < 0, then this function will * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this * function will skip all those routing headers, where segements_left was 0. */ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *flags) { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; bool found; if (fragoff) *fragoff = 0; if (*offset) { struct ipv6hdr _ip6, *ip6; ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); if (!ip6 || (ip6->version != 6)) return -EBADMSG; start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } do { struct ipv6_opt_hdr _hdr, *hp; unsigned int hdrlen; found = (nexthdr == target); if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { if (target < 0 || found) break; return -ENOENT; } hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -EBADMSG; if (nexthdr == NEXTHDR_ROUTING) { struct ipv6_rt_hdr _rh, *rh; rh = skb_header_pointer(skb, start, sizeof(_rh), &_rh); if (!rh) return -EBADMSG; if (flags && (*flags & IP6_FH_F_SKIP_RH) && rh->segments_left == 0) found = false; } if (nexthdr == NEXTHDR_FRAGMENT) { unsigned short _frag_off; __be16 *fp; if (flags) /* Indicate that this is a fragment */ *flags |= IP6_FH_F_FRAG; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -EBADMSG; _frag_off = ntohs(*fp) & ~0x7; if (_frag_off) { if (target < 0 && ((!ipv6_ext_hdr(hp->nexthdr)) || hp->nexthdr == NEXTHDR_NONE)) { if (fragoff) *fragoff = _frag_off; return hp->nexthdr; } if (!found) return -ENOENT; if (fragoff) *fragoff = _frag_off; break; } hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0)) break; hdrlen = ipv6_authlen(hp); } else hdrlen = ipv6_optlen(hp); if (!found) { nexthdr = hp->nexthdr; start += hdrlen; } } while (!found); *offset = start; return nexthdr; } EXPORT_SYMBOL(ipv6_find_hdr);
59 59 90 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Antonio Quartulli */ #include "bat_v_ogm.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> #include <uapi/linux/batadv_packet.h> #include "hard-interface.h" #include "hash.h" #include "log.h" #include "originator.h" #include "routing.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" /** * batadv_v_ogm_orig_get() - retrieve and possibly create an originator node * @bat_priv: the bat priv with all the mesh interface information * @addr: the address of the originator * * Return: the orig_node corresponding to the specified address. If such an * object does not exist, it is allocated here. In case of allocation failure * returns NULL. */ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; int hash_added; orig_node = batadv_orig_hash_find(bat_priv, addr); if (orig_node) return orig_node; orig_node = batadv_orig_node_new(bat_priv, addr); if (!orig_node) return NULL; kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) { /* remove refcnt for newly created orig_node and hash entry */ batadv_orig_node_put(orig_node); batadv_orig_node_put(orig_node); orig_node = NULL; } return orig_node; } /** * batadv_v_ogm_start_queue_timer() - restart the OGM aggregation timer * @hard_iface: the interface to use to send the OGM */ static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface) { unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000; /* msecs * [0.9, 1.1] */ msecs += get_random_u32_below(msecs / 5) - (msecs / 10); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq, msecs_to_jiffies(msecs / 1000)); } /** * batadv_v_ogm_start_timer() - restart the OGM sending timer * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) { unsigned long msecs; /* this function may be invoked in different contexts (ogm rescheduling * or hard_iface activation), but the work timer should not be reset */ if (delayed_work_pending(&bat_priv->bat_v.ogm_wq)) return; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; msecs += get_random_u32_below(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq, msecs_to_jiffies(msecs)); } /** * batadv_v_ogm_send_to_if() - send a batman ogm using a given interface * @skb: the OGM to send * @hard_iface: the interface to use to send the OGM */ static void batadv_v_ogm_send_to_if(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); if (hard_iface->if_status != BATADV_IF_ACTIVE) { kfree_skb(skb); return; } batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, skb->len + ETH_HLEN); batadv_send_broadcast_skb(skb, hard_iface); } /** * batadv_v_ogm_len() - OGMv2 packet length * @skb: the OGM to check * * Return: Length of the given OGMv2 packet, including tvlv length, excluding * ethernet header length. */ static unsigned int batadv_v_ogm_len(struct sk_buff *skb) { struct batadv_ogm2_packet *ogm_packet; ogm_packet = (struct batadv_ogm2_packet *)skb->data; return BATADV_OGM2_HLEN + ntohs(ogm_packet->tvlv_len); } /** * batadv_v_ogm_queue_left() - check if given OGM still fits aggregation queue * @skb: the OGM to check * @hard_iface: the interface to use to send the OGM * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. * * Return: True, if the given OGMv2 packet still fits, false otherwise. */ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { unsigned int max = min_t(unsigned int, hard_iface->net_dev->mtu, BATADV_MAX_AGGREGATION_BYTES); unsigned int ogm_len = batadv_v_ogm_len(skb); lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); return hard_iface->bat_v.aggr_len + ogm_len <= max; } /** * batadv_v_ogm_aggr_list_free - free all elements in an aggregation queue * @hard_iface: the interface holding the aggregation queue * * Empties the OGMv2 aggregation queue and frees all the skbs it contains. * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) { lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); __skb_queue_purge(&hard_iface->bat_v.aggr_list); hard_iface->bat_v.aggr_len = 0; } /** * batadv_v_ogm_aggr_send() - flush & send aggregation queue * @hard_iface: the interface with the aggregation queue to flush * * Aggregates all OGMv2 packets currently in the aggregation queue into a * single OGMv2 packet and transmits this aggregate. * * The aggregation queue is empty after this call. * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) { unsigned int aggr_len = hard_iface->bat_v.aggr_len; struct sk_buff *skb_aggr; unsigned int ogm_len; struct sk_buff *skb; lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); if (!aggr_len) return; skb_aggr = dev_alloc_skb(aggr_len + ETH_HLEN + NET_IP_ALIGN); if (!skb_aggr) { batadv_v_ogm_aggr_list_free(hard_iface); return; } skb_reserve(skb_aggr, ETH_HLEN + NET_IP_ALIGN); skb_reset_network_header(skb_aggr); while ((skb = __skb_dequeue(&hard_iface->bat_v.aggr_list))) { hard_iface->bat_v.aggr_len -= batadv_v_ogm_len(skb); ogm_len = batadv_v_ogm_len(skb); skb_put_data(skb_aggr, skb->data, ogm_len); consume_skb(skb); } batadv_v_ogm_send_to_if(skb_aggr, hard_iface); } /** * batadv_v_ogm_queue_on_if() - queue a batman ogm on a given interface * @skb: the OGM to queue * @hard_iface: the interface to queue the OGM on */ static void batadv_v_ogm_queue_on_if(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); if (!atomic_read(&bat_priv->aggregated_ogms)) { batadv_v_ogm_send_to_if(skb, hard_iface); return; } spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); if (!batadv_v_ogm_queue_left(skb, hard_iface)) batadv_v_ogm_aggr_send(hard_iface); hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb); __skb_queue_tail(&hard_iface->bat_v.aggr_list, skb); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** * batadv_v_ogm_send_meshif() - periodic worker broadcasting the own OGM * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv) { struct batadv_hard_iface *hard_iface; struct batadv_ogm2_packet *ogm_packet; struct sk_buff *skb, *skb_tmp; unsigned char *ogm_buff; struct list_head *iter; int ogm_buff_len; u16 tvlv_len = 0; int ret; lockdep_assert_held(&bat_priv->bat_v.ogm_buff_mutex); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) goto out; ogm_buff = bat_priv->bat_v.ogm_buff; ogm_buff_len = bat_priv->bat_v.ogm_buff_len; /* tt changes have to be committed before the tvlv data is * appended as it may alter the tt tvlv container */ batadv_tt_local_commit_changes(bat_priv); tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, &ogm_buff, &ogm_buff_len, BATADV_OGM2_HLEN); bat_priv->bat_v.ogm_buff = ogm_buff; bat_priv->bat_v.ogm_buff_len = ogm_buff_len; skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + ogm_buff_len); if (!skb) goto reschedule; skb_reserve(skb, ETH_HLEN); skb_put_data(skb, ogm_buff, ogm_buff_len); ogm_packet = (struct batadv_ogm2_packet *)skb->data; ogm_packet->seqno = htonl(atomic_read(&bat_priv->bat_v.ogm_seqno)); atomic_inc(&bat_priv->bat_v.ogm_seqno); ogm_packet->tvlv_len = htons(tvlv_len); /* broadcast on every interface */ rcu_read_lock(); netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (!kref_get_unless_zero(&hard_iface->refcount)) continue; ret = batadv_hardif_no_broadcast(hard_iface, NULL, NULL); if (ret) { char *type; switch (ret) { case BATADV_HARDIF_BCAST_NORECIPIENT: type = "no neighbor"; break; case BATADV_HARDIF_BCAST_DUPFWD: type = "single neighbor is source"; break; case BATADV_HARDIF_BCAST_DUPORIG: type = "single neighbor is originator"; break; default: type = "unknown"; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 from ourselves on %s suppressed: %s\n", hard_iface->net_dev->name, type); batadv_hardif_put(hard_iface); continue; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n", ogm_packet->orig, ntohl(ogm_packet->seqno), ntohl(ogm_packet->throughput), ogm_packet->ttl, hard_iface->net_dev->name, hard_iface->net_dev->dev_addr); /* this skb gets consumed by batadv_v_ogm_send_to_if() */ skb_tmp = skb_clone(skb, GFP_ATOMIC); if (!skb_tmp) { batadv_hardif_put(hard_iface); break; } batadv_v_ogm_queue_on_if(skb_tmp, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); consume_skb(skb); reschedule: batadv_v_ogm_start_timer(bat_priv); out: return; } /** * batadv_v_ogm_send() - periodic worker broadcasting the own OGM * @work: work queue item */ static void batadv_v_ogm_send(struct work_struct *work) { struct batadv_priv_bat_v *bat_v; struct batadv_priv *bat_priv; bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work); bat_priv = container_of(bat_v, struct batadv_priv, bat_v); mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); batadv_v_ogm_send_meshif(bat_priv); mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } /** * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface * @work: work queue item * * Emits aggregated OGM messages in regular intervals. */ void batadv_v_ogm_aggr_work(struct work_struct *work) { struct batadv_hard_iface_bat_v *batv; struct batadv_hard_iface *hard_iface; batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work); hard_iface = container_of(batv, struct batadv_hard_iface, bat_v); spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_send(hard_iface); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_start_queue_timer(hard_iface); } /** * batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V * @hard_iface: the interface to prepare * * Takes care of scheduling its own OGM sending routine for this interface. * * Return: 0 on success or a negative error code otherwise */ int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); batadv_v_ogm_start_queue_timer(hard_iface); batadv_v_ogm_start_timer(bat_priv); return 0; } /** * batadv_v_ogm_iface_disable() - release OGM interface private resources * @hard_iface: interface for which the resources have to be released */ void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { cancel_delayed_work_sync(&hard_iface->bat_v.aggr_wq); spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_list_free(hard_iface); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** * batadv_v_ogm_primary_iface_set() - set a new primary interface * @primary_iface: the new primary interface */ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface) { struct batadv_priv *bat_priv = netdev_priv(primary_iface->mesh_iface); struct batadv_ogm2_packet *ogm_packet; mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); if (!bat_priv->bat_v.ogm_buff) goto unlock; ogm_packet = (struct batadv_ogm2_packet *)bat_priv->bat_v.ogm_buff; ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr); unlock: mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } /** * batadv_v_forward_penalty() - apply a penalty to the throughput metric * forwarded with B.A.T.M.A.N. V OGMs * @bat_priv: the bat priv with all the mesh interface information * @if_incoming: the interface where the OGM has been received * @if_outgoing: the interface where the OGM has to be forwarded to * @throughput: the current throughput * * Apply a penalty on the current throughput metric value based on the * characteristic of the interface where the OGM has been received. * * Initially the per hardif hop penalty is applied to the throughput. After * that the return value is then computed as follows: * - throughput * 50% if the incoming and outgoing interface are the * same WiFi interface and the throughput is above * 1MBit/s * - throughput if the outgoing interface is the default * interface (i.e. this OGM is processed for the * internal table and not forwarded) * - throughput * node hop penalty otherwise * * Return: the penalised throughput metric. */ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, u32 throughput) { int if_hop_penalty = atomic_read(&if_incoming->hop_penalty); int hop_penalty = atomic_read(&bat_priv->hop_penalty); int hop_penalty_max = BATADV_TQ_MAX_VALUE; /* Apply per hardif hop penalty */ throughput = throughput * (hop_penalty_max - if_hop_penalty) / hop_penalty_max; /* Don't apply hop penalty in default originator table. */ if (if_outgoing == BATADV_IF_DEFAULT) return throughput; /* Forwarding on the same WiFi interface cuts the throughput in half * due to the store & forward characteristics of WIFI. * Very low throughput values are the exception. */ if (throughput > 10 && if_incoming == if_outgoing && !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX)) return throughput / 2; /* hop penalty of 255 equals 100% */ return throughput * (hop_penalty_max - hop_penalty) / hop_penalty_max; } /** * batadv_v_ogm_forward() - check conditions and forward an OGM to the given * outgoing interface * @bat_priv: the bat priv with all the mesh interface information * @ogm_received: previously received OGM to be forwarded * @orig_node: the originator which has been updated * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface on which this OGM was received on * @if_outgoing: the interface to which the OGM has to be forwarded to * * Forward an OGM to an interface after having altered the throughput metric and * the TTL value contained in it. The original OGM isn't modified. */ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, const struct batadv_ogm2_packet *ogm_received, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; struct batadv_orig_ifinfo *orig_ifinfo = NULL; struct batadv_neigh_node *router = NULL; struct batadv_ogm2_packet *ogm_forward; unsigned char *skb_buff; struct sk_buff *skb; size_t packet_len; u16 tvlv_len; /* only forward for specific interfaces, not for the default one. */ if (if_outgoing == BATADV_IF_DEFAULT) goto out; orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (!orig_ifinfo) goto out; /* acquire possibly updated router */ router = batadv_orig_router_get(orig_node, if_outgoing); /* strict rule: forward packets coming from the best next hop only */ if (neigh_node != router) goto out; /* don't forward the same seqno twice on one interface */ if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm_received->seqno)) goto out; orig_ifinfo->last_seqno_forwarded = ntohl(ogm_received->seqno); if (ogm_received->ttl <= 1) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); goto out; } neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); if (!neigh_ifinfo) goto out; tvlv_len = ntohs(ogm_received->tvlv_len); packet_len = BATADV_OGM2_HLEN + tvlv_len; skb = netdev_alloc_skb_ip_align(if_outgoing->net_dev, ETH_HLEN + packet_len); if (!skb) goto out; skb_reserve(skb, ETH_HLEN); skb_buff = skb_put_data(skb, ogm_received, packet_len); /* apply forward penalty */ ogm_forward = (struct batadv_ogm2_packet *)skb_buff; ogm_forward->throughput = htonl(neigh_ifinfo->bat_v.throughput); ogm_forward->ttl--; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding OGM2 packet on %s: throughput %u, ttl %u, received via %s\n", if_outgoing->net_dev->name, ntohl(ogm_forward->throughput), ogm_forward->ttl, if_incoming->net_dev->name); batadv_v_ogm_queue_on_if(skb, if_outgoing); out: batadv_orig_ifinfo_put(orig_ifinfo); batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(neigh_ifinfo); } /** * batadv_v_ogm_metric_update() - update route metric based on OGM * @bat_priv: the bat priv with all the mesh interface information * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered * * Return: * 1 if the OGM is new, * 0 if it is not new but valid, * <0 on error (e.g. old OGM) */ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; bool protection_started = false; int ret = -EINVAL; u32 path_throughput; s32 seq_diff; orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (!orig_ifinfo) goto out; seq_diff = ntohl(ogm2->seqno) - orig_ifinfo->last_real_seqno; if (!hlist_empty(&orig_node->neigh_list) && batadv_window_protected(bat_priv, seq_diff, BATADV_OGM_MAX_AGE, &orig_ifinfo->batman_seqno_reset, &protection_started)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: packet within window protection time from %pM\n", ogm2->orig); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Last reset: %ld, %ld\n", orig_ifinfo->batman_seqno_reset, jiffies); goto out; } /* drop packets with old seqnos, however accept the first packet after * a host has been rebooted. */ if (seq_diff < 0 && !protection_started) goto out; neigh_node->last_seen = jiffies; orig_node->last_seen = jiffies; orig_ifinfo->last_real_seqno = ntohl(ogm2->seqno); orig_ifinfo->last_ttl = ogm2->ttl; neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (!neigh_ifinfo) goto out; path_throughput = batadv_v_forward_penalty(bat_priv, if_incoming, if_outgoing, ntohl(ogm2->throughput)); neigh_ifinfo->bat_v.throughput = path_throughput; neigh_ifinfo->bat_v.last_seqno = ntohl(ogm2->seqno); neigh_ifinfo->last_ttl = ogm2->ttl; if (seq_diff > 0 || protection_started) ret = 1; else ret = 0; out: batadv_orig_ifinfo_put(orig_ifinfo); batadv_neigh_ifinfo_put(neigh_ifinfo); return ret; } /** * batadv_v_ogm_route_update() - update routes based on OGM * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: the Ethernet header of the OGM2 * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered * * Return: true if the packet should be forwarded, false otherwise */ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv, const struct ethhdr *ethhdr, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_node *router = NULL; struct batadv_orig_node *orig_neigh_node; struct batadv_neigh_node *orig_neigh_router = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL; u32 router_throughput, neigh_throughput; u32 router_last_seqno; u32 neigh_last_seqno; s32 neigh_seq_diff; bool forward = false; orig_neigh_node = batadv_v_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) goto out; orig_neigh_router = batadv_orig_router_get(orig_neigh_node, if_outgoing); /* drop packet if sender is not a direct neighbor and if we * don't route towards it */ router = batadv_orig_router_get(orig_node, if_outgoing); if (router && router->orig_node != orig_node && !orig_neigh_router) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out; } /* Mark the OGM to be considered for forwarding, and update routes * if needed. */ forward = true; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Searching and updating originator entry of received packet\n"); /* if this neighbor already is our next hop there is nothing * to change */ if (router == neigh_node) goto out; /* don't consider neighbours with worse throughput. * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than * the last received seqno from our best next hop. */ if (router) { router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); /* if these are not allocated, something is wrong. */ if (!router_ifinfo || !neigh_ifinfo) goto out; neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno; router_last_seqno = router_ifinfo->bat_v.last_seqno; neigh_seq_diff = neigh_last_seqno - router_last_seqno; router_throughput = router_ifinfo->bat_v.throughput; neigh_throughput = neigh_ifinfo->bat_v.throughput; if (neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF && router_throughput >= neigh_throughput) goto out; } batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); out: batadv_neigh_node_put(router); batadv_neigh_node_put(orig_neigh_router); batadv_orig_node_put(orig_neigh_node); batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_ifinfo_put(neigh_ifinfo); return forward; } /** * batadv_v_ogm_process_per_outif() - process a batman v OGM for an outgoing if * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: the Ethernet header of the OGM2 * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered */ static void batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, const struct ethhdr *ethhdr, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { int seqno_age; bool forward; /* first, update the metric with according sanity checks */ seqno_age = batadv_v_ogm_metric_update(bat_priv, ogm2, orig_node, neigh_node, if_incoming, if_outgoing); /* outdated sequence numbers are to be discarded */ if (seqno_age < 0) return; /* only unknown & newer OGMs contain TVLVs we are interested in */ if (seqno_age > 0 && if_outgoing == BATADV_IF_DEFAULT) batadv_tvlv_containers_process(bat_priv, BATADV_OGM2, orig_node, NULL, (unsigned char *)(ogm2 + 1), ntohs(ogm2->tvlv_len)); /* if the metric update went through, update routes if needed */ forward = batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node, neigh_node, if_incoming, if_outgoing); /* if the routes have been processed correctly, check and forward */ if (forward) batadv_v_ogm_forward(bat_priv, ogm2, orig_node, neigh_node, if_incoming, if_outgoing); } /** * batadv_v_ogm_aggr_packet() - checks if there is another OGM aggregated * @buff_pos: current position in the skb * @packet_len: total length of the skb * @ogm2_packet: potential OGM2 in buffer * * Return: true if there is enough space for another OGM, false otherwise. */ static bool batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, const struct batadv_ogm2_packet *ogm2_packet) { int next_buff_pos = 0; /* check if there is enough space for the header */ next_buff_pos += buff_pos + sizeof(*ogm2_packet); if (next_buff_pos > packet_len) return false; /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm2_packet->tvlv_len); return next_buff_pos <= packet_len; } /** * batadv_v_ogm_process() - process an incoming batman v OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) * @if_incoming: the interface where this packet was received */ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct ethhdr *ethhdr; struct batadv_orig_node *orig_node = NULL; struct batadv_hardif_neigh_node *hardif_neigh = NULL; struct batadv_neigh_node *neigh_node = NULL; struct batadv_hard_iface *hard_iface; struct batadv_ogm2_packet *ogm_packet; u32 ogm_throughput, link_throughput, path_throughput; struct list_head *iter; int ret; ethhdr = eth_hdr(skb); ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset); ogm_throughput = ntohl(ogm_packet->throughput); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Received OGM2 packet via NB: %pM, IF: %s [%pM] (from OG: %pM, seqno %u, throughput %u, TTL %u, V %u, tvlv_len %u)\n", ethhdr->h_source, if_incoming->net_dev->name, if_incoming->net_dev->dev_addr, ogm_packet->orig, ntohl(ogm_packet->seqno), ogm_throughput, ogm_packet->ttl, ogm_packet->version, ntohs(ogm_packet->tvlv_len)); if (batadv_is_my_mac(bat_priv, ogm_packet->orig)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet from ourself\n"); return; } /* If the throughput metric is 0, immediately drop the packet. No need * to create orig_node / neigh_node for an unusable route. */ if (ogm_throughput == 0) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet with throughput metric of 0\n"); return; } /* require ELP packets be to received from this neighbor first */ hardif_neigh = batadv_hardif_neigh_get(if_incoming, ethhdr->h_source); if (!hardif_neigh) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out; } orig_node = batadv_v_ogm_orig_get(bat_priv, ogm_packet->orig); if (!orig_node) goto out; neigh_node = batadv_neigh_node_get_or_create(orig_node, if_incoming, ethhdr->h_source); if (!neigh_node) goto out; /* Update the received throughput metric to match the link * characteristic: * - If this OGM traveled one hop so far (emitted by single hop * neighbor) the path throughput metric equals the link throughput. * - For OGMs traversing more than hop the path throughput metric is * the smaller of the path throughput and the link throughput. */ link_throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); path_throughput = min_t(u32, link_throughput, ogm_throughput); ogm_packet->throughput = htonl(path_throughput); batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node, neigh_node, if_incoming, BATADV_IF_DEFAULT); rcu_read_lock(); netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) continue; ret = batadv_hardif_no_broadcast(hard_iface, ogm_packet->orig, hardif_neigh->orig); if (ret) { char *type; switch (ret) { case BATADV_HARDIF_BCAST_NORECIPIENT: type = "no neighbor"; break; case BATADV_HARDIF_BCAST_DUPFWD: type = "single neighbor is source"; break; case BATADV_HARDIF_BCAST_DUPORIG: type = "single neighbor is originator"; break; default: type = "unknown"; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 packet from %pM on %s suppressed: %s\n", ogm_packet->orig, hard_iface->net_dev->name, type); batadv_hardif_put(hard_iface); continue; } batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node, neigh_node, if_incoming, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); out: batadv_orig_node_put(orig_node); batadv_neigh_node_put(neigh_node); batadv_hardif_neigh_put(hardif_neigh); } /** * batadv_v_ogm_packet_recv() - OGM2 receiving handler * @skb: the received OGM * @if_incoming: the interface where this OGM has been received * * Return: NET_RX_SUCCESS and consume the skb on success or returns NET_RX_DROP * (without freeing the skb) on failure */ int batadv_v_ogm_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_ogm2_packet *ogm_packet; struct ethhdr *ethhdr; int ogm_offset; u8 *packet_pos; int ret = NET_RX_DROP; /* did we receive a OGM2 packet on an interface that does not have * B.A.T.M.A.N. V enabled ? */ if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0) goto free_skb; if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN)) goto free_skb; ethhdr = eth_hdr(skb); if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) goto free_skb; batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, skb->len + ETH_HLEN); ogm_offset = 0; ogm_packet = (struct batadv_ogm2_packet *)skb->data; while (batadv_v_ogm_aggr_packet(ogm_offset, skb_headlen(skb), ogm_packet)) { batadv_v_ogm_process(skb, ogm_offset, if_incoming); ogm_offset += BATADV_OGM2_HLEN; ogm_offset += ntohs(ogm_packet->tvlv_len); packet_pos = skb->data + ogm_offset; ogm_packet = (struct batadv_ogm2_packet *)packet_pos; } ret = NET_RX_SUCCESS; free_skb: if (ret == NET_RX_SUCCESS) consume_skb(skb); else kfree_skb(skb); return ret; } /** * batadv_v_ogm_init() - initialise the OGM2 engine * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or a negative error code in case of failure */ int batadv_v_ogm_init(struct batadv_priv *bat_priv) { struct batadv_ogm2_packet *ogm_packet; unsigned char *ogm_buff; u32 random_seqno; bat_priv->bat_v.ogm_buff_len = BATADV_OGM2_HLEN; ogm_buff = kzalloc(bat_priv->bat_v.ogm_buff_len, GFP_ATOMIC); if (!ogm_buff) return -ENOMEM; bat_priv->bat_v.ogm_buff = ogm_buff; ogm_packet = (struct batadv_ogm2_packet *)ogm_buff; ogm_packet->packet_type = BATADV_OGM2; ogm_packet->version = BATADV_COMPAT_VERSION; ogm_packet->ttl = BATADV_TTL; ogm_packet->flags = BATADV_NO_FLAGS; ogm_packet->throughput = htonl(BATADV_THROUGHPUT_MAX_VALUE); /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&bat_priv->bat_v.ogm_seqno, random_seqno); INIT_DELAYED_WORK(&bat_priv->bat_v.ogm_wq, batadv_v_ogm_send); mutex_init(&bat_priv->bat_v.ogm_buff_mutex); return 0; } /** * batadv_v_ogm_free() - free OGM private resources * @bat_priv: the bat priv with all the mesh interface information */ void batadv_v_ogm_free(struct batadv_priv *bat_priv) { cancel_delayed_work_sync(&bat_priv->bat_v.ogm_wq); mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); kfree(bat_priv->bat_v.ogm_buff); bat_priv->bat_v.ogm_buff = NULL; bat_priv->bat_v.ogm_buff_len = 0; mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); }
4195 4411 1 3656 3659 3662 3677 3697 3692 3700 21 3696 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/filesystems.c * * Copyright (C) 1991, 1992 Linus Torvalds * * table of configured filesystems */ #include <linux/syscalls.h> #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs_parser.h> /* * Handling of filesystem drivers list. * Rules: * Inclusion to/removals from/scanning of list are protected by spinlock. * During the unload module must call unregister_filesystem(). * We can access the fields of list element if: * 1) spinlock is held or * 2) we hold the reference to the module. * The latter can be guaranteed by call of try_module_get(); if it * returned 0 we must skip the element, otherwise we got the reference. * Once the reference is obtained we can drop the spinlock. */ static struct file_system_type *file_systems; static DEFINE_RWLOCK(file_systems_lock); /* WARNING: This can be used only if we _already_ own a reference */ struct file_system_type *get_filesystem(struct file_system_type *fs) { __module_get(fs->owner); return fs; } void put_filesystem(struct file_system_type *fs) { module_put(fs->owner); } static struct file_system_type **find_filesystem(const char *name, unsigned len) { struct file_system_type **p; for (p = &file_systems; *p; p = &(*p)->next) if (strncmp((*p)->name, name, len) == 0 && !(*p)->name[len]) break; return p; } /** * register_filesystem - register a new filesystem * @fs: the file system structure * * Adds the file system passed to the list of file systems the kernel * is aware of for mount and other syscalls. Returns 0 on success, * or a negative errno code on an error. * * The &struct file_system_type that is passed is linked into the kernel * structures and must not be freed until the file system has been * unregistered. */ int register_filesystem(struct file_system_type * fs) { int res = 0; struct file_system_type ** p; if (fs->parameters && !fs_validate_description(fs->name, fs->parameters)) return -EINVAL; BUG_ON(strchr(fs->name, '.')); if (fs->next) return -EBUSY; write_lock(&file_systems_lock); p = find_filesystem(fs->name, strlen(fs->name)); if (*p) res = -EBUSY; else *p = fs; write_unlock(&file_systems_lock); return res; } EXPORT_SYMBOL(register_filesystem); /** * unregister_filesystem - unregister a file system * @fs: filesystem to unregister * * Remove a file system that was previously successfully registered * with the kernel. An error is returned if the file system is not found. * Zero is returned on a success. * * Once this function has returned the &struct file_system_type structure * may be freed or reused. */ int unregister_filesystem(struct file_system_type * fs) { struct file_system_type ** tmp; write_lock(&file_systems_lock); tmp = &file_systems; while (*tmp) { if (fs == *tmp) { *tmp = fs->next; fs->next = NULL; write_unlock(&file_systems_lock); synchronize_rcu(); return 0; } tmp = &(*tmp)->next; } write_unlock(&file_systems_lock); return -EINVAL; } EXPORT_SYMBOL(unregister_filesystem); #ifdef CONFIG_SYSFS_SYSCALL static int fs_index(const char __user * __name) { struct file_system_type * tmp; struct filename *name; int err, index; name = getname(__name); err = PTR_ERR(name); if (IS_ERR(name)) return err; err = -EINVAL; read_lock(&file_systems_lock); for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { if (strcmp(tmp->name, name->name) == 0) { err = index; break; } } read_unlock(&file_systems_lock); putname(name); return err; } static int fs_name(unsigned int index, char __user * buf) { struct file_system_type * tmp; int len, res = -EINVAL; read_lock(&file_systems_lock); for (tmp = file_systems; tmp; tmp = tmp->next, index--) { if (index == 0) { if (try_module_get(tmp->owner)) res = 0; break; } } read_unlock(&file_systems_lock); if (res) return res; /* OK, we got the reference, so we can safely block */ len = strlen(tmp->name) + 1; res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; put_filesystem(tmp); return res; } static int fs_maxindex(void) { struct file_system_type * tmp; int index; read_lock(&file_systems_lock); for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) ; read_unlock(&file_systems_lock); return index; } /* * Whee.. Weird sysv syscall. */ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) { int retval = -EINVAL; switch (option) { case 1: retval = fs_index((const char __user *) arg1); break; case 2: retval = fs_name(arg1, (char __user *) arg2); break; case 3: retval = fs_maxindex(); break; } return retval; } #endif int __init list_bdev_fs_names(char *buf, size_t size) { struct file_system_type *p; size_t len; int count = 0; read_lock(&file_systems_lock); for (p = file_systems; p; p = p->next) { if (!(p->fs_flags & FS_REQUIRES_DEV)) continue; len = strlen(p->name) + 1; if (len > size) { pr_warn("%s: truncating file system list\n", __func__); break; } memcpy(buf, p->name, len); buf += len; size -= len; count++; } read_unlock(&file_systems_lock); return count; } #ifdef CONFIG_PROC_FS static int filesystems_proc_show(struct seq_file *m, void *v) { struct file_system_type * tmp; read_lock(&file_systems_lock); tmp = file_systems; while (tmp) { seq_printf(m, "%s\t%s\n", (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", tmp->name); tmp = tmp->next; } read_unlock(&file_systems_lock); return 0; } static int __init proc_filesystems_init(void) { proc_create_single("filesystems", 0, NULL, filesystems_proc_show); return 0; } module_init(proc_filesystems_init); #endif static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; read_lock(&file_systems_lock); fs = *(find_filesystem(name, len)); if (fs && !try_module_get(fs->owner)) fs = NULL; read_unlock(&file_systems_lock); return fs; } struct file_system_type *get_fs_type(const char *name) { struct file_system_type *fs; const char *dot = strchr(name, '.'); int len = dot ? dot - name : strlen(name); fs = __get_fs_type(name, len); if (!fs && (request_module("fs-%.*s", len, name) == 0)) { fs = __get_fs_type(name, len); if (!fs) pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n", len, name); } if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { put_filesystem(fs); fs = NULL; } return fs; } EXPORT_SYMBOL(get_fs_type);
5 5 1 1 179 175 5 1 346 347 10 340 297 292 7 1 4 1 312 262 5 300 1 14 1 8 4 9 353 7 3 2 3 7 8 357 7 290 17 3 357 3 7 353 8 355 357 357 357 21 21 312 357 65 303 304 304 304 304 303 291 80 22 303 79 79 4 5 4 4 8 36 5 12 16 6 20 1 41 24 57 8 66 65 66 53 22 53 50 16 16 16 53 53 21 53 50 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2011 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/gfp.h> #include <net/xfrm.h> #include <linux/siphash.h> #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <uapi/linux/netfilter/nf_nat.h> #include "nf_internals.h" #define NF_NAT_MAX_ATTEMPTS 128 #define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4) static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; static siphash_aligned_key_t nf_nat_hash_rnd; struct nf_nat_lookup_hook_priv { struct nf_hook_entries __rcu *entries; struct rcu_head rcu_head; }; struct nf_nat_hooks_net { struct nf_hook_ops *nat_hook_ops; unsigned int users; }; struct nat_net { struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO]; }; #ifdef CONFIG_XFRM static void nf_nat_ipv4_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi4 *fl4 = &fl->u.ip4; if (ct->status & statusbit) { fl4->daddr = t->dst.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl4->saddr = t->src.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_sport = t->src.u.all; } } static void nf_nat_ipv6_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { #if IS_ENABLED(CONFIG_IPV6) const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi6 *fl6 = &fl->u.ip6; if (ct->status & statusbit) { fl6->daddr = t->dst.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl6->saddr = t->src.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_sport = t->src.u.all; } #endif } static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; unsigned long statusbit; u8 family; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return; family = nf_ct_l3num(ct); dir = CTINFO2DIR(ctinfo); if (dir == IP_CT_DIR_ORIGINAL) statusbit = IPS_DST_NAT; else statusbit = IPS_SRC_NAT; switch (family) { case NFPROTO_IPV4: nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl); return; case NFPROTO_IPV6: nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl); return; } } #endif /* CONFIG_XFRM */ /* We keep an extra hash for each conntrack, for fast searching. */ static unsigned int hash_by_src(const struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { unsigned int hash; struct { struct nf_conntrack_man src; u32 net_mix; u32 protonum; u32 zone; } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); memset(&combined, 0, sizeof(combined)); /* Original src, to ensure we map it consistently if poss. */ combined.src = tuple->src; combined.net_mix = net_hash_mix(net); combined.protonum = tuple->dst.protonum; /* Zone ID can be used provided its valid for both directions */ if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) combined.zone = zone->id; hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); } /** * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry * @tuple: proposed NAT binding * @ignored_conntrack: our (unconfirmed) conntrack entry * * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple in either direction. * * Example: * INITIATOR -> NAT/PAT -> RESPONDER * * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite). * Then, later, NAT/PAT itself also connects to RESPONDER. * * This will not work if the SNAT done earlier has same IP:PORT source pair. * * Conntrack table has: * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT * * and new locally originating connection wants: * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT * * ... which would mean incoming packets cannot be distinguished between * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple). * * @return: true if the proposed NAT mapping collides with an existing entry. */ static int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { /* Conntrack tracking doesn't keep track of outgoing tuples; only * incoming ones. NAT means they don't have a fixed mapping, * so we invert the tuple and look for the incoming reply. * * We could keep a separate hash if this proves too slow. */ struct nf_conntrack_tuple reply; nf_ct_invert_tuple(&reply, tuple); return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } static bool nf_nat_allow_clash(const struct nf_conn *ct) { return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash; } /** * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry * @tuple: proposed NAT binding * @ignored_ct: our (unconfirmed) conntrack entry * * Same as nf_nat_used_tuple, but also check for rare clash in reverse * direction. Should be called only when @tuple has not been altered, i.e. * @ignored_conntrack will not be subject to NAT. * * @return: true if the proposed NAT mapping collides with existing entry. */ static noinline bool nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_ct) { static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST; const struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conn *ct; bool taken = true; struct net *net; if (!nf_nat_used_tuple(tuple, ignored_ct)) return false; if (!nf_nat_allow_clash(ignored_ct)) return true; /* Initial choice clashes with existing conntrack. * Check for (rare) reverse collision. * * This can happen when new packets are received in both directions * at the exact same time on different CPUs. * * Without SMP, first packet creates new conntrack entry and second * packet is resolved as established reply packet. * * With parallel processing, both packets could be picked up as * new and both get their own ct entry allocated. * * If ignored_conntrack and colliding ct are not subject to NAT then * pretend the tuple is available and let later clash resolution * handle this at insertion time. * * Without it, the 'reply' packet has its source port rewritten * by nat engine. */ if (READ_ONCE(ignored_ct->status) & uses_nat) return true; net = nf_ct_net(ignored_ct); zone = nf_ct_zone(ignored_ct); thash = nf_conntrack_find_get(net, zone, tuple); if (unlikely(!thash)) { struct nf_conntrack_tuple reply; nf_ct_invert_tuple(&reply, tuple); thash = nf_conntrack_find_get(net, zone, &reply); if (!thash) /* clashing entry went away */ return false; } ct = nf_ct_tuplehash_to_ctrack(thash); /* clashing connection subject to NAT? Retry with new tuple. */ if (READ_ONCE(ct->status) & uses_nat) goto out; if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple)) taken = false; out: nf_ct_put(ct); return taken; } static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) { static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | IPS_DYING; static const unsigned long flags_needed = IPS_SRC_NAT; enum tcp_conntrack old_state; old_state = READ_ONCE(ct->proto.tcp.state); if (old_state < TCP_CONNTRACK_TIME_WAIT) return false; if (flags & flags_refuse) return false; return (flags & flags_needed) == flags_needed; } /* reverse direction will send packets to new source, so * make sure such packets are invalid. */ static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new) { return (__s32)(new->proto.tcp.seen[0].td_end - old->proto.tcp.seen[0].td_end) > 0; } static int nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack, unsigned int attempts_left) { static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD; struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple reply; unsigned long flags; struct nf_conn *ct; bool taken = true; struct net *net; nf_ct_invert_tuple(&reply, tuple); if (attempts_left > NF_NAT_HARDER_THRESH || tuple->dst.protonum != IPPROTO_TCP || ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT) return nf_conntrack_tuple_taken(&reply, ignored_conntrack); /* :ast few attempts to find a free tcp port. Destructive * action: evict colliding if its in timewait state and the * tcp sequence number has advanced past the one used by the * old entry. */ net = nf_ct_net(ignored_conntrack); zone = nf_ct_zone(ignored_conntrack); thash = nf_conntrack_find_get(net, zone, &reply); if (!thash) return false; ct = nf_ct_tuplehash_to_ctrack(thash); if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL) goto out; if (WARN_ON_ONCE(ct == ignored_conntrack)) goto out; flags = READ_ONCE(ct->status); if (!nf_nat_may_kill(ct, flags)) goto out; if (!nf_seq_has_advanced(ct, ignored_conntrack)) goto out; /* Even if we can evict do not reuse if entry is offloaded. */ if (nf_ct_kill(ct)) taken = flags & flags_offload; out: nf_ct_put(ct); return taken; } static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, const struct nf_nat_range2 *range) { if (t->src.l3num == NFPROTO_IPV4) return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; } /* Is the manipable part of the tuple between min and max incl? */ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, const union nf_conntrack_man_proto *max) { __be16 port; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); case IPPROTO_GRE: /* all fall though */ case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_SCTP: if (maniptype == NF_NAT_MANIP_SRC) port = tuple->src.u.all; else port = tuple->dst.u.all; return ntohs(port) >= ntohs(min->all) && ntohs(port) <= ntohs(max->all); default: return true; } } /* If we source map this tuple so reply looks like reply_tuple, will * that meet the constraints of range. */ static int nf_in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the * range specified, otherwise let this drag us onto a new src IP. */ if (range->flags & NF_NAT_RANGE_MAP_IPS && !nf_nat_inet_in_range(tuple, range)) return 0; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) return 1; return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, &range->min_proto, &range->max_proto); } static inline int same_src(const struct nf_conn *ct, const struct nf_conntrack_tuple *tuple) { const struct nf_conntrack_tuple *t; t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; return (t->dst.protonum == tuple->dst.protonum && nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) && t->src.u.all == tuple->src.u.all); } /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range2 *range) { unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn *ct; hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { if (same_src(ct, tuple) && net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuple(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; if (nf_in_range(result, range)) return 1; } } return 0; } /* For [FUTURE] fragmentation handling, we want the least-used * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports * 1-65535, we don't do pro-rata allocation based on ports; we choose * the ip with the lowest src-ip/dst-ip/proto usage. */ static void find_best_ips_proto(const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) { union nf_inet_addr *var_ipp; unsigned int i, max; /* Host order */ u32 minip, maxip, j, dist; bool full_range; /* No IP mapping? Do nothing. */ if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return; if (maniptype == NF_NAT_MANIP_SRC) var_ipp = &tuple->src.u3; else var_ipp = &tuple->dst.u3; /* Fast path: only one choice. */ if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) { *var_ipp = range->min_addr; return; } if (nf_ct_l3num(ct) == NFPROTO_IPV4) max = sizeof(var_ipp->ip) / sizeof(u32) - 1; else max = sizeof(var_ipp->ip6) / sizeof(u32) - 1; /* Hashing source and destination IPs gives a fairly even * spread in practice (if there are a small number of IPs * involved, there usually aren't that many connections * anyway). The consistency means that servers see the same * client coming from the same IP (some Internet Banking sites * like this), even across reboots. */ j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id); full_range = false; for (i = 0; i <= max; i++) { /* If first bytes of the address are at the maximum, use the * distance. Otherwise use the full range. */ if (!full_range) { minip = ntohl((__force __be32)range->min_addr.all[i]); maxip = ntohl((__force __be32)range->max_addr.all[i]); dist = maxip - minip + 1; } else { minip = 0; dist = ~0; } var_ipp->all[i] = (__force __u32) htonl(minip + reciprocal_scale(j, dist)); if (var_ipp->all[i] != range->max_addr.all[i]) full_range = true; if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) j ^= (__force u32)tuple->dst.u3.all[i]; } } /* Alter the per-proto part of the tuple (depending on maniptype), to * give a unique tuple in the given range if possible. * * Per-protocol part of tuple is initialized to the incoming packet. */ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { unsigned int range_size, min, max, i, attempts; __be16 *keyptr; u16 off; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: /* id is same for either direction... */ keyptr = &tuple->src.u.icmp.id; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 0; range_size = 65536; } else { min = ntohs(range->min_proto.icmp.id); range_size = ntohs(range->max_proto.icmp.id) - ntohs(range->min_proto.icmp.id) + 1; } goto find_free_id; #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) case IPPROTO_GRE: /* If there is no master conntrack we are not PPTP, do not change tuples */ if (!ct->master) return; if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.gre.key; else keyptr = &tuple->dst.u.gre.key; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 1; range_size = 65535; } else { min = ntohs(range->min_proto.gre.key); range_size = ntohs(range->max_proto.gre.key) - min + 1; } goto find_free_id; #endif case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else keyptr = &tuple->dst.u.all; break; default: return; } /* If no range specified... */ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { /* If it's dst rewrite, can't change port */ if (maniptype == NF_NAT_MANIP_DST) return; if (ntohs(*keyptr) < 1024) { /* Loose convention: >> 512 is credential passing */ if (ntohs(*keyptr) < 512) { min = 1; range_size = 511 - min + 1; } else { min = 600; range_size = 1023 - min + 1; } } else { min = 1024; range_size = 65535 - 1024 + 1; } } else { min = ntohs(range->min_proto.all); max = ntohs(range->max_proto.all); if (unlikely(max < min)) swap(max, min); range_size = max - min + 1; } find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) || maniptype != NF_NAT_MANIP_DST) off = get_random_u16(); else off = 0; attempts = range_size; if (attempts > NF_NAT_MAX_ATTEMPTS) attempts = NF_NAT_MAX_ATTEMPTS; /* We are in softirq; doing a search of the entire range risks * soft lockup when all tuples are already used. * * If we can't find any free port from first offset, pick a new * one and try again, with ever smaller search window. */ another_round: for (i = 0; i < attempts; i++, off++) { *keyptr = htons(min + off % range_size); if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) return; } if (attempts >= range_size || attempts < 16) return; attempts /= 2; off = get_random_u16(); goto another_round; } /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, * we change the source to map into the range. For NF_INET_PRE_ROUTING * and NF_INET_LOCAL_OUT, we change the destination to map into the * range. It might not be possible to get a unique tuple, but we try. * At worst (or if we race), we will end up with a final duplicate in * __nf_conntrack_confirm and drop the packet. */ static void get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, const struct nf_nat_range2 *range, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); zone = nf_ct_zone(ct); /* 1) If this srcip/proto/src-proto-part is currently mapped, * and that same mapping gives a unique tuple within the given * range, use that. * * This is only required for source (ie. NAT/masq) mappings. * So far, we don't do local source mappings, so multiple * manips not an issue. */ if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ if (nf_in_range(orig_tuple, range)) { if (!nf_nat_used_tuple_new(orig_tuple, ct)) { *tuple = *orig_tuple; return; } } else if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; } } /* 2) Select the least-used IP/proto combination in the given range */ *tuple = *orig_tuple; find_best_ips_proto(zone, tuple, range, ct, maniptype); /* 3) The per-protocol part of the manip is made to map into * the range to make a unique tuple. */ /* Only bother mapping if it's not already in range and unique */ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, &range->min_proto, &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; } else if (!nf_nat_used_tuple(tuple, ct)) { return; } } /* Last chance: get protocol to try to obtain unique tuple. */ nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct); } struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) { struct nf_conn_nat *nat = nfct_nat(ct); if (nat) return nat; if (!nf_ct_is_confirmed(ct)) nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); return nat; } EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ if (nf_ct_is_confirmed(ct)) return NF_ACCEPT; WARN_ON(maniptype != NF_NAT_MANIP_SRC && maniptype != NF_NAT_MANIP_DST); if (WARN_ON(nf_nat_initialized(ct, maniptype))) return NF_DROP; /* What we've got will look like inverse of reply. Normally * this is what is in the conntrack, except for prior * manipulations (future optimization: if num_manips == 0, * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ nf_ct_invert_tuple(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { struct nf_conntrack_tuple reply; /* Alter conntrack table so will recognize replies. */ nf_ct_invert_tuple(&reply, &new_tuple); nf_conntrack_alter_reply(ct, &reply); /* Non-atomic: we own this at the moment. */ if (maniptype == NF_NAT_MANIP_SRC) ct->status |= IPS_SRC_NAT; else ct->status |= IPS_DST_NAT; if (nfct_help(ct) && !nfct_seqadj(ct)) if (!nfct_seqadj_ext_add(ct)) return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; spinlock_t *lock; srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); spin_unlock_bh(lock); } /* It's done. */ if (maniptype == NF_NAT_MANIP_DST) ct->status |= IPS_DST_NAT_DONE; else ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } EXPORT_SYMBOL(nf_nat_setup_info); static unsigned int __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) { /* Force range to this IP; let proto decide mapping for * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). * Use reply in case it's already been mangled (eg local packet). */ union nf_inet_addr ip = (manip == NF_NAT_MANIP_SRC ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); struct nf_nat_range2 range = { .flags = NF_NAT_RANGE_MAP_IPS, .min_addr = ip, .max_addr = ip, }; return nf_nat_setup_info(ct, &range, manip); } unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) { return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); } EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); /* Do packet manipulations according to nf_nat_setup_info. */ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb) { enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); unsigned int verdict = NF_ACCEPT; unsigned long statusbit; if (mtype == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply dir. */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; /* Non-atomic: these bits don't change. */ if (ct->status & statusbit) verdict = nf_nat_manip_pkt(skb, ct, mtype, dir); return verdict; } EXPORT_SYMBOL_GPL(nf_nat_packet); static bool in_vrf_postrouting(const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (state->hook == NF_INET_POST_ROUTING && netif_is_l3_master(state->out)) return true; #endif return false; } unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct || in_vrf_postrouting(state)) return NF_ACCEPT; nat = nfct_nat(ct); switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED_REPLY: /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. */ if (!nf_nat_initialized(ct, maniptype)) { struct nf_nat_lookup_hook_priv *lpriv = priv; struct nf_hook_entries *e = rcu_dereference(lpriv->entries); unsigned int ret; int i; if (!e) goto null_bind; for (i = 0; i < e->num_hook_entries; i++) { ret = e->hooks[i].hook(e->hooks[i].priv, skb, state); if (ret != NF_ACCEPT) return ret; if (nf_nat_initialized(ct, maniptype)) goto do_nat; } null_bind: ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct, ct->status); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } break; default: /* ESTABLISHED */ WARN_ON(ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } do_nat: return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); return NF_DROP; } EXPORT_SYMBOL_GPL(nf_nat_inet_fn); struct nf_nat_proto_clean { u8 l3proto; u8 l4proto; }; /* kill conntracks with affected NAT section */ static int nf_nat_proto_remove(struct nf_conn *i, void *data) { const struct nf_nat_proto_clean *clean = data; if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) return 0; return i->status & IPS_NAT_MASK ? 1 : 0; } static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) { if (nf_nat_proto_remove(ct, data)) return 1; /* This module is being removed and conntrack has nat null binding. * Remove it from bysource hash, as the table will be freed soon. * * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. */ return 0; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, }; static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_PROTONAT_PORT_MIN]) { range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); range->max_proto.all = range->min_proto.all; range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[CTA_PROTONAT_PORT_MAX]) { range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } return 0; } static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; int err; err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy, NULL); if (err < 0) return err; return nf_nat_l4proto_nlattr_to_range(tb, range); } static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_PROTO] = { .type = NLA_NESTED }, }; static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V4_MINIP]) { range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); range->flags |= NF_NAT_RANGE_MAP_IPS; } range->max_addr.ip = nla_get_be32_default(tb[CTA_NAT_V4_MAXIP], range->min_addr.ip); return 0; } static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V6_MINIP]) { nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], sizeof(struct in6_addr)); range->flags |= NF_NAT_RANGE_MAP_IPS; } if (tb[CTA_NAT_V6_MAXIP]) nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], sizeof(struct in6_addr)); else range->max_addr = range->min_addr; return 0; } static int nfnetlink_parse_nat(const struct nlattr *nat, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_NAT_MAX+1]; int err; memset(range, 0, sizeof(*range)); err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, nat_nla_policy, NULL); if (err < 0) return err; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: err = nf_nat_ipv4_nlattr_to_range(tb, range); break; case NFPROTO_IPV6: err = nf_nat_ipv6_nlattr_to_range(tb, range); break; default: err = -EPROTONOSUPPORT; break; } if (err) return err; if (!tb[CTA_NAT_PROTO]) return 0; return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); } /* This function is called under rcu_read_lock() */ static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { struct nf_nat_range2 range; int err; /* Should not happen, restricted to creating new conntracks * via ctnetlink. */ if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) return -EEXIST; /* No NAT information has been passed, allocate the null-binding */ if (attr == NULL) return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; err = nfnetlink_parse_nat(attr, ct, &range); if (err < 0) return err; return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } #else static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { return -EOPNOTSUPP; } #endif static struct nf_ct_helper_expectfn follow_master_nat = { .name = "nat-follow-master", .expectfn = nf_nat_follow_master, }; int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; unsigned int hooknum = ops->hooknum; struct nf_hook_ops *nat_ops; int i, ret; if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) return -EINVAL; nat_proto_net = &nat_net->nat_proto_net[pf]; for (i = 0; i < ops_count; i++) { if (orig_nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) return -EINVAL; mutex_lock(&nf_nat_proto_mutex); if (!nat_proto_net->nat_hook_ops) { WARN_ON(nat_proto_net->users != 0); nat_ops = kmemdup_array(orig_nat_ops, ops_count, sizeof(*orig_nat_ops), GFP_KERNEL); if (!nat_ops) { mutex_unlock(&nf_nat_proto_mutex); return -ENOMEM; } for (i = 0; i < ops_count; i++) { priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv) { nat_ops[i].priv = priv; continue; } mutex_unlock(&nf_nat_proto_mutex); while (i) kfree(nat_ops[--i].priv); kfree(nat_ops); return -ENOMEM; } ret = nf_register_net_hooks(net, nat_ops, ops_count); if (ret < 0) { mutex_unlock(&nf_nat_proto_mutex); for (i = 0; i < ops_count; i++) kfree(nat_ops[i].priv); kfree(nat_ops); return ret; } nat_proto_net->nat_hook_ops = nat_ops; } nat_ops = nat_proto_net->nat_hook_ops; priv = nat_ops[hooknum].priv; if (WARN_ON_ONCE(!priv)) { mutex_unlock(&nf_nat_proto_mutex); return -EOPNOTSUPP; } ret = nf_hook_entries_insert_raw(&priv->entries, ops); if (ret == 0) nat_proto_net->users++; mutex_unlock(&nf_nat_proto_mutex); return ret; } void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; struct nf_hook_ops *nat_ops; int hooknum = ops->hooknum; int i; if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) return; nat_proto_net = &nat_net->nat_proto_net[pf]; mutex_lock(&nf_nat_proto_mutex); if (WARN_ON(nat_proto_net->users == 0)) goto unlock; nat_proto_net->users--; nat_ops = nat_proto_net->nat_hook_ops; for (i = 0; i < ops_count; i++) { if (nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) goto unlock; priv = nat_ops[hooknum].priv; nf_hook_entries_delete_raw(&priv->entries, ops); if (nat_proto_net->users == 0) { nf_unregister_net_hooks(net, nat_ops, ops_count); for (i = 0; i < ops_count; i++) { priv = nat_ops[i].priv; kfree_rcu(priv, rcu_head); } nat_proto_net->nat_hook_ops = NULL; kfree(nat_ops); } unlock: mutex_unlock(&nf_nat_proto_mutex); } static struct pernet_operations nat_net_ops = { .id = &nat_net_id, .size = sizeof(struct nat_net), }; static const struct nf_nat_hook nat_hook = { .parse_nat_setup = nfnetlink_parse_nat_setup, #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, #endif .remove_nat_bysrc = nf_nat_cleanup_conntrack, }; static int __init nf_nat_init(void) { int ret, i; /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; if (nf_nat_htable_size < CONNTRACK_LOCKS) nf_nat_htable_size = CONNTRACK_LOCKS; nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) return -ENOMEM; for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); ret = register_pernet_subsys(&nat_net_ops); if (ret < 0) { kvfree(nf_nat_bysource); return ret; } nf_ct_helper_expectfn_register(&follow_master_nat); WARN_ON(nf_nat_hook != NULL); RCU_INIT_POINTER(nf_nat_hook, &nat_hook); ret = register_nf_nat_bpf(); if (ret < 0) { RCU_INIT_POINTER(nf_nat_hook, NULL); nf_ct_helper_expectfn_unregister(&follow_master_nat); synchronize_net(); unregister_pernet_subsys(&nat_net_ops); kvfree(nf_nat_bysource); } return ret; } static void __exit nf_nat_cleanup(void) { struct nf_nat_proto_clean clean = {}; nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); nf_ct_helper_expectfn_unregister(&follow_master_nat); RCU_INIT_POINTER(nf_nat_hook, NULL); synchronize_net(); kvfree(nf_nat_bysource); unregister_pernet_subsys(&nat_net_ops); } MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Network address translation core"); module_init(nf_nat_init); module_exit(nf_nat_cleanup);
181 181 135 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 // SPDX-License-Identifier: GPL-2.0-or-later /* * Sysfs attributes of bridge * Linux ethernet bridge * * Authors: * Stephen Hemminger <shemminger@osdl.org> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_bridge.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> #include <linux/times.h> #include <linux/sched/signal.h> #include "br_private.h" /* IMPORTANT: new bridge options must be added with netlink support only * please do not add new sysfs entries */ #define to_bridge(cd) ((struct net_bridge *)netdev_priv(to_net_dev(cd))) /* * Common code for storing bridge parameters. */ static ssize_t store_bridge_parm(struct device *d, const char *buf, size_t len, int (*set)(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack)) { struct net_bridge *br = to_bridge(d); struct netlink_ext_ack extack = {0}; unsigned long val; int err; if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; err = kstrtoul(buf, 0, &val); if (err != 0) return err; if (!rtnl_trylock()) return restart_syscall(); err = (*set)(br, val, &extack); if (!err) netdev_state_change(br->dev); if (extack._msg) { if (err) br_err(br, "%s\n", extack._msg); else br_warn(br, "%s\n", extack._msg); } rtnl_unlock(); return err ? err : len; } static ssize_t forward_delay_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); } static int set_forward_delay(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_forward_delay(br, val); } static ssize_t forward_delay_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_forward_delay); } static DEVICE_ATTR_RW(forward_delay); static ssize_t hello_time_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", jiffies_to_clock_t(to_bridge(d)->hello_time)); } static int set_hello_time(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_hello_time(br, val); } static ssize_t hello_time_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_hello_time); } static DEVICE_ATTR_RW(hello_time); static ssize_t max_age_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", jiffies_to_clock_t(to_bridge(d)->max_age)); } static int set_max_age(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_max_age(br, val); } static ssize_t max_age_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_max_age); } static DEVICE_ATTR_RW(max_age); static ssize_t ageing_time_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); } static int set_ageing_time(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_ageing_time(br, val); } static ssize_t ageing_time_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_ageing_time); } static DEVICE_ATTR_RW(ageing_time); static ssize_t stp_state_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->stp_enabled); } static int set_stp_state(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_stp_set_enabled(br, val, extack); } static ssize_t stp_state_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_stp_state); } static DEVICE_ATTR_RW(stp_state); static ssize_t group_fwd_mask_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%#x\n", br->group_fwd_mask); } static int set_group_fwd_mask(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { if (val & BR_GROUPFWD_RESTRICTED) return -EINVAL; br->group_fwd_mask = val; return 0; } static ssize_t group_fwd_mask_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_group_fwd_mask); } static DEVICE_ATTR_RW(group_fwd_mask); static ssize_t priority_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); } static int set_priority(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_stp_set_bridge_priority(br, (u16) val); return 0; } static ssize_t priority_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_priority); } static DEVICE_ATTR_RW(priority); static ssize_t root_id_show(struct device *d, struct device_attribute *attr, char *buf) { return br_show_bridge_id(buf, &to_bridge(d)->designated_root); } static DEVICE_ATTR_RO(root_id); static ssize_t bridge_id_show(struct device *d, struct device_attribute *attr, char *buf) { return br_show_bridge_id(buf, &to_bridge(d)->bridge_id); } static DEVICE_ATTR_RO(bridge_id); static ssize_t root_port_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->root_port); } static DEVICE_ATTR_RO(root_port); static ssize_t root_path_cost_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost); } static DEVICE_ATTR_RO(root_path_cost); static ssize_t topology_change_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->topology_change); } static DEVICE_ATTR_RO(topology_change); static ssize_t topology_change_detected_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->topology_change_detected); } static DEVICE_ATTR_RO(topology_change_detected); static ssize_t hello_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer)); } static DEVICE_ATTR_RO(hello_timer); static ssize_t tcn_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer)); } static DEVICE_ATTR_RO(tcn_timer); static ssize_t topology_change_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer)); } static DEVICE_ATTR_RO(topology_change_timer); static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer)); } static DEVICE_ATTR_RO(gc_timer); static ssize_t group_addr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%pM\n", br->group_addr); } static ssize_t group_addr_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct net_bridge *br = to_bridge(d); u8 new_addr[6]; if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!mac_pton(buf, new_addr)) return -EINVAL; if (!is_link_local_ether_addr(new_addr)) return -EINVAL; if (new_addr[5] == 1 || /* 802.3x Pause address */ new_addr[5] == 2 || /* 802.3ad Slow protocols */ new_addr[5] == 3) /* 802.1X PAE address */ return -EINVAL; if (!rtnl_trylock()) return restart_syscall(); spin_lock_bh(&br->lock); ether_addr_copy(br->group_addr, new_addr); spin_unlock_bh(&br->lock); br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true); br_recalculate_fwd_mask(br); netdev_state_change(br->dev); rtnl_unlock(); return len; } static DEVICE_ATTR_RW(group_addr); static int set_flush(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { struct net_bridge_fdb_flush_desc desc = { .flags_mask = BIT(BR_FDB_STATIC) }; br_fdb_flush(br, &desc); return 0; } static ssize_t flush_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_flush); } static DEVICE_ATTR_WO(flush); static ssize_t no_linklocal_learn_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); } static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, extack); } static ssize_t no_linklocal_learn_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_no_linklocal_learn); } static DEVICE_ATTR_RW(no_linklocal_learn); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->multicast_ctx.multicast_router); } static int set_multicast_router(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_router(&br->multicast_ctx, val); } static ssize_t multicast_router_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_router); } static DEVICE_ATTR_RW(multicast_router); static ssize_t multicast_snooping_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED)); } static ssize_t multicast_snooping_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_multicast_toggle); } static DEVICE_ATTR_RW(multicast_snooping); static ssize_t multicast_query_use_ifaddr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)); } static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val); return 0; } static ssize_t multicast_query_use_ifaddr_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_use_ifaddr); } static DEVICE_ATTR_RW(multicast_query_use_ifaddr); static ssize_t multicast_querier_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier); } static int set_multicast_querier(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_querier(&br->multicast_ctx, val); } static ssize_t multicast_querier_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_querier); } static DEVICE_ATTR_RW(multicast_querier); static ssize_t hash_elasticity_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", RHT_ELASTICITY); } static int set_elasticity(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { /* 16 is RHT_ELASTICITY */ NL_SET_ERR_MSG_MOD(extack, "the hash_elasticity option has been deprecated and is always 16"); return 0; } static ssize_t hash_elasticity_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_elasticity); } static DEVICE_ATTR_RW(hash_elasticity); static ssize_t hash_max_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->hash_max); } static int set_hash_max(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->hash_max = val; return 0; } static ssize_t hash_max_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_hash_max); } static DEVICE_ATTR_RW(hash_max); static ssize_t multicast_igmp_version_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_igmp_version); } static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_igmp_version(&br->multicast_ctx, val); } static ssize_t multicast_igmp_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_igmp_version); } static DEVICE_ATTR_RW(multicast_igmp_version); static ssize_t multicast_last_member_count_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_last_member_count); } static int set_last_member_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_last_member_count = val; return 0; } static ssize_t multicast_last_member_count_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_last_member_count); } static DEVICE_ATTR_RW(multicast_last_member_count); static ssize_t multicast_startup_query_count_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count); } static int set_startup_query_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_startup_query_count = val; return 0; } static ssize_t multicast_startup_query_count_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_startup_query_count); } static DEVICE_ATTR_RW(multicast_startup_query_count); static ssize_t multicast_last_member_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval)); } static int set_last_member_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_last_member_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_last_member_interval); } static DEVICE_ATTR_RW(multicast_last_member_interval); static ssize_t multicast_membership_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval)); } static int set_membership_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_membership_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_membership_interval); } static DEVICE_ATTR_RW(multicast_membership_interval); static ssize_t multicast_querier_interval_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval)); } static int set_querier_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_querier_interval_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_querier_interval); } static DEVICE_ATTR_RW(multicast_querier_interval); static ssize_t multicast_query_interval_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval)); } static int set_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_multicast_set_query_intvl(&br->multicast_ctx, val); return 0; } static ssize_t multicast_query_interval_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_interval); } static DEVICE_ATTR_RW(multicast_query_interval); static ssize_t multicast_query_response_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval)); } static int set_query_response_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_query_response_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_response_interval); } static DEVICE_ATTR_RW(multicast_query_response_interval); static ssize_t multicast_startup_query_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval)); } static int set_startup_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_multicast_set_startup_query_intvl(&br->multicast_ctx, val); return 0; } static ssize_t multicast_startup_query_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_startup_query_interval); } static DEVICE_ATTR_RW(multicast_startup_query_interval); static ssize_t multicast_stats_enabled_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)); } static int set_stats_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!val); return 0; } static ssize_t multicast_stats_enabled_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_stats_enabled); } static DEVICE_ATTR_RW(multicast_stats_enabled); #if IS_ENABLED(CONFIG_IPV6) static ssize_t multicast_mld_version_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_mld_version); } static int set_multicast_mld_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_mld_version(&br->multicast_ctx, val); } static ssize_t multicast_mld_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_mld_version); } static DEVICE_ATTR_RW(multicast_mld_version); #endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES)); } static int set_nf_call_iptables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val); return 0; } static ssize_t nf_call_iptables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_iptables); } static DEVICE_ATTR_RW(nf_call_iptables); static ssize_t nf_call_ip6tables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES)); } static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val); return 0; } static ssize_t nf_call_ip6tables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_ip6tables); } static DEVICE_ATTR_RW(nf_call_ip6tables); static ssize_t nf_call_arptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES)); } static int set_nf_call_arptables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val); return 0; } static ssize_t nf_call_arptables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_arptables); } static DEVICE_ATTR_RW(nf_call_arptables); #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING static ssize_t vlan_filtering_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED)); } static ssize_t vlan_filtering_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_filter_toggle); } static DEVICE_ATTR_RW(vlan_filtering); static ssize_t vlan_protocol_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%#06x\n", ntohs(br->vlan_proto)); } static ssize_t vlan_protocol_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_set_proto); } static DEVICE_ATTR_RW(vlan_protocol); static ssize_t default_pvid_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->default_pvid); } static ssize_t default_pvid_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_set_default_pvid); } static DEVICE_ATTR_RW(default_pvid); static ssize_t vlan_stats_enabled_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED)); } static int set_vlan_stats_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_vlan_set_stats(br, val); } static ssize_t vlan_stats_enabled_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_vlan_stats_enabled); } static DEVICE_ATTR_RW(vlan_stats_enabled); static ssize_t vlan_stats_per_port_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)); } static int set_vlan_stats_per_port(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_vlan_set_stats_per_port(br, val); } static ssize_t vlan_stats_per_port_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_vlan_stats_per_port); } static DEVICE_ATTR_RW(vlan_stats_per_port); #endif static struct attribute *bridge_attrs[] = { &dev_attr_forward_delay.attr, &dev_attr_hello_time.attr, &dev_attr_max_age.attr, &dev_attr_ageing_time.attr, &dev_attr_stp_state.attr, &dev_attr_group_fwd_mask.attr, &dev_attr_priority.attr, &dev_attr_bridge_id.attr, &dev_attr_root_id.attr, &dev_attr_root_path_cost.attr, &dev_attr_root_port.attr, &dev_attr_topology_change.attr, &dev_attr_topology_change_detected.attr, &dev_attr_hello_timer.attr, &dev_attr_tcn_timer.attr, &dev_attr_topology_change_timer.attr, &dev_attr_gc_timer.attr, &dev_attr_group_addr.attr, &dev_attr_flush.attr, &dev_attr_no_linklocal_learn.attr, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &dev_attr_multicast_router.attr, &dev_attr_multicast_snooping.attr, &dev_attr_multicast_querier.attr, &dev_attr_multicast_query_use_ifaddr.attr, &dev_attr_hash_elasticity.attr, &dev_attr_hash_max.attr, &dev_attr_multicast_last_member_count.attr, &dev_attr_multicast_startup_query_count.attr, &dev_attr_multicast_last_member_interval.attr, &dev_attr_multicast_membership_interval.attr, &dev_attr_multicast_querier_interval.attr, &dev_attr_multicast_query_interval.attr, &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, &dev_attr_multicast_stats_enabled.attr, &dev_attr_multicast_igmp_version.attr, #if IS_ENABLED(CONFIG_IPV6) &dev_attr_multicast_mld_version.attr, #endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, &dev_attr_nf_call_ip6tables.attr, &dev_attr_nf_call_arptables.attr, #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING &dev_attr_vlan_filtering.attr, &dev_attr_vlan_protocol.attr, &dev_attr_default_pvid.attr, &dev_attr_vlan_stats_enabled.attr, &dev_attr_vlan_stats_per_port.attr, #endif NULL }; static const struct attribute_group bridge_group = { .name = SYSFS_BRIDGE_ATTR, .attrs = bridge_attrs, }; /* * Export the forwarding information table as a binary file * The records are struct __fdb_entry. * * Returns the number of bytes read. */ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); struct net_bridge *br = to_bridge(dev); int n; /* must read whole records */ if (off % sizeof(struct __fdb_entry) != 0) return -EINVAL; n = br_fdb_fillbuf(br, buf, count / sizeof(struct __fdb_entry), off / sizeof(struct __fdb_entry)); if (n > 0) n *= sizeof(struct __fdb_entry); return n; } static const struct bin_attribute bridge_forward = { .attr = { .name = SYSFS_BRIDGE_FDB, .mode = 0444, }, .read = brforward_read, }; /* * Add entries in sysfs onto the existing network class device * for the bridge. * Adds a attribute group "bridge" containing tuning parameters. * Binary attribute containing the forward table * Sub directory to hold links to interfaces. * * Note: the ifobj exists only to be a subdirectory * to hold links. The ifobj exists in same data structure * as it's parent the bridge so reference counting works. */ int br_sysfs_addbr(struct net_device *dev) { struct kobject *brobj = &dev->dev.kobj; struct net_bridge *br = netdev_priv(dev); int err; err = sysfs_create_group(brobj, &bridge_group); if (err) { pr_info("%s: can't create group %s/%s\n", __func__, dev->name, bridge_group.name); goto out1; } err = sysfs_create_bin_file(brobj, &bridge_forward); if (err) { pr_info("%s: can't create attribute file %s/%s\n", __func__, dev->name, bridge_forward.attr.name); goto out2; } br->ifobj = kobject_create_and_add(SYSFS_BRIDGE_PORT_SUBDIR, brobj); if (!br->ifobj) { pr_info("%s: can't add kobject (directory) %s/%s\n", __func__, dev->name, SYSFS_BRIDGE_PORT_SUBDIR); err = -ENOMEM; goto out3; } return 0; out3: sysfs_remove_bin_file(&dev->dev.kobj, &bridge_forward); out2: sysfs_remove_group(&dev->dev.kobj, &bridge_group); out1: return err; } void br_sysfs_delbr(struct net_device *dev) { struct kobject *kobj = &dev->dev.kobj; struct net_bridge *br = netdev_priv(dev); kobject_put(br->ifobj); sysfs_remove_bin_file(kobj, &bridge_forward); sysfs_remove_group(kobj, &bridge_group); }
3 2 1 30 30 30 54 54 1 1 1 2 5 1 4 15 4 4 9 9 1 10 5 10 1 8 8 2 7 25 26 17 16 11 10 1 10 6 6 1 1 17 10 7 29 6 62 62 62 62 8 57 5 4 11 46 55 51 51 26 24 3 3 3 3 47 48 2 40 40 43 27 9 9 17 12 2 19 2 9 31 2 2 5 22 28 1 2 1 1 3 3 2 2 1 2 2 1 1 8 1 1 6 1 3 2 4 1 1 1 1 29 27 3 29 8 3 5 5 2 2 8 4 4 4 3 1 1 2 3 3 3 1 2 2 27 1 12 1 5 1 7 1 6 2 50 50 50 50 52 1 51 52 2 50 52 52 52 1 52 40 13 52 1 52 52 25 16 9 2 22 296 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 // SPDX-License-Identifier: GPL-2.0-only /* * Kernel Connection Multiplexor * * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> */ #include <linux/bpf.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/file.h> #include <linux/filter.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/poll.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/splice.h> #include <linux/uaccess.h> #include <linux/workqueue.h> #include <linux/syscalls.h> #include <linux/sched/signal.h> #include <net/kcm.h> #include <net/netns/generic.h> #include <net/sock.h> #include <uapi/linux/kcm.h> #include <trace/events/sock.h> unsigned int kcm_net_id; static struct kmem_cache *kcm_psockp __read_mostly; static struct kmem_cache *kcm_muxp __read_mostly; static struct workqueue_struct *kcm_wq; static inline struct kcm_sock *kcm_sk(const struct sock *sk) { return (struct kcm_sock *)sk; } static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) { return (struct kcm_tx_msg *)skb->cb; } static void report_csk_error(struct sock *csk, int err) { csk->sk_err = EPIPE; sk_error_report(csk); } static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, bool wakeup_kcm) { struct sock *csk = psock->sk; struct kcm_mux *mux = psock->mux; /* Unrecoverable error in transmit */ spin_lock_bh(&mux->lock); if (psock->tx_stopped) { spin_unlock_bh(&mux->lock); return; } psock->tx_stopped = 1; KCM_STATS_INCR(psock->stats.tx_aborts); if (!psock->tx_kcm) { /* Take off psocks_avail list */ list_del(&psock->psock_avail_list); } else if (wakeup_kcm) { /* In this case psock is being aborted while outside of * write_msgs and psock is reserved. Schedule tx_work * to handle the failure there. Need to commit tx_stopped * before queuing work. */ smp_mb(); queue_work(kcm_wq, &psock->tx_kcm->tx_work); } spin_unlock_bh(&mux->lock); /* Report error on lower socket */ report_csk_error(csk, err); } /* RX mux lock held. */ static void kcm_update_rx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { STRP_STATS_ADD(mux->stats.rx_bytes, psock->strp.stats.bytes - psock->saved_rx_bytes); mux->stats.rx_msgs += psock->strp.stats.msgs - psock->saved_rx_msgs; psock->saved_rx_msgs = psock->strp.stats.msgs; psock->saved_rx_bytes = psock->strp.stats.bytes; } static void kcm_update_tx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { KCM_STATS_ADD(mux->stats.tx_bytes, psock->stats.tx_bytes - psock->saved_tx_bytes); mux->stats.tx_msgs += psock->stats.tx_msgs - psock->saved_tx_msgs; psock->saved_tx_msgs = psock->stats.tx_msgs; psock->saved_tx_bytes = psock->stats.tx_bytes; } static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); /* KCM is ready to receive messages on its queue-- either the KCM is new or * has become unblocked after being blocked on full socket buffer. Queue any * pending ready messages on a psock. RX mux lock held. */ static void kcm_rcv_ready(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; struct sk_buff *skb; if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled)) return; while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) { if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Assuming buffer limit has been reached */ skb_queue_head(&mux->rx_hold_queue, skb); WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); return; } } while (!list_empty(&mux->psocks_ready)) { psock = list_first_entry(&mux->psocks_ready, struct kcm_psock, psock_ready_list); if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) { /* Assuming buffer limit has been reached */ WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); return; } /* Consumed the ready message on the psock. Schedule rx_work to * get more messages. */ list_del(&psock->psock_ready_list); psock->ready_rx_msg = NULL; /* Commit clearing of ready_rx_msg for queuing work */ smp_mb(); strp_unpause(&psock->strp); strp_check_rcv(&psock->strp); } /* Buffer limit is okay now, add to ready list */ list_add_tail(&kcm->wait_rx_list, &kcm->mux->kcm_rx_waiters); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, true); } static void kcm_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct kcm_sock *kcm = kcm_sk(sk); struct kcm_mux *mux = kcm->mux; unsigned int len = skb->truesize; sk_mem_uncharge(sk, len); atomic_sub(len, &sk->sk_rmem_alloc); /* For reading rx_wait and rx_psock without holding lock */ smp_mb__after_atomic(); if (!READ_ONCE(kcm->rx_wait) && !READ_ONCE(kcm->rx_psock) && sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) { spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); spin_unlock_bh(&mux->rx_lock); } } static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) return -ENOMEM; if (!sk_rmem_schedule(sk, skb, skb->truesize)) return -ENOBUFS; skb->dev = NULL; skb_orphan(skb); skb->sk = sk; skb->destructor = kcm_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); skb_queue_tail(list, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 0; } /* Requeue received messages for a kcm socket to other kcm sockets. This is * called with a kcm socket is receive disabled. * RX mux lock held. */ static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head) { struct sk_buff *skb; struct kcm_sock *kcm; while ((skb = skb_dequeue(head))) { /* Reset destructor to avoid calling kcm_rcv_ready */ skb->destructor = sock_rfree; skb_orphan(skb); try_again: if (list_empty(&mux->kcm_rx_waiters)) { skb_queue_tail(&mux->rx_hold_queue, skb); continue; } kcm = list_first_entry(&mux->kcm_rx_waiters, struct kcm_sock, wait_rx_list); if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Should mean socket buffer full */ list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); /* Commit rx_wait to read in kcm_free */ smp_wmb(); goto try_again; } } } /* Lower sock lock held */ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, struct sk_buff *head) { struct kcm_mux *mux = psock->mux; struct kcm_sock *kcm; WARN_ON(psock->ready_rx_msg); if (psock->rx_kcm) return psock->rx_kcm; spin_lock_bh(&mux->rx_lock); if (psock->rx_kcm) { spin_unlock_bh(&mux->rx_lock); return psock->rx_kcm; } kcm_update_rx_mux_stats(mux, psock); if (list_empty(&mux->kcm_rx_waiters)) { psock->ready_rx_msg = head; strp_pause(&psock->strp); list_add_tail(&psock->psock_ready_list, &mux->psocks_ready); spin_unlock_bh(&mux->rx_lock); return NULL; } kcm = list_first_entry(&mux->kcm_rx_waiters, struct kcm_sock, wait_rx_list); list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); psock->rx_kcm = kcm; /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_psock, psock); spin_unlock_bh(&mux->rx_lock); return kcm; } static void kcm_done(struct kcm_sock *kcm); static void kcm_done_work(struct work_struct *w) { kcm_done(container_of(w, struct kcm_sock, done_work)); } /* Lower sock held */ static void unreserve_rx_kcm(struct kcm_psock *psock, bool rcv_ready) { struct kcm_sock *kcm = psock->rx_kcm; struct kcm_mux *mux = psock->mux; if (!kcm) return; spin_lock_bh(&mux->rx_lock); psock->rx_kcm = NULL; /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_psock, NULL); /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with * kcm_rfree */ smp_mb(); if (unlikely(kcm->done)) { spin_unlock_bh(&mux->rx_lock); /* Need to run kcm_done in a task since we need to qcquire * callback locks which may already be held here. */ INIT_WORK(&kcm->done_work, kcm_done_work); schedule_work(&kcm->done_work); return; } if (unlikely(kcm->rx_disabled)) { requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) { /* Check for degenerative race with rx_wait that all * data was dequeued (accounted for in kcm_rfree). */ kcm_rcv_ready(kcm); } spin_unlock_bh(&mux->rx_lock); } /* Lower sock lock held */ static void psock_data_ready(struct sock *sk) { struct kcm_psock *psock; trace_sk_data_ready(sk); read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; if (likely(psock)) strp_data_ready(&psock->strp); read_unlock_bh(&sk->sk_callback_lock); } /* Called with lower sock held */ static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct kcm_sock *kcm; try_queue: kcm = reserve_rx_kcm(psock, skb); if (!kcm) { /* Unable to reserve a KCM, message is held in psock and strp * is paused. */ return; } if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Should mean socket buffer full */ unreserve_rx_kcm(psock, false); goto try_queue; } } static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct bpf_prog *prog = psock->bpf_prog; int res; res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } static int kcm_read_sock_done(struct strparser *strp, int err) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); unreserve_rx_kcm(psock, true); return err; } static void psock_state_change(struct sock *sk) { /* TCP only does a EPOLLIN for a half close. Do a EPOLLHUP here * since application will normally not poll with EPOLLIN * on the TCP sockets. */ report_csk_error(sk, EPIPE); } static void psock_write_space(struct sock *sk) { struct kcm_psock *psock; struct kcm_mux *mux; struct kcm_sock *kcm; read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; if (unlikely(!psock)) goto out; mux = psock->mux; spin_lock_bh(&mux->lock); /* Check if the socket is reserved so someone is waiting for sending. */ kcm = psock->tx_kcm; if (kcm) queue_work(kcm_wq, &kcm->tx_work); spin_unlock_bh(&mux->lock); out: read_unlock_bh(&sk->sk_callback_lock); } static void unreserve_psock(struct kcm_sock *kcm); /* kcm sock is locked. */ static struct kcm_psock *reserve_psock(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; psock = kcm->tx_psock; smp_rmb(); /* Must read tx_psock before tx_wait */ if (psock) { WARN_ON(kcm->tx_wait); if (unlikely(psock->tx_stopped)) unreserve_psock(kcm); else return kcm->tx_psock; } spin_lock_bh(&mux->lock); /* Check again under lock to see if psock was reserved for this * psock via psock_unreserve. */ psock = kcm->tx_psock; if (unlikely(psock)) { WARN_ON(kcm->tx_wait); spin_unlock_bh(&mux->lock); return kcm->tx_psock; } if (!list_empty(&mux->psocks_avail)) { psock = list_first_entry(&mux->psocks_avail, struct kcm_psock, psock_avail_list); list_del(&psock->psock_avail_list); if (kcm->tx_wait) { list_del(&kcm->wait_psock_list); kcm->tx_wait = false; } kcm->tx_psock = psock; psock->tx_kcm = kcm; KCM_STATS_INCR(psock->stats.reserved); } else if (!kcm->tx_wait) { list_add_tail(&kcm->wait_psock_list, &mux->kcm_tx_waiters); kcm->tx_wait = true; } spin_unlock_bh(&mux->lock); return psock; } /* mux lock held */ static void psock_now_avail(struct kcm_psock *psock) { struct kcm_mux *mux = psock->mux; struct kcm_sock *kcm; if (list_empty(&mux->kcm_tx_waiters)) { list_add_tail(&psock->psock_avail_list, &mux->psocks_avail); } else { kcm = list_first_entry(&mux->kcm_tx_waiters, struct kcm_sock, wait_psock_list); list_del(&kcm->wait_psock_list); kcm->tx_wait = false; psock->tx_kcm = kcm; /* Commit before changing tx_psock since that is read in * reserve_psock before queuing work. */ smp_mb(); kcm->tx_psock = psock; KCM_STATS_INCR(psock->stats.reserved); queue_work(kcm_wq, &kcm->tx_work); } } /* kcm sock is locked. */ static void unreserve_psock(struct kcm_sock *kcm) { struct kcm_psock *psock; struct kcm_mux *mux = kcm->mux; spin_lock_bh(&mux->lock); psock = kcm->tx_psock; if (WARN_ON(!psock)) { spin_unlock_bh(&mux->lock); return; } smp_rmb(); /* Read tx_psock before tx_wait */ kcm_update_tx_mux_stats(mux, psock); WARN_ON(kcm->tx_wait); kcm->tx_psock = NULL; psock->tx_kcm = NULL; KCM_STATS_INCR(psock->stats.unreserved); if (unlikely(psock->tx_stopped)) { if (psock->done) { /* Deferred free */ list_del(&psock->psock_list); mux->psocks_cnt--; sock_put(psock->sk); fput(psock->sk->sk_socket->file); kmem_cache_free(kcm_psockp, psock); } /* Don't put back on available list */ spin_unlock_bh(&mux->lock); return; } psock_now_avail(psock); spin_unlock_bh(&mux->lock); } static void kcm_report_tx_retry(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; spin_lock_bh(&mux->lock); KCM_STATS_INCR(mux->stats.tx_retries); spin_unlock_bh(&mux->lock); } /* Write any messages ready on the kcm socket. Called with kcm sock lock * held. Return bytes actually sent or error. */ static int kcm_write_msgs(struct kcm_sock *kcm) { unsigned int total_sent = 0; struct sock *sk = &kcm->sk; struct kcm_psock *psock; struct sk_buff *head; int ret = 0; kcm->tx_wait_more = false; psock = kcm->tx_psock; if (unlikely(psock && psock->tx_stopped)) { /* A reserved psock was aborted asynchronously. Unreserve * it and we'll retry the message. */ unreserve_psock(kcm); kcm_report_tx_retry(kcm); if (skb_queue_empty(&sk->sk_write_queue)) return 0; kcm_tx_msg(skb_peek(&sk->sk_write_queue))->started_tx = false; } retry: while ((head = skb_peek(&sk->sk_write_queue))) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, }; struct kcm_tx_msg *txm = kcm_tx_msg(head); struct sk_buff *skb; unsigned int msize; int i; if (!txm->started_tx) { psock = reserve_psock(kcm); if (!psock) goto out; skb = head; txm->frag_offset = 0; txm->sent = 0; txm->started_tx = true; } else { if (WARN_ON(!psock)) { ret = -EINVAL; goto out; } skb = txm->frag_skb; } if (WARN_ON(!skb_shinfo(skb)->nr_frags) || WARN_ON_ONCE(!skb_frag_page(&skb_shinfo(skb)->frags[0]))) { ret = -EINVAL; goto out; } msize = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) msize += skb_frag_size(&skb_shinfo(skb)->frags[i]); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, (const struct bio_vec *)skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags, msize); iov_iter_advance(&msg.msg_iter, txm->frag_offset); do { ret = sock_sendmsg(psock->sk->sk_socket, &msg); if (ret <= 0) { if (ret == -EAGAIN) { /* Save state to try again when there's * write space on the socket */ txm->frag_skb = skb; ret = 0; goto out; } /* Hard failure in sending message, abort this * psock since it has lost framing * synchronization and retry sending the * message from the beginning. */ kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, true); unreserve_psock(kcm); psock = NULL; txm->started_tx = false; kcm_report_tx_retry(kcm); ret = 0; goto retry; } txm->sent += ret; txm->frag_offset += ret; KCM_STATS_ADD(psock->stats.tx_bytes, ret); } while (msg.msg_iter.count > 0); if (skb == head) { if (skb_has_frag_list(skb)) { txm->frag_skb = skb_shinfo(skb)->frag_list; txm->frag_offset = 0; continue; } } else if (skb->next) { txm->frag_skb = skb->next; txm->frag_offset = 0; continue; } /* Successfully sent the whole packet, account for it. */ sk->sk_wmem_queued -= txm->sent; total_sent += txm->sent; skb_dequeue(&sk->sk_write_queue); kfree_skb(head); KCM_STATS_INCR(psock->stats.tx_msgs); } out: if (!head) { /* Done with all queued messages. */ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); if (psock) unreserve_psock(kcm); } /* Check if write space is available */ sk->sk_write_space(sk); return total_sent ? : ret; } static void kcm_tx_work(struct work_struct *w) { struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work); struct sock *sk = &kcm->sk; int err; lock_sock(sk); /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx * aborts */ err = kcm_write_msgs(kcm); if (err < 0) { /* Hard failure in write, report error on KCM socket */ pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err); report_csk_error(&kcm->sk, -err); goto out; } /* Primarily for SOCK_SEQPACKET sockets */ if (likely(sk->sk_socket) && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_space(sk); } out: release_sock(sk); } static void kcm_push(struct kcm_sock *kcm) { if (kcm->tx_wait_more) kcm_write_msgs(kcm); } static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); struct sk_buff *skb = NULL, *head = NULL; size_t copy, copied = 0; long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); int eor = (sock->type == SOCK_DGRAM) ? !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR); int err = -EPIPE; mutex_lock(&kcm->tx_mutex); lock_sock(sk); /* Per tcp_sendmsg this should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err) goto out_error; if (kcm->seq_skb) { /* Previously opened message */ head = kcm->seq_skb; skb = kcm_tx_msg(head)->last_skb; goto start; } /* Call the sk_stream functions to manage the sndbuf mem. */ if (!sk_stream_memory_free(sk)) { kcm_push(kcm); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = sk_stream_wait_memory(sk, &timeo); if (err) goto out_error; } if (msg_data_left(msg)) { /* New message, alloc head skb */ head = alloc_skb(0, sk->sk_allocation); while (!head) { kcm_push(kcm); err = sk_stream_wait_memory(sk, &timeo); if (err) goto out_error; head = alloc_skb(0, sk->sk_allocation); } skb = head; /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling * csum_and_copy_from_iter from skb_do_copy_data_nocache. */ skb->ip_summed = CHECKSUM_UNNECESSARY; } start: while (msg_data_left(msg)) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) goto wait_for_memory; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { if (i == MAX_SKB_FRAGS) { struct sk_buff *tskb; tskb = alloc_skb(0, sk->sk_allocation); if (!tskb) goto wait_for_memory; if (head == skb) skb_shinfo(head)->frag_list = tskb; else skb->next = tskb; skb = tskb; skb->ip_summed = CHECKSUM_UNNECESSARY; continue; } merge = false; } if (msg->msg_flags & MSG_SPLICE_PAGES) { copy = msg_data_left(msg); if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; err = skb_splice_from_iter(skb, &msg->msg_iter, copy); if (err < 0) { if (err == -EMSGSIZE) goto wait_for_memory; goto out_error; } copy = err; skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); if (head != skb) head->truesize += copy; } else { copy = min_t(int, msg_data_left(msg), pfrag->size - pfrag->offset); if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, pfrag->offset, copy); if (err) goto out_error; /* Update the skb. */ if (merge) { skb_frag_size_add( &skb_shinfo(skb)->frags[i - 1], copy); } else { skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy); get_page(pfrag->page); } pfrag->offset += copy; } copied += copy; if (head != skb) { head->len += copy; head->data_len += copy; } continue; wait_for_memory: kcm_push(kcm); err = sk_stream_wait_memory(sk, &timeo); if (err) goto out_error; } if (eor) { bool not_busy = skb_queue_empty(&sk->sk_write_queue); if (head) { /* Message complete, queue it on send buffer */ __skb_queue_tail(&sk->sk_write_queue, head); kcm->seq_skb = NULL; KCM_STATS_INCR(kcm->stats.tx_msgs); } if (msg->msg_flags & MSG_BATCH) { kcm->tx_wait_more = true; } else if (kcm->tx_wait_more || not_busy) { err = kcm_write_msgs(kcm); if (err < 0) { /* We got a hard error in write_msgs but have * already queued this message. Report an error * in the socket, but don't affect return value * from sendmsg */ pr_warn("KCM: Hard failure on kcm_write_msgs\n"); report_csk_error(&kcm->sk, -err); } } } else { /* Message not complete, save state */ partial_message: if (head) { kcm->seq_skb = head; kcm_tx_msg(head)->last_skb = skb; } } KCM_STATS_ADD(kcm->stats.tx_bytes, copied); release_sock(sk); mutex_unlock(&kcm->tx_mutex); return copied; out_error: kcm_push(kcm); if (sock->type == SOCK_SEQPACKET) { /* Wrote some bytes before encountering an * error, return partial success. */ if (copied) goto partial_message; if (head != kcm->seq_skb) kfree_skb(head); } else { kfree_skb(head); kcm->seq_skb = NULL; } err = sk_stream_error(sk, msg->msg_flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) sk->sk_write_space(sk); release_sock(sk); mutex_unlock(&kcm->tx_mutex); return err; } static void kcm_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); if (skb_queue_empty_lockless(&sk->sk_write_queue)) return; lock_sock(sk); kcm_write_msgs(kcm); release_sock(sk); } static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); int err = 0; struct strp_msg *stm; int copied = 0; struct sk_buff *skb; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; /* Okay, have a message on the receive queue */ stm = strp_msg(skb); if (len > stm->full_len) len = stm->full_len; err = skb_copy_datagram_msg(skb, stm->offset, msg, len); if (err < 0) goto out; copied = len; if (likely(!(flags & MSG_PEEK))) { KCM_STATS_ADD(kcm->stats.rx_bytes, copied); if (copied < stm->full_len) { if (sock->type == SOCK_DGRAM) { /* Truncated message */ msg->msg_flags |= MSG_TRUNC; goto msg_finished; } stm->offset += copied; stm->full_len -= copied; } else { msg_finished: /* Finished with message */ msg->msg_flags |= MSG_EOR; KCM_STATS_INCR(kcm->stats.rx_msgs); } } out: skb_free_datagram(sk, skb); return copied ? : err; } static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); struct strp_msg *stm; int err = 0; ssize_t copied; struct sk_buff *skb; if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK) flags = MSG_DONTWAIT; else flags = 0; /* Only support splice for SOCKSEQPACKET */ skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto err_out; /* Okay, have a message on the receive queue */ stm = strp_msg(skb); if (len > stm->full_len) len = stm->full_len; copied = skb_splice_bits(skb, sk, stm->offset, pipe, len, flags); if (copied < 0) { err = copied; goto err_out; } KCM_STATS_ADD(kcm->stats.rx_bytes, copied); stm->offset += copied; stm->full_len -= copied; /* We have no way to return MSG_EOR. If all the bytes have been * read we still leave the message in the receive socket buffer. * A subsequent recvmsg needs to be done to return MSG_EOR and * finish reading the message. */ skb_free_datagram(sk, skb); return copied; err_out: skb_free_datagram(sk, skb); return err; } /* kcm sock lock held */ static void kcm_recv_disable(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; if (kcm->rx_disabled) return; spin_lock_bh(&mux->rx_lock); kcm->rx_disabled = 1; /* If a psock is reserved we'll do cleanup in unreserve */ if (!kcm->rx_psock) { if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); } requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); } spin_unlock_bh(&mux->rx_lock); } /* kcm sock lock held */ static void kcm_recv_enable(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; if (!kcm->rx_disabled) return; spin_lock_bh(&mux->rx_lock); kcm->rx_disabled = 0; kcm_rcv_ready(kcm); spin_unlock_bh(&mux->rx_lock); } static int kcm_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct kcm_sock *kcm = kcm_sk(sock->sk); int val, valbool; int err = 0; if (level != SOL_KCM) return -ENOPROTOOPT; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; valbool = val ? 1 : 0; switch (optname) { case KCM_RECV_DISABLE: lock_sock(&kcm->sk); if (valbool) kcm_recv_disable(kcm); else kcm_recv_enable(kcm); release_sock(&kcm->sk); break; default: err = -ENOPROTOOPT; } return err; } static int kcm_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct kcm_sock *kcm = kcm_sk(sock->sk); int val, len; if (level != SOL_KCM) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case KCM_RECV_DISABLE: val = kcm->rx_disabled; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) { struct kcm_sock *tkcm; struct list_head *head; int index = 0; /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so * we set sk_state, otherwise epoll_wait always returns right away with * EPOLLHUP */ kcm->sk.sk_state = TCP_ESTABLISHED; /* Add to mux's kcm sockets list */ kcm->mux = mux; spin_lock_bh(&mux->lock); head = &mux->kcm_socks; list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) { if (tkcm->index != index) break; head = &tkcm->kcm_sock_list; index++; } list_add(&kcm->kcm_sock_list, head); kcm->index = index; mux->kcm_socks_cnt++; spin_unlock_bh(&mux->lock); INIT_WORK(&kcm->tx_work, kcm_tx_work); mutex_init(&kcm->tx_mutex); spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); spin_unlock_bh(&mux->rx_lock); } static int kcm_attach(struct socket *sock, struct socket *csock, struct bpf_prog *prog) { struct kcm_sock *kcm = kcm_sk(sock->sk); struct kcm_mux *mux = kcm->mux; struct sock *csk; struct kcm_psock *psock = NULL, *tpsock; struct list_head *head; int index = 0; static const struct strp_callbacks cb = { .rcv_msg = kcm_rcv_strparser, .parse_msg = kcm_parse_func_strparser, .read_sock_done = kcm_read_sock_done, }; int err = 0; csk = csock->sk; if (!csk) return -EINVAL; lock_sock(csk); /* Only allow TCP sockets to be attached for now */ if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) || csk->sk_protocol != IPPROTO_TCP) { err = -EOPNOTSUPP; goto out; } /* Don't allow listeners or closed sockets */ if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) { err = -EOPNOTSUPP; goto out; } psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); if (!psock) { err = -ENOMEM; goto out; } psock->mux = mux; psock->sk = csk; psock->bpf_prog = prog; write_lock_bh(&csk->sk_callback_lock); /* Check if sk_user_data is already by KCM or someone else. * Must be done under lock to prevent race conditions. */ if (csk->sk_user_data) { write_unlock_bh(&csk->sk_callback_lock); kmem_cache_free(kcm_psockp, psock); err = -EALREADY; goto out; } err = strp_init(&psock->strp, csk, &cb); if (err) { write_unlock_bh(&csk->sk_callback_lock); kmem_cache_free(kcm_psockp, psock); goto out; } psock->save_data_ready = csk->sk_data_ready; psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; csk->sk_user_data = psock; csk->sk_data_ready = psock_data_ready; csk->sk_write_space = psock_write_space; csk->sk_state_change = psock_state_change; write_unlock_bh(&csk->sk_callback_lock); sock_hold(csk); /* Finished initialization, now add the psock to the MUX. */ spin_lock_bh(&mux->lock); head = &mux->psocks; list_for_each_entry(tpsock, &mux->psocks, psock_list) { if (tpsock->index != index) break; head = &tpsock->psock_list; index++; } list_add(&psock->psock_list, head); psock->index = index; KCM_STATS_INCR(mux->stats.psock_attach); mux->psocks_cnt++; psock_now_avail(psock); spin_unlock_bh(&mux->lock); /* Schedule RX work in case there are already bytes queued */ strp_check_rcv(&psock->strp); out: release_sock(csk); return err; } static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) { struct socket *csock; struct bpf_prog *prog; int err; csock = sockfd_lookup(info->fd, &err); if (!csock) return -ENOENT; prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out; } err = kcm_attach(sock, csock, prog); if (err) { bpf_prog_put(prog); goto out; } /* Keep reference on file also */ return 0; out: sockfd_put(csock); return err; } static void kcm_unattach(struct kcm_psock *psock) { struct sock *csk = psock->sk; struct kcm_mux *mux = psock->mux; lock_sock(csk); /* Stop getting callbacks from TCP socket. After this there should * be no way to reserve a kcm for this psock. */ write_lock_bh(&csk->sk_callback_lock); csk->sk_user_data = NULL; csk->sk_data_ready = psock->save_data_ready; csk->sk_write_space = psock->save_write_space; csk->sk_state_change = psock->save_state_change; strp_stop(&psock->strp); if (WARN_ON(psock->rx_kcm)) { write_unlock_bh(&csk->sk_callback_lock); release_sock(csk); return; } spin_lock_bh(&mux->rx_lock); /* Stop receiver activities. After this point psock should not be * able to get onto ready list either through callbacks or work. */ if (psock->ready_rx_msg) { list_del(&psock->psock_ready_list); kfree_skb(psock->ready_rx_msg); psock->ready_rx_msg = NULL; KCM_STATS_INCR(mux->stats.rx_ready_drops); } spin_unlock_bh(&mux->rx_lock); write_unlock_bh(&csk->sk_callback_lock); /* Call strp_done without sock lock */ release_sock(csk); strp_done(&psock->strp); lock_sock(csk); bpf_prog_put(psock->bpf_prog); spin_lock_bh(&mux->lock); aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats); save_strp_stats(&psock->strp, &mux->aggregate_strp_stats); KCM_STATS_INCR(mux->stats.psock_unattach); if (psock->tx_kcm) { /* psock was reserved. Just mark it finished and we will clean * up in the kcm paths, we need kcm lock which can not be * acquired here. */ KCM_STATS_INCR(mux->stats.psock_unattach_rsvd); spin_unlock_bh(&mux->lock); /* We are unattaching a socket that is reserved. Abort the * socket since we may be out of sync in sending on it. We need * to do this without the mux lock. */ kcm_abort_tx_psock(psock, EPIPE, false); spin_lock_bh(&mux->lock); if (!psock->tx_kcm) { /* psock now unreserved in window mux was unlocked */ goto no_reserved; } psock->done = 1; /* Commit done before queuing work to process it */ smp_mb(); /* Queue tx work to make sure psock->done is handled */ queue_work(kcm_wq, &psock->tx_kcm->tx_work); spin_unlock_bh(&mux->lock); } else { no_reserved: if (!psock->tx_stopped) list_del(&psock->psock_avail_list); list_del(&psock->psock_list); mux->psocks_cnt--; spin_unlock_bh(&mux->lock); sock_put(csk); fput(csk->sk_socket->file); kmem_cache_free(kcm_psockp, psock); } release_sock(csk); } static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) { struct kcm_sock *kcm = kcm_sk(sock->sk); struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; struct socket *csock; struct sock *csk; int err; csock = sockfd_lookup(info->fd, &err); if (!csock) return -ENOENT; csk = csock->sk; if (!csk) { err = -EINVAL; goto out; } err = -ENOENT; spin_lock_bh(&mux->lock); list_for_each_entry(psock, &mux->psocks, psock_list) { if (psock->sk != csk) continue; /* Found the matching psock */ if (psock->unattaching || WARN_ON(psock->done)) { err = -EALREADY; break; } psock->unattaching = 1; spin_unlock_bh(&mux->lock); /* Lower socket lock should already be held */ kcm_unattach(psock); err = 0; goto out; } spin_unlock_bh(&mux->lock); out: sockfd_put(csock); return err; } static struct proto kcm_proto = { .name = "KCM", .owner = THIS_MODULE, .obj_size = sizeof(struct kcm_sock), }; /* Clone a kcm socket. */ static struct file *kcm_clone(struct socket *osock) { struct socket *newsock; struct sock *newsk; newsock = sock_alloc(); if (!newsock) return ERR_PTR(-ENFILE); newsock->type = osock->type; newsock->ops = osock->ops; __module_get(newsock->ops->owner); newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, &kcm_proto, false); if (!newsk) { sock_release(newsock); return ERR_PTR(-ENOMEM); } sock_init_data(newsock, newsk); init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); } static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err; switch (cmd) { case SIOCKCMATTACH: { struct kcm_attach info; if (copy_from_user(&info, (void __user *)arg, sizeof(info))) return -EFAULT; err = kcm_attach_ioctl(sock, &info); break; } case SIOCKCMUNATTACH: { struct kcm_unattach info; if (copy_from_user(&info, (void __user *)arg, sizeof(info))) return -EFAULT; err = kcm_unattach_ioctl(sock, &info); break; } case SIOCKCMCLONE: { struct kcm_clone info; FD_PREPARE(fdf, 0, kcm_clone(sock)); if (fdf.err) return fdf.err; info.fd = fd_prepare_fd(fdf); if (copy_to_user((void __user *)arg, &info, sizeof(info))) return -EFAULT; fd_publish(fdf); err = 0; break; } default: err = -ENOIOCTLCMD; break; } return err; } static void release_mux(struct kcm_mux *mux) { struct kcm_net *knet = mux->knet; struct kcm_psock *psock, *tmp_psock; /* Release psocks */ list_for_each_entry_safe(psock, tmp_psock, &mux->psocks, psock_list) { if (!WARN_ON(psock->unattaching)) kcm_unattach(psock); } if (WARN_ON(mux->psocks_cnt)) return; __skb_queue_purge(&mux->rx_hold_queue); mutex_lock(&knet->mutex); aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats); aggregate_psock_stats(&mux->aggregate_psock_stats, &knet->aggregate_psock_stats); aggregate_strp_stats(&mux->aggregate_strp_stats, &knet->aggregate_strp_stats); list_del_rcu(&mux->kcm_mux_list); knet->count--; mutex_unlock(&knet->mutex); kfree_rcu(mux, rcu); } static void kcm_done(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct sock *sk = &kcm->sk; int socks_cnt; spin_lock_bh(&mux->rx_lock); if (kcm->rx_psock) { /* Cleanup in unreserve_rx_kcm */ WARN_ON(kcm->done); kcm->rx_disabled = 1; kcm->done = 1; spin_unlock_bh(&mux->rx_lock); return; } if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); } /* Move any pending receive messages to other kcm sockets */ requeue_rx_msgs(mux, &sk->sk_receive_queue); spin_unlock_bh(&mux->rx_lock); if (WARN_ON(sk_rmem_alloc_get(sk))) return; /* Detach from MUX */ spin_lock_bh(&mux->lock); list_del(&kcm->kcm_sock_list); mux->kcm_socks_cnt--; socks_cnt = mux->kcm_socks_cnt; spin_unlock_bh(&mux->lock); if (!socks_cnt) { /* We are done with the mux now. */ release_mux(mux); } WARN_ON(kcm->rx_wait); sock_put(&kcm->sk); } /* Called by kcm_release to close a KCM socket. * If this is the last KCM socket on the MUX, destroy the MUX. */ static int kcm_release(struct socket *sock) { struct sock *sk = sock->sk; struct kcm_sock *kcm; struct kcm_mux *mux; struct kcm_psock *psock; if (!sk) return 0; kcm = kcm_sk(sk); mux = kcm->mux; lock_sock(sk); sock_orphan(sk); kfree_skb(kcm->seq_skb); /* Purge queue under lock to avoid race condition with tx_work trying * to act when queue is nonempty. If tx_work runs after this point * it will just return. */ __skb_queue_purge(&sk->sk_write_queue); release_sock(sk); spin_lock_bh(&mux->lock); if (kcm->tx_wait) { /* Take of tx_wait list, after this point there should be no way * that a psock will be assigned to this kcm. */ list_del(&kcm->wait_psock_list); kcm->tx_wait = false; } spin_unlock_bh(&mux->lock); /* Cancel work. After this point there should be no outside references * to the kcm socket. */ disable_work_sync(&kcm->tx_work); lock_sock(sk); psock = kcm->tx_psock; if (psock) { /* A psock was reserved, so we need to kill it since it * may already have some bytes queued from a message. We * need to do this after removing kcm from tx_wait list. */ kcm_abort_tx_psock(psock, EPIPE, false); unreserve_psock(kcm); } release_sock(sk); WARN_ON(kcm->tx_wait); WARN_ON(kcm->tx_psock); sock->sk = NULL; kcm_done(kcm); return 0; } static const struct proto_ops kcm_dgram_ops = { .family = PF_KCM, .owner = THIS_MODULE, .release = kcm_release, .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = kcm_setsockopt, .getsockopt = kcm_getsockopt, .sendmsg = kcm_sendmsg, .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, .splice_eof = kcm_splice_eof, }; static const struct proto_ops kcm_seqpacket_ops = { .family = PF_KCM, .owner = THIS_MODULE, .release = kcm_release, .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = kcm_setsockopt, .getsockopt = kcm_getsockopt, .sendmsg = kcm_sendmsg, .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, .splice_eof = kcm_splice_eof, .splice_read = kcm_splice_read, }; /* Create proto operation for kcm sockets */ static int kcm_create(struct net *net, struct socket *sock, int protocol, int kern) { struct kcm_net *knet = net_generic(net, kcm_net_id); struct sock *sk; struct kcm_mux *mux; switch (sock->type) { case SOCK_DGRAM: sock->ops = &kcm_dgram_ops; break; case SOCK_SEQPACKET: sock->ops = &kcm_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT; } if (protocol != KCMPROTO_CONNECTED) return -EPROTONOSUPPORT; sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern); if (!sk) return -ENOMEM; /* Allocate a kcm mux, shared between KCM sockets */ mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL); if (!mux) { sk_free(sk); return -ENOMEM; } spin_lock_init(&mux->lock); spin_lock_init(&mux->rx_lock); INIT_LIST_HEAD(&mux->kcm_socks); INIT_LIST_HEAD(&mux->kcm_rx_waiters); INIT_LIST_HEAD(&mux->kcm_tx_waiters); INIT_LIST_HEAD(&mux->psocks); INIT_LIST_HEAD(&mux->psocks_ready); INIT_LIST_HEAD(&mux->psocks_avail); mux->knet = knet; /* Add new MUX to list */ mutex_lock(&knet->mutex); list_add_rcu(&mux->kcm_mux_list, &knet->mux_list); knet->count++; mutex_unlock(&knet->mutex); skb_queue_head_init(&mux->rx_hold_queue); /* Init KCM socket */ sock_init_data(sock, sk); init_kcm_sock(kcm_sk(sk), mux); return 0; } static const struct net_proto_family kcm_family_ops = { .family = PF_KCM, .create = kcm_create, .owner = THIS_MODULE, }; static __net_init int kcm_init_net(struct net *net) { struct kcm_net *knet = net_generic(net, kcm_net_id); INIT_LIST_HEAD_RCU(&knet->mux_list); mutex_init(&knet->mutex); return 0; } static __net_exit void kcm_exit_net(struct net *net) { struct kcm_net *knet = net_generic(net, kcm_net_id); /* All KCM sockets should be closed at this point, which should mean * that all multiplexors and psocks have been destroyed. */ WARN_ON(!list_empty(&knet->mux_list)); mutex_destroy(&knet->mutex); } static struct pernet_operations kcm_net_ops = { .init = kcm_init_net, .exit = kcm_exit_net, .id = &kcm_net_id, .size = sizeof(struct kcm_net), }; static int __init kcm_init(void) { int err = -ENOMEM; kcm_muxp = KMEM_CACHE(kcm_mux, SLAB_HWCACHE_ALIGN); if (!kcm_muxp) goto fail; kcm_psockp = KMEM_CACHE(kcm_psock, SLAB_HWCACHE_ALIGN); if (!kcm_psockp) goto fail; kcm_wq = create_singlethread_workqueue("kkcmd"); if (!kcm_wq) goto fail; err = proto_register(&kcm_proto, 1); if (err) goto fail; err = register_pernet_device(&kcm_net_ops); if (err) goto net_ops_fail; err = sock_register(&kcm_family_ops); if (err) goto sock_register_fail; err = kcm_proc_init(); if (err) goto proc_init_fail; return 0; proc_init_fail: sock_unregister(PF_KCM); sock_register_fail: unregister_pernet_device(&kcm_net_ops); net_ops_fail: proto_unregister(&kcm_proto); fail: kmem_cache_destroy(kcm_muxp); kmem_cache_destroy(kcm_psockp); if (kcm_wq) destroy_workqueue(kcm_wq); return err; } static void __exit kcm_exit(void) { kcm_proc_exit(); sock_unregister(PF_KCM); unregister_pernet_device(&kcm_net_ops); proto_unregister(&kcm_proto); destroy_workqueue(kcm_wq); kmem_cache_destroy(kcm_muxp); kmem_cache_destroy(kcm_psockp); } module_init(kcm_init); module_exit(kcm_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("KCM (Kernel Connection Multiplexor) sockets"); MODULE_ALIAS_NETPROTO(PF_KCM);
13 10 3 8 4 6 5 1 4 1 1 6 5 1 6 6 5 5 5 5 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct pause_req_info { struct ethnl_req_info base; enum ethtool_mac_stats_src src; }; #define PAUSE_REQINFO(__req_base) \ container_of(__req_base, struct pause_req_info, base) struct pause_reply_data { struct ethnl_reply_data base; struct ethtool_pauseparam pauseparam; struct ethtool_pause_stats pausestat; }; #define PAUSE_REPDATA(__reply_base) \ container_of(__reply_base, struct pause_reply_data, base) const struct nla_policy ethnl_pause_get_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), [ETHTOOL_A_PAUSE_STATS_SRC] = NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC), }; static int pause_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE; struct pause_req_info *req_info = PAUSE_REQINFO(req_base); if (tb[ETHTOOL_A_PAUSE_STATS_SRC]) { if (!(req_base->flags & ETHTOOL_FLAG_STATS)) { NL_SET_ERR_MSG_MOD(extack, "ETHTOOL_FLAG_STATS must be set when requesting a source of stats"); return -EINVAL; } src = nla_get_u32(tb[ETHTOOL_A_PAUSE_STATS_SRC]); } req_info->src = src; return 0; } static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { const struct pause_req_info *req_info = PAUSE_REQINFO(req_base); struct pause_reply_data *data = PAUSE_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; ethtool_stats_init((u64 *)&data->pausestat, sizeof(data->pausestat) / 8); data->pausestat.src = src; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || src == ETHTOOL_MAC_STATS_SRC_PMAC) && !__ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG_MOD(info->extack, "Device does not support MAC merge layer"); ethnl_ops_complete(dev); return -EOPNOTSUPP; } dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); if (req_base->flags & ETHTOOL_FLAG_STATS && dev->ethtool_ops->get_pause_stats) dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); ethnl_ops_complete(dev); return 0; } static int pause_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ nla_total_size(sizeof(u8)) + /* _PAUSE_RX */ nla_total_size(sizeof(u8)); /* _PAUSE_TX */ if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ nla_total_size(sizeof(u32)) + /* _PAUSE_STATS_SRC */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; } static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype, u16 padtype) { if (val == ETHTOOL_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, padtype)) return -EMSGSIZE; return 0; } static int pause_put_stats(struct sk_buff *skb, const struct ethtool_pause_stats *pause_stats) { const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD; struct nlattr *nest; if (nla_put_u32(skb, ETHTOOL_A_PAUSE_STATS_SRC, pause_stats->src)) return -EMSGSIZE; nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS); if (!nest) return -EMSGSIZE; if (ethtool_put_stat(skb, pause_stats->tx_pause_frames, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) || ethtool_put_stat(skb, pause_stats->rx_pause_frames, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int pause_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct pause_reply_data *data = PAUSE_REPDATA(reply_base); const struct ethtool_pauseparam *pauseparam = &data->pauseparam; if (nla_put_u8(skb, ETHTOOL_A_PAUSE_AUTONEG, !!pauseparam->autoneg) || nla_put_u8(skb, ETHTOOL_A_PAUSE_RX, !!pauseparam->rx_pause) || nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause)) return -EMSGSIZE; if (req_base->flags & ETHTOOL_FLAG_STATS && pause_put_stats(skb, &data->pausestat)) return -EMSGSIZE; return 0; } /* PAUSE_SET */ const struct nla_policy ethnl_pause_set_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_STATS_SRC] = { .type = NLA_REJECT }, }; static int ethnl_set_pause_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_pauseparam && ops->set_pauseparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_pause(struct ethnl_req_info *req_info, struct genl_info *info) { struct net_device *dev = req_info->dev; struct ethtool_pauseparam params = {}; struct nlattr **tb = info->attrs; bool mod = false; int ret; dev->ethtool_ops->get_pauseparam(dev, &params); ethnl_update_bool32(&params.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod); ethnl_update_bool32(&params.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod); ethnl_update_bool32(&params.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod); if (!mod) return 0; ret = dev->ethtool_ops->set_pauseparam(dev, &params); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_pause_request_ops = { .request_cmd = ETHTOOL_MSG_PAUSE_GET, .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY, .hdr_attr = ETHTOOL_A_PAUSE_HEADER, .req_info_size = sizeof(struct pause_req_info), .reply_data_size = sizeof(struct pause_reply_data), .parse_request = pause_parse_request, .prepare_data = pause_prepare_data, .reply_size = pause_reply_size, .fill_reply = pause_fill_reply, .set_validate = ethnl_set_pause_validate, .set = ethnl_set_pause, .set_ntf_cmd = ETHTOOL_MSG_PAUSE_NTF, };
36 36 36 45 45 45 10 10 7 2 1 1 2 2 1 1 35 35 36 36 7 1 2 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 // SPDX-License-Identifier: GPL-2.0-only /* * Vxlan vni filter for collect metadata mode * * Authors: Roopa Prabhu <roopa@nvidia.com> * */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/etherdevice.h> #include <linux/rhashtable.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/vxlan.h> #include "vxlan_private.h" static inline int vxlan_vni_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct vxlan_vni_node *vnode = ptr; __be32 vni = *(__be32 *)arg->key; return vnode->vni != vni; } const struct rhashtable_params vxlan_vni_rht_params = { .head_offset = offsetof(struct vxlan_vni_node, vnode), .key_offset = offsetof(struct vxlan_vni_node, vni), .key_len = sizeof(__be32), .nelem_hint = 3, .max_size = VXLAN_N_VID, .obj_cmpfn = vxlan_vni_cmp, .automatic_shrinking = true, }; static void vxlan_vs_add_del_vninode(struct vxlan_dev *vxlan, struct vxlan_vni_node *v, bool del) { struct vxlan_dev_node *node; struct vxlan_sock *vs; ASSERT_RTNL(); if (del) { if (!hlist_unhashed(&v->hlist4.hlist)) hlist_del_init_rcu(&v->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) if (!hlist_unhashed(&v->hlist6.hlist)) hlist_del_init_rcu(&v->hlist6.hlist); #endif return; } #if IS_ENABLED(CONFIG_IPV6) vs = rtnl_dereference(vxlan->vn6_sock); if (vs && v) { node = &v->hlist6; hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni)); } #endif vs = rtnl_dereference(vxlan->vn4_sock); if (vs && v) { node = &v->hlist4; hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni)); } } void vxlan_vs_add_vnigrp(struct vxlan_dev *vxlan, struct vxlan_sock *vs, bool ipv6) { struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp); struct vxlan_vni_node *v, *tmp; struct vxlan_dev_node *node; ASSERT_RTNL(); if (!vg) return; list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { #if IS_ENABLED(CONFIG_IPV6) if (ipv6) node = &v->hlist6; else #endif node = &v->hlist4; node->vxlan = vxlan; hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni)); } } void vxlan_vs_del_vnigrp(struct vxlan_dev *vxlan) { struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp); struct vxlan_vni_node *v, *tmp; ASSERT_RTNL(); if (!vg) return; list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { hlist_del_init_rcu(&v->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) hlist_del_init_rcu(&v->hlist6.hlist); #endif } } static void vxlan_vnifilter_stats_get(const struct vxlan_vni_node *vninode, struct vxlan_vni_stats *dest) { int i; memset(dest, 0, sizeof(*dest)); for_each_possible_cpu(i) { struct vxlan_vni_stats_pcpu *pstats; struct vxlan_vni_stats temp; unsigned int start; pstats = per_cpu_ptr(vninode->stats, i); do { start = u64_stats_fetch_begin(&pstats->syncp); memcpy(&temp, &pstats->stats, sizeof(temp)); } while (u64_stats_fetch_retry(&pstats->syncp, start)); dest->rx_packets += temp.rx_packets; dest->rx_bytes += temp.rx_bytes; dest->rx_drops += temp.rx_drops; dest->rx_errors += temp.rx_errors; dest->tx_packets += temp.tx_packets; dest->tx_bytes += temp.tx_bytes; dest->tx_drops += temp.tx_drops; dest->tx_errors += temp.tx_errors; } } static void vxlan_vnifilter_stats_add(struct vxlan_vni_node *vninode, int type, unsigned int len) { struct vxlan_vni_stats_pcpu *pstats = this_cpu_ptr(vninode->stats); u64_stats_update_begin(&pstats->syncp); switch (type) { case VXLAN_VNI_STATS_RX: pstats->stats.rx_bytes += len; pstats->stats.rx_packets++; break; case VXLAN_VNI_STATS_RX_DROPS: pstats->stats.rx_drops++; break; case VXLAN_VNI_STATS_RX_ERRORS: pstats->stats.rx_errors++; break; case VXLAN_VNI_STATS_TX: pstats->stats.tx_bytes += len; pstats->stats.tx_packets++; break; case VXLAN_VNI_STATS_TX_DROPS: pstats->stats.tx_drops++; break; case VXLAN_VNI_STATS_TX_ERRORS: pstats->stats.tx_errors++; break; } u64_stats_update_end(&pstats->syncp); } void vxlan_vnifilter_count(struct vxlan_dev *vxlan, __be32 vni, struct vxlan_vni_node *vninode, int type, unsigned int len) { struct vxlan_vni_node *vnode; if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)) return; if (vninode) { vnode = vninode; } else { vnode = vxlan_vnifilter_lookup(vxlan, vni); if (!vnode) return; } vxlan_vnifilter_stats_add(vnode, type, len); } static u32 vnirange(struct vxlan_vni_node *vbegin, struct vxlan_vni_node *vend) { return (be32_to_cpu(vend->vni) - be32_to_cpu(vbegin->vni)); } static size_t vxlan_vnifilter_entry_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct tunnel_msg)) + nla_total_size(0) /* VXLAN_VNIFILTER_ENTRY */ + nla_total_size(sizeof(u32)) /* VXLAN_VNIFILTER_ENTRY_START */ + nla_total_size(sizeof(u32)) /* VXLAN_VNIFILTER_ENTRY_END */ + nla_total_size(sizeof(struct in6_addr));/* VXLAN_VNIFILTER_ENTRY_GROUP{6} */ } static int __vnifilter_entry_fill_stats(struct sk_buff *skb, const struct vxlan_vni_node *vbegin) { struct vxlan_vni_stats vstats; struct nlattr *vstats_attr; vstats_attr = nla_nest_start(skb, VXLAN_VNIFILTER_ENTRY_STATS); if (!vstats_attr) goto out_stats_err; vxlan_vnifilter_stats_get(vbegin, &vstats); if (nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_BYTES, vstats.rx_bytes, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_PKTS, vstats.rx_packets, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_DROPS, vstats.rx_drops, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_ERRORS, vstats.rx_errors, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_BYTES, vstats.tx_bytes, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_PKTS, vstats.tx_packets, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_DROPS, vstats.tx_drops, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_ERRORS, vstats.tx_errors, VNIFILTER_ENTRY_STATS_PAD)) goto out_stats_err; nla_nest_end(skb, vstats_attr); return 0; out_stats_err: nla_nest_cancel(skb, vstats_attr); return -EMSGSIZE; } static bool vxlan_fill_vni_filter_entry(struct sk_buff *skb, struct vxlan_vni_node *vbegin, struct vxlan_vni_node *vend, bool fill_stats) { struct nlattr *ventry; u32 vs = be32_to_cpu(vbegin->vni); u32 ve = 0; if (vbegin != vend) ve = be32_to_cpu(vend->vni); ventry = nla_nest_start(skb, VXLAN_VNIFILTER_ENTRY); if (!ventry) return false; if (nla_put_u32(skb, VXLAN_VNIFILTER_ENTRY_START, vs)) goto out_err; if (ve && nla_put_u32(skb, VXLAN_VNIFILTER_ENTRY_END, ve)) goto out_err; if (!vxlan_addr_any(&vbegin->remote_ip)) { if (vbegin->remote_ip.sa.sa_family == AF_INET) { if (nla_put_in_addr(skb, VXLAN_VNIFILTER_ENTRY_GROUP, vbegin->remote_ip.sin.sin_addr.s_addr)) goto out_err; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(skb, VXLAN_VNIFILTER_ENTRY_GROUP6, &vbegin->remote_ip.sin6.sin6_addr)) goto out_err; #endif } } if (fill_stats && __vnifilter_entry_fill_stats(skb, vbegin)) goto out_err; nla_nest_end(skb, ventry); return true; out_err: nla_nest_cancel(skb, ventry); return false; } static void vxlan_vnifilter_notify(const struct vxlan_dev *vxlan, struct vxlan_vni_node *vninode, int cmd) { struct tunnel_msg *tmsg; struct sk_buff *skb; struct nlmsghdr *nlh; struct net *net = dev_net(vxlan->dev); int err = -ENOBUFS; skb = nlmsg_new(vxlan_vnifilter_entry_nlmsg_size(), GFP_KERNEL); if (!skb) goto out_err; err = -EMSGSIZE; nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*tmsg), 0); if (!nlh) goto out_err; tmsg = nlmsg_data(nlh); memset(tmsg, 0, sizeof(*tmsg)); tmsg->family = AF_BRIDGE; tmsg->ifindex = vxlan->dev->ifindex; if (!vxlan_fill_vni_filter_entry(skb, vninode, vninode, false)) goto out_err; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_TUNNEL, NULL, GFP_KERNEL); return; out_err: rtnl_set_sk_err(net, RTNLGRP_TUNNEL, err); kfree_skb(skb); } static int vxlan_vnifilter_dump_dev(const struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { struct vxlan_vni_node *tmp, *v, *vbegin = NULL, *vend = NULL; struct vxlan_dev *vxlan = netdev_priv(dev); struct tunnel_msg *new_tmsg, *tmsg; int idx = 0, s_idx = cb->args[1]; struct vxlan_vni_group *vg; struct nlmsghdr *nlh; bool dump_stats; int err = 0; if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)) return -EINVAL; /* RCU needed because of the vni locking rules (rcu || rtnl) */ vg = rcu_dereference(vxlan->vnigrp); if (!vg || !vg->num_vnis) return 0; tmsg = nlmsg_data(cb->nlh); dump_stats = !!(tmsg->flags & TUNNEL_MSG_FLAG_STATS); nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWTUNNEL, sizeof(*new_tmsg), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; new_tmsg = nlmsg_data(nlh); memset(new_tmsg, 0, sizeof(*new_tmsg)); new_tmsg->family = PF_BRIDGE; new_tmsg->ifindex = dev->ifindex; list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { if (idx < s_idx) { idx++; continue; } if (!vbegin) { vbegin = v; vend = v; continue; } if (!dump_stats && vnirange(vend, v) == 1 && vxlan_addr_equal(&v->remote_ip, &vend->remote_ip)) { goto update_end; } else { if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend, dump_stats)) { err = -EMSGSIZE; break; } idx += vnirange(vbegin, vend) + 1; vbegin = v; } update_end: vend = v; } if (!err && vbegin) { if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend, dump_stats)) err = -EMSGSIZE; } cb->args[1] = err ? idx : 0; nlmsg_end(skb, nlh); return err; } static int vxlan_vnifilter_dump(struct sk_buff *skb, struct netlink_callback *cb) { int idx = 0, err = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); struct tunnel_msg *tmsg; struct net_device *dev; tmsg = nlmsg_payload(cb->nlh, sizeof(*tmsg)); if (!tmsg) { NL_SET_ERR_MSG(cb->extack, "Invalid msg length"); return -EINVAL; } if (tmsg->flags & ~TUNNEL_MSG_VALID_USER_FLAGS) { NL_SET_ERR_MSG(cb->extack, "Invalid tunnelmsg flags in ancillary header"); return -EINVAL; } rcu_read_lock(); if (tmsg->ifindex) { dev = dev_get_by_index_rcu(net, tmsg->ifindex); if (!dev) { err = -ENODEV; goto out_err; } if (!netif_is_vxlan(dev)) { NL_SET_ERR_MSG(cb->extack, "The device is not a vxlan device"); err = -EINVAL; goto out_err; } err = vxlan_vnifilter_dump_dev(dev, skb, cb); /* if the dump completed without an error we return 0 here */ if (err != -EMSGSIZE) goto out_err; } else { for_each_netdev_rcu(net, dev) { if (!netif_is_vxlan(dev)) continue; if (idx < s_idx) goto skip; err = vxlan_vnifilter_dump_dev(dev, skb, cb); if (err == -EMSGSIZE) break; skip: idx++; } } cb->args[0] = idx; rcu_read_unlock(); return skb->len; out_err: rcu_read_unlock(); return err; } static const struct nla_policy vni_filter_entry_policy[VXLAN_VNIFILTER_ENTRY_MAX + 1] = { [VXLAN_VNIFILTER_ENTRY_START] = { .type = NLA_U32 }, [VXLAN_VNIFILTER_ENTRY_END] = { .type = NLA_U32 }, [VXLAN_VNIFILTER_ENTRY_GROUP] = { .type = NLA_BINARY, .len = sizeof_field(struct iphdr, daddr) }, [VXLAN_VNIFILTER_ENTRY_GROUP6] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr) }, }; static const struct nla_policy vni_filter_policy[VXLAN_VNIFILTER_MAX + 1] = { [VXLAN_VNIFILTER_ENTRY] = { .type = NLA_NESTED }, }; static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, union vxlan_addr *old_remote_ip, union vxlan_addr *remote_ip, struct netlink_ext_ack *extack) { struct vxlan_rdst *dst = &vxlan->default_dst; int err = 0; spin_lock_bh(&vxlan->hash_lock); if (remote_ip && !vxlan_addr_any(remote_ip)) { err = vxlan_fdb_update(vxlan, all_zeros_mac, remote_ip, NUD_REACHABLE | NUD_PERMANENT, NLM_F_APPEND | NLM_F_CREATE, vxlan->cfg.dst_port, vni, vni, dst->remote_ifindex, NTF_SELF, 0, true, extack); if (err) { spin_unlock_bh(&vxlan->hash_lock); return err; } } if (old_remote_ip && !vxlan_addr_any(old_remote_ip)) { __vxlan_fdb_delete(vxlan, all_zeros_mac, *old_remote_ip, vxlan->cfg.dst_port, vni, vni, dst->remote_ifindex, true); } spin_unlock_bh(&vxlan->hash_lock); return err; } static int vxlan_vni_update_group(struct vxlan_dev *vxlan, struct vxlan_vni_node *vninode, union vxlan_addr *group, bool create, bool *changed, struct netlink_ext_ack *extack) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); struct vxlan_rdst *dst = &vxlan->default_dst; union vxlan_addr *newrip = NULL, *oldrip = NULL; union vxlan_addr old_remote_ip; int ret = 0; memcpy(&old_remote_ip, &vninode->remote_ip, sizeof(old_remote_ip)); /* if per vni remote ip is not present use vxlan dev * default dst remote ip for fdb entry */ if (group && !vxlan_addr_any(group)) { newrip = group; } else { if (!vxlan_addr_any(&dst->remote_ip)) newrip = &dst->remote_ip; } /* if old rip exists, and no newrip, * explicitly delete old rip */ if (!newrip && !vxlan_addr_any(&old_remote_ip)) oldrip = &old_remote_ip; if (!newrip && !oldrip) return 0; if (!create && oldrip && newrip && vxlan_addr_equal(oldrip, newrip)) return 0; ret = vxlan_update_default_fdb_entry(vxlan, vninode->vni, oldrip, newrip, extack); if (ret) goto out; if (group) memcpy(&vninode->remote_ip, group, sizeof(vninode->remote_ip)); if (vxlan->dev->flags & IFF_UP) { if (vxlan_addr_multicast(&old_remote_ip) && !vxlan_group_used(vn, vxlan, vninode->vni, &old_remote_ip, vxlan->default_dst.remote_ifindex)) { ret = vxlan_igmp_leave(vxlan, &old_remote_ip, 0); if (ret) goto out; } if (vxlan_addr_multicast(&vninode->remote_ip)) { ret = vxlan_igmp_join(vxlan, &vninode->remote_ip, 0); if (ret == -EADDRINUSE) ret = 0; if (ret) goto out; } } *changed = true; return 0; out: return ret; } int vxlan_vnilist_update_group(struct vxlan_dev *vxlan, union vxlan_addr *old_remote_ip, union vxlan_addr *new_remote_ip, struct netlink_ext_ack *extack) { struct list_head *headp, *hpos; struct vxlan_vni_group *vg; struct vxlan_vni_node *vent; int ret; vg = rtnl_dereference(vxlan->vnigrp); headp = &vg->vni_list; list_for_each_prev(hpos, headp) { vent = list_entry(hpos, struct vxlan_vni_node, vlist); if (vxlan_addr_any(&vent->remote_ip)) { ret = vxlan_update_default_fdb_entry(vxlan, vent->vni, old_remote_ip, new_remote_ip, extack); if (ret) return ret; } } return 0; } static void vxlan_vni_delete_group(struct vxlan_dev *vxlan, struct vxlan_vni_node *vninode) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); struct vxlan_rdst *dst = &vxlan->default_dst; /* if per vni remote_ip not present, delete the * default dst remote_ip previously added for this vni */ if (!vxlan_addr_any(&vninode->remote_ip) || !vxlan_addr_any(&dst->remote_ip)) { spin_lock_bh(&vxlan->hash_lock); __vxlan_fdb_delete(vxlan, all_zeros_mac, (vxlan_addr_any(&vninode->remote_ip) ? dst->remote_ip : vninode->remote_ip), vxlan->cfg.dst_port, vninode->vni, vninode->vni, dst->remote_ifindex, true); spin_unlock_bh(&vxlan->hash_lock); } if (vxlan->dev->flags & IFF_UP) { if (vxlan_addr_multicast(&vninode->remote_ip) && !vxlan_group_used(vn, vxlan, vninode->vni, &vninode->remote_ip, dst->remote_ifindex)) { vxlan_igmp_leave(vxlan, &vninode->remote_ip, 0); } } } static int vxlan_vni_update(struct vxlan_dev *vxlan, struct vxlan_vni_group *vg, __be32 vni, union vxlan_addr *group, bool *changed, struct netlink_ext_ack *extack) { struct vxlan_vni_node *vninode; int ret; vninode = rhashtable_lookup_fast(&vg->vni_hash, &vni, vxlan_vni_rht_params); if (!vninode) return 0; ret = vxlan_vni_update_group(vxlan, vninode, group, false, changed, extack); if (ret) return ret; if (changed) vxlan_vnifilter_notify(vxlan, vninode, RTM_NEWTUNNEL); return 0; } static void __vxlan_vni_add_list(struct vxlan_vni_group *vg, struct vxlan_vni_node *v) { struct list_head *headp, *hpos; struct vxlan_vni_node *vent; headp = &vg->vni_list; list_for_each_prev(hpos, headp) { vent = list_entry(hpos, struct vxlan_vni_node, vlist); if (be32_to_cpu(v->vni) < be32_to_cpu(vent->vni)) continue; else break; } list_add_rcu(&v->vlist, hpos); vg->num_vnis++; } static void __vxlan_vni_del_list(struct vxlan_vni_group *vg, struct vxlan_vni_node *v) { list_del_rcu(&v->vlist); vg->num_vnis--; } static struct vxlan_vni_node *vxlan_vni_alloc(struct vxlan_dev *vxlan, __be32 vni) { struct vxlan_vni_node *vninode; vninode = kzalloc(sizeof(*vninode), GFP_KERNEL); if (!vninode) return NULL; vninode->stats = netdev_alloc_pcpu_stats(struct vxlan_vni_stats_pcpu); if (!vninode->stats) { kfree(vninode); return NULL; } vninode->vni = vni; vninode->hlist4.vxlan = vxlan; #if IS_ENABLED(CONFIG_IPV6) vninode->hlist6.vxlan = vxlan; #endif return vninode; } static void vxlan_vni_free(struct vxlan_vni_node *vninode) { free_percpu(vninode->stats); kfree(vninode); } static int vxlan_vni_add(struct vxlan_dev *vxlan, struct vxlan_vni_group *vg, u32 vni, union vxlan_addr *group, struct netlink_ext_ack *extack) { struct vxlan_vni_node *vninode; __be32 v = cpu_to_be32(vni); bool changed = false; int err = 0; if (vxlan_vnifilter_lookup(vxlan, v)) return vxlan_vni_update(vxlan, vg, v, group, &changed, extack); err = vxlan_vni_in_use(vxlan->net, vxlan, &vxlan->cfg, v); if (err) { NL_SET_ERR_MSG(extack, "VNI in use"); return err; } vninode = vxlan_vni_alloc(vxlan, v); if (!vninode) return -ENOMEM; err = rhashtable_lookup_insert_fast(&vg->vni_hash, &vninode->vnode, vxlan_vni_rht_params); if (err) { vxlan_vni_free(vninode); return err; } __vxlan_vni_add_list(vg, vninode); if (vxlan->dev->flags & IFF_UP) vxlan_vs_add_del_vninode(vxlan, vninode, false); err = vxlan_vni_update_group(vxlan, vninode, group, true, &changed, extack); if (changed) vxlan_vnifilter_notify(vxlan, vninode, RTM_NEWTUNNEL); return err; } static void vxlan_vni_node_rcu_free(struct rcu_head *rcu) { struct vxlan_vni_node *v; v = container_of(rcu, struct vxlan_vni_node, rcu); vxlan_vni_free(v); } static int vxlan_vni_del(struct vxlan_dev *vxlan, struct vxlan_vni_group *vg, u32 vni, struct netlink_ext_ack *extack) { struct vxlan_vni_node *vninode; __be32 v = cpu_to_be32(vni); int err = 0; vg = rtnl_dereference(vxlan->vnigrp); vninode = rhashtable_lookup_fast(&vg->vni_hash, &v, vxlan_vni_rht_params); if (!vninode) { err = -ENOENT; goto out; } vxlan_vni_delete_group(vxlan, vninode); err = rhashtable_remove_fast(&vg->vni_hash, &vninode->vnode, vxlan_vni_rht_params); if (err) goto out; __vxlan_vni_del_list(vg, vninode); vxlan_vnifilter_notify(vxlan, vninode, RTM_DELTUNNEL); if (vxlan->dev->flags & IFF_UP) vxlan_vs_add_del_vninode(vxlan, vninode, true); call_rcu(&vninode->rcu, vxlan_vni_node_rcu_free); return 0; out: return err; } static int vxlan_vni_add_del(struct vxlan_dev *vxlan, __u32 start_vni, __u32 end_vni, union vxlan_addr *group, int cmd, struct netlink_ext_ack *extack) { struct vxlan_vni_group *vg; int v, err = 0; vg = rtnl_dereference(vxlan->vnigrp); for (v = start_vni; v <= end_vni; v++) { switch (cmd) { case RTM_NEWTUNNEL: err = vxlan_vni_add(vxlan, vg, v, group, extack); break; case RTM_DELTUNNEL: err = vxlan_vni_del(vxlan, vg, v, extack); break; default: err = -EOPNOTSUPP; break; } if (err) goto out; } return 0; out: return err; } static int vxlan_process_vni_filter(struct vxlan_dev *vxlan, struct nlattr *nlvnifilter, int cmd, struct netlink_ext_ack *extack) { struct nlattr *vattrs[VXLAN_VNIFILTER_ENTRY_MAX + 1]; u32 vni_start = 0, vni_end = 0; union vxlan_addr group; int err; err = nla_parse_nested(vattrs, VXLAN_VNIFILTER_ENTRY_MAX, nlvnifilter, vni_filter_entry_policy, extack); if (err) return err; if (vattrs[VXLAN_VNIFILTER_ENTRY_START]) { vni_start = nla_get_u32(vattrs[VXLAN_VNIFILTER_ENTRY_START]); vni_end = vni_start; } if (vattrs[VXLAN_VNIFILTER_ENTRY_END]) vni_end = nla_get_u32(vattrs[VXLAN_VNIFILTER_ENTRY_END]); if (!vni_start && !vni_end) { NL_SET_ERR_MSG_ATTR(extack, nlvnifilter, "vni start nor end found in vni entry"); return -EINVAL; } if (vattrs[VXLAN_VNIFILTER_ENTRY_GROUP]) { group.sin.sin_addr.s_addr = nla_get_in_addr(vattrs[VXLAN_VNIFILTER_ENTRY_GROUP]); group.sa.sa_family = AF_INET; } else if (vattrs[VXLAN_VNIFILTER_ENTRY_GROUP6]) { group.sin6.sin6_addr = nla_get_in6_addr(vattrs[VXLAN_VNIFILTER_ENTRY_GROUP6]); group.sa.sa_family = AF_INET6; } else { memset(&group, 0, sizeof(group)); } if (vxlan_addr_multicast(&group) && !vxlan->default_dst.remote_ifindex) { NL_SET_ERR_MSG(extack, "Local interface required for multicast remote group"); return -EINVAL; } err = vxlan_vni_add_del(vxlan, vni_start, vni_end, &group, cmd, extack); if (err) return err; return 0; } void vxlan_vnigroup_uninit(struct vxlan_dev *vxlan) { struct vxlan_vni_node *v, *tmp; struct vxlan_vni_group *vg; vg = rtnl_dereference(vxlan->vnigrp); list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { rhashtable_remove_fast(&vg->vni_hash, &v->vnode, vxlan_vni_rht_params); hlist_del_init_rcu(&v->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) hlist_del_init_rcu(&v->hlist6.hlist); #endif __vxlan_vni_del_list(vg, v); vxlan_vnifilter_notify(vxlan, v, RTM_DELTUNNEL); call_rcu(&v->rcu, vxlan_vni_node_rcu_free); } rhashtable_destroy(&vg->vni_hash); kfree(vg); } int vxlan_vnigroup_init(struct vxlan_dev *vxlan) { struct vxlan_vni_group *vg; int ret; vg = kzalloc(sizeof(*vg), GFP_KERNEL); if (!vg) return -ENOMEM; ret = rhashtable_init(&vg->vni_hash, &vxlan_vni_rht_params); if (ret) { kfree(vg); return ret; } INIT_LIST_HEAD(&vg->vni_list); rcu_assign_pointer(vxlan->vnigrp, vg); return 0; } static int vxlan_vnifilter_process(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct tunnel_msg *tmsg; struct vxlan_dev *vxlan; struct net_device *dev; struct nlattr *attr; int err, vnis = 0; int rem; /* this should validate the header and check for remaining bytes */ err = nlmsg_parse(nlh, sizeof(*tmsg), NULL, VXLAN_VNIFILTER_MAX, vni_filter_policy, extack); if (err < 0) return err; tmsg = nlmsg_data(nlh); dev = __dev_get_by_index(net, tmsg->ifindex); if (!dev) return -ENODEV; if (!netif_is_vxlan(dev)) { NL_SET_ERR_MSG_MOD(extack, "The device is not a vxlan device"); return -EINVAL; } vxlan = netdev_priv(dev); if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)) return -EOPNOTSUPP; nlmsg_for_each_attr_type(attr, VXLAN_VNIFILTER_ENTRY, nlh, sizeof(*tmsg), rem) { err = vxlan_process_vni_filter(vxlan, attr, nlh->nlmsg_type, extack); vnis++; if (err) break; } if (!vnis) { NL_SET_ERR_MSG_MOD(extack, "No vnis found to process"); err = -EINVAL; } return err; } static const struct rtnl_msg_handler vxlan_vnifilter_rtnl_msg_handlers[] = { {THIS_MODULE, PF_BRIDGE, RTM_GETTUNNEL, NULL, vxlan_vnifilter_dump, 0}, {THIS_MODULE, PF_BRIDGE, RTM_NEWTUNNEL, vxlan_vnifilter_process, NULL, 0}, {THIS_MODULE, PF_BRIDGE, RTM_DELTUNNEL, vxlan_vnifilter_process, NULL, 0}, }; int vxlan_vnifilter_init(void) { return rtnl_register_many(vxlan_vnifilter_rtnl_msg_handlers); } void vxlan_vnifilter_uninit(void) { rtnl_unregister_many(vxlan_vnifilter_rtnl_msg_handlers); }
2985 2984 2985 2984 2985 2984 2984 2985 2985 2981 2984 2983 2981 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 // SPDX-License-Identifier: GPL-2.0 #include <linux/static_call.h> #include <linux/memory.h> #include <linux/bug.h> #include <asm/text-patching.h> enum insn_type { CALL = 0, /* site call */ NOP = 1, /* site cond-call */ JMP = 2, /* tramp / site tail-call */ RET = 3, /* tramp / site cond-tail-call */ JCC = 4, }; /* * ud1 %esp, %ecx - a 3 byte #UD that is unique to trampolines, chosen such * that there is no false-positive trampoline identification while also being a * speculation stop. */ static const u8 tramp_ud[] = { 0x0f, 0xb9, 0xcc }; /* * cs cs cs xorl %eax, %eax - a single 5 byte instruction that clears %[er]ax */ static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 }; static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc }; /* * ud1 (%edx),%rdi -- see __WARN_trap() / decode_bug() */ static const u8 warninsn[] = { 0x67, 0x48, 0x0f, 0xb9, 0x3a }; static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */ { u8 ret = 0; if (insn[0] == 0x0f) { u8 tmp = insn[1]; if ((tmp & 0xf0) == 0x80) ret = tmp; } return ret; } extern void __static_call_return(void); asm (".global __static_call_return\n\t" ".type __static_call_return, @function\n\t" ASM_FUNC_ALIGN "\n\t" "__static_call_return:\n\t" ANNOTATE_NOENDBR "\n\t" ANNOTATE_RETPOLINE_SAFE "\n\t" "ret; int3\n\t" ".size __static_call_return, . - __static_call_return \n\t"); static void __ref __static_call_transform(void *insn, enum insn_type type, void *func, bool modinit) { const void *emulate = NULL; int size = CALL_INSN_SIZE; const void *code; u8 op, buf[6]; if ((type == JMP || type == RET) && (op = __is_Jcc(insn))) type = JCC; switch (type) { case CALL: func = callthunks_translate_call_dest(func); code = text_gen_insn(CALL_INSN_OPCODE, insn, func); if (func == &__static_call_return0) { emulate = code; code = &xor5rax; } if (func == &__WARN_trap) { emulate = code; code = &warninsn; } break; case NOP: code = x86_nops[5]; break; case JMP: code = text_gen_insn(JMP32_INSN_OPCODE, insn, func); break; case RET: if (cpu_wants_rethunk_at(insn)) code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk); else code = &retinsn; break; case JCC: if (!func) { func = __static_call_return; if (cpu_wants_rethunk()) func = x86_return_thunk; } buf[0] = 0x0f; __text_gen_insn(buf+1, op, insn+1, func, 5); code = buf; size = 6; break; } if (memcmp(insn, code, size) == 0) return; if (system_state == SYSTEM_BOOTING || modinit) return text_poke_early(insn, code, size); smp_text_poke_single(insn, code, size, emulate); } static void __static_call_validate(u8 *insn, bool tail, bool tramp) { u8 opcode = insn[0]; if (tramp && memcmp(insn+5, tramp_ud, 3)) { pr_err("trampoline signature fail"); BUG(); } if (tail) { if (opcode == JMP32_INSN_OPCODE || opcode == RET_INSN_OPCODE || __is_Jcc(insn)) return; } else { if (opcode == CALL_INSN_OPCODE || !memcmp(insn, x86_nops[5], 5) || !memcmp(insn, xor5rax, 5) || !memcmp(insn, warninsn, 5)) return; } /* * If we ever trigger this, our text is corrupt, we'll probably not live long. */ pr_err("unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn); BUG(); } static inline enum insn_type __sc_insn(bool null, bool tail) { /* * Encode the following table without branches: * * tail null insn * -----+-------+------ * 0 | 0 | CALL * 0 | 1 | NOP * 1 | 0 | JMP * 1 | 1 | RET */ return 2*tail + null; } void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) { mutex_lock(&text_mutex); if (tramp && !site) { __static_call_validate(tramp, true, true); __static_call_transform(tramp, __sc_insn(!func, true), func, false); } if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) { __static_call_validate(site, tail, false); __static_call_transform(site, __sc_insn(!func, tail), func, false); } mutex_unlock(&text_mutex); } EXPORT_SYMBOL_GPL(arch_static_call_transform); noinstr void __static_call_update_early(void *tramp, void *func) { BUG_ON(system_state != SYSTEM_BOOTING); BUG_ON(static_call_initialized); __text_gen_insn(tramp, JMP32_INSN_OPCODE, tramp, func, JMP32_INSN_SIZE); sync_core(); } #ifdef CONFIG_MITIGATION_RETHUNK /* * This is called by apply_returns() to fix up static call trampolines, * specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as * having a return trampoline. * * The problem is that static_call() is available before determining * X86_FEATURE_RETHUNK and, by implication, running alternatives. * * This means that __static_call_transform() above can have overwritten the * return trampoline and we now need to fix things up to be consistent. */ bool __static_call_fixup(void *tramp, u8 op, void *dest) { unsigned long addr = (unsigned long)tramp; /* * Not all .return_sites are a static_call trampoline (most are not). * Check if the 3 bytes after the return are still kernel text, if not, * then this definitely is not a trampoline and we need not worry * further. * * This avoids the memcmp() below tripping over pagefaults etc.. */ if (((addr >> PAGE_SHIFT) != ((addr + 7) >> PAGE_SHIFT)) && !kernel_text_address(addr + 7)) return false; if (memcmp(tramp+5, tramp_ud, 3)) { /* Not a trampoline site, not our problem. */ return false; } mutex_lock(&text_mutex); if (op == RET_INSN_OPCODE || dest == &__x86_return_thunk) __static_call_transform(tramp, RET, NULL, true); mutex_unlock(&text_mutex); return true; } #endif
574 18 261 148 254 261 329 12 464 465 1 1 1 1 167 173 179 123 385 18 23 386 70 40 613 68 556 12 146 134 71 384 16 22 384 30 373 8 2 2 6 3 9 630 700 15 585 163 714 701 10 384 537 124 348 456 714 700 13 455 466 176 711 612 24 215 24 215 713 715 450 15 15 72 61 11 465 452 465 454 464 454 10 314 216 464 333 21 130 465 72 70 2 6 1 5 67 4 71 72 72 70 441 439 3 102 349 341 113 177 408 441 57 2 49 18 3759 3749 189 57 205 38 203 31 208 1 79 186 628 626 627 628 50 50 50 554 27 29 585 642 348 595 592 642 465 379 95 643 10 641 641 643 627 51 21 385 382 464 632 10 642 465 386 612 208 641 640 642 641 616 239 421 585 68 82 83 83 136 42 87 30 39 1 39 2 39 39 39 115 113 1 51 52 52 52 52 52 52 52 52 40 27 52 36 36 14 13 10 36 36 36 36 36 8 35 1 299 531 557 285 626 21 630 324 4 322 317 5 324 215 170 55 261 73 219 222 79 200 2 49 26 137 227 27 6 99 33 262 82 42 72 17 17 16 103 142 61 23 4 111 112 112 112 112 112 39 78 53 56 45 46 21 16 3 111 112 112 296 249 33 7 28 1 248 249 1 1 144 210 1129 191 162 285 1294 779 1068 770 79 244 12 4 86 112 296 295 56 296 294 241 279 17 280 280 71 115 274 276 226 15 111 47 82 239 112 248 246 261 123 251 160 262 70 247 52 6 261 165 352 279 252 11 138 182 73 233 240 239 68 43 26 2 26 7 13 19 5 11 16 296 264 74 53 53 385 283 113 383 12 385 111 111 363 13 13 41 114 2 3 3 3 3 3 24 53 53 52 53 3 52 17 3 73 73 73 68 8 8 68 62 9 29 67 67 30 67 50 53 68 68 68 68 68 1 73 35 56 43 46 45 45 62 62 61 56 62 60 56 41 29 21 61 61 1 54 37 56 62 55 597 130 616 615 90 90 8 83 83 83 47 47 44 5 47 132 17 118 118 132 72 3 71 71 72 71 7 70 72 72 71 72 72 72 553 554 22 554 554 553 3 551 554 2 6 554 553 465 91 464 274 216 57 57 7 18 32 32 1 38 38 7 32 36 16 251 231 65 558 5 554 91 465 464 465 274 191 461 16 31 133 104 86 62 62 103 104 2 62 317 317 317 147 186 30 30 32 14 30 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ /* * Changes: Pedro Roque : Retransmit queue handled by TCP. * : Fragmentation on mtu decrease * : Segment collapse on retransmit * : AF independence * * Linus Torvalds : send_delayed_ack * David S. Miller : Charge memory using the right skb * during syn/ack processing. * David S. Miller : Output engine completely rewritten. * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. * Cacophonix Gaul : draft-minshall-nagle-01 * J Hadi Salim : ECN support * */ #define pr_fmt(fmt) "TCP: " fmt #include <net/tcp.h> #include <net/tcp_ecn.h> #include <net/mptcp.h> #include <net/smc.h> #include <net/proto_memory.h> #include <net/psp.h> #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/module.h> #include <linux/static_key.h> #include <linux/skbuff_ref.h> #include <trace/events/tcp.h> /* Refresh clocks of a TCP socket, * ensuring monotically increasing values. */ void tcp_mstamp_refresh(struct tcp_sock *tp) { u64 val = tcp_clock_ns(); tp->tcp_clock_cache = val; tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC); } static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &sk->sk_write_queue); tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); if (tp->highest_sack == NULL) tp->highest_sack = skb; tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); tcp_check_space(sk); } /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one * window scaling factor due to loss of precision. * If window has been shrunk, what should we make? It is not clear at all. * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( * Anything in between SND.UNA...SND.UNA+SND.WND also can be already * invalid. OK, let's make this for now: */ static inline __u32 tcp_acceptable_seq(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (!before(tcp_wnd_end(tp), tp->snd_nxt) || (tp->rx_opt.wscale_ok && ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale)))) return tp->snd_nxt; else return tcp_wnd_end(tp); } /* Calculate mss to advertise in SYN segment. * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: * * 1. It is independent of path mtu. * 2. Ideally, it is maximal possible segment size i.e. 65535-40. * 3. For IPv4 it is reasonable to calculate it from maximal MTU of * attached devices, because some buggy hosts are confused by * large MSS. * 4. We do not make 3, we advertise MSS, calculated from first * hop device mtu, but allow to raise it to ip_rt_min_advmss. * This may be overridden via information stored in routing table. * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, * probably even Jumbo". */ static __u16 tcp_advertise_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); const struct dst_entry *dst = __sk_dst_get(sk); int mss = tp->advmss; if (dst) { unsigned int metric = dst_metric_advmss(dst); if (metric < mss) { mss = metric; tp->advmss = mss; } } return (__u16)mss; } /* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */ void tcp_cwnd_restart(struct sock *sk, s32 delta) { struct tcp_sock *tp = tcp_sk(sk); u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 cwnd = tcp_snd_cwnd(tp); tcp_ca_event(sk, CA_EVENT_CWND_RESTART); tp->snd_ssthresh = tcp_current_ssthresh(sk); restart_cwnd = min(restart_cwnd, cwnd); while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd)); tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_cwnd_used = 0; } /* Congestion state accounting after a packet has been sent. */ static void tcp_event_data_sent(struct tcp_sock *tp, struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_jiffies32; if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); tp->lsndtime = now; /* If it is a reply for ato after last received * packet, increase pingpong count. */ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) inet_csk_inc_pingpong_cnt(sk); } /* Account for an ACK we sent. */ static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) { struct tcp_sock *tp = tcp_sk(sk); if (unlikely(tp->compressed_ack)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, tp->compressed_ack); tp->compressed_ack = 0; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } if (unlikely(rcv_nxt != tp->rcv_nxt)) return; /* Special ACK sent by DCTCP to reflect ECN */ tcp_dec_quickack_mode(sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } /* Determine a window scaling and initial window to offer. * Based on the assumption that the given amount of space * will be offered. Store the results in the tp structure. * NOTE: for smooth operation initial space offering should * be a multiple of mss if possible. We assume here that mss >= 1. * This MUST be enforced by all callers. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *__window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); u32 window_clamp = READ_ONCE(*__window_clamp); /* If no clamp set the clamp to the max possible scaled window */ if (window_clamp == 0) window_clamp = (U16_MAX << TCP_MAX_WSCALE); space = min(window_clamp, space); /* Quantize space offering to a multiple of mss if possible. */ if (space > mss) space = rounddown(space, mss); /* NOTE: offering an initial window larger than 32767 * will break some buggy TCP stacks. If the admin tells us * it is likely we could be speaking with such a buggy stack * we will truncate our initial window offering to 32K-1 * unless the remote has sent us a window scaling option, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. */ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = space; if (init_rcv_wnd) *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); *rcv_wscale = 0; if (wscale_ok) { /* Set window scaling on max possible window */ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); space = min_t(u32, space, window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ WRITE_ONCE(*__window_clamp, min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp)); } EXPORT_IPV6_MOD(tcp_select_initial_window); /* Chose a new window to advertise, update state in tcp_sock for the * socket, and return result with RFC1323 scaling applied. The return * value can be stuffed directly into th->window for an outgoing * frame. */ static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); u32 old_win = tp->rcv_wnd; u32 cur_win, new_win; /* Make the window 0 if we failed to queue the data because we * are out of memory. */ if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) { tp->pred_flags = 0; tp->rcv_wnd = 0; tp->rcv_wup = tp->rcv_nxt; return 0; } cur_win = tcp_receive_window(tp); new_win = __tcp_select_window(sk); if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else * we will not be able to advertise a zero * window in time. --DaveM * * Relax Will Robinson. */ if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) { /* Never shrink the offered window */ if (new_win == 0) NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV); new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } } tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; /* Make sure we do not exceed the maximum possible * scaled window. */ if (!tp->rx_opt.rcv_wscale && READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; /* If we advertise zero window, disable fast path. */ if (new_win == 0) { tp->pred_flags = 0; if (old_win) NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV); } else if (old_win == 0) { NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV); } return new_win; } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, int tcp_header_len) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_ecn_mode_any(tp)) return; if (tcp_ecn_mode_accecn(tp)) { if (!tcp_accecn_ace_fail_recv(tp)) INET_ECN_xmit(sk); tcp_accecn_set_ace(tp, skb, th); skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; } else { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { INET_ECN_xmit(sk); if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } } else if (!tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) th->ece = 1; } } /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ static void tcp_init_nondata_skb(struct sk_buff *skb, struct sock *sk, u32 seq, u16 flags) { skb->ip_summed = CHECKSUM_PARTIAL; TCP_SKB_CB(skb)->tcp_flags = flags; tcp_skb_pcount_set(skb, 1); psp_enqueue_set_decrypted(sk, skb); TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) seq++; TCP_SKB_CB(skb)->end_seq = seq; } static inline bool tcp_urg_mode(const struct tcp_sock *tp) { return tp->snd_una != tp->snd_up; } #define OPTION_SACK_ADVERTISE BIT(0) #define OPTION_TS BIT(1) #define OPTION_MD5 BIT(2) #define OPTION_WSCALE BIT(3) #define OPTION_FAST_OPEN_COOKIE BIT(8) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) #define OPTION_ACCECN BIT(12) static void smc_options_write(__be32 *ptr, u16 *options) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (unlikely(OPTION_SMC & *options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_EXP << 8) | (TCPOLEN_EXP_SMC_BASE)); *ptr++ = htonl(TCPOPT_SMC_MAGIC); } } #endif } struct tcp_out_options { u16 options; /* bit field of OPTION_* */ u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 num_accecn_fields:7, /* number of AccECN fields needed */ use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ struct mptcp_out_options mptcp; }; static void mptcp_options_write(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct tcp_out_options *opts) { #if IS_ENABLED(CONFIG_MPTCP) if (unlikely(OPTION_MPTCP & opts->options)) mptcp_write_options(th, ptr, tp, &opts->mptcp); #endif } #ifdef CONFIG_CGROUP_BPF static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb, enum tcp_synack_type synack_type) { if (unlikely(!skb)) return BPF_WRITE_HDR_TCP_CURRENT_MSS; if (unlikely(synack_type == TCP_SYNACK_COOKIE)) return BPF_WRITE_HDR_TCP_SYNACK_COOKIE; return 0; } /* req, syn_skb and synack_type are used when writing synack */ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts, unsigned int *remaining) { struct bpf_sock_ops_kern sock_ops; int err; if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || !*remaining) return; /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */ /* init sock_ops */ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB; if (req) { /* The listen "sk" cannot be passed here because * it is not locked. It would not make too much * sense to do bpf_setsockopt(listen_sk) based * on individual connection request also. * * Thus, "req" is passed here and the cgroup-bpf-progs * of the listen "sk" will be run. * * "req" is also used here for fastopen even the "sk" here is * a fullsock "child" sk. It is to keep the behavior * consistent between fastopen and non-fastopen on * the bpf programming side. */ sock_ops.sk = (struct sock *)req; sock_ops.syn_skb = syn_skb; } else { sock_owned_by_me(sk); sock_ops.is_fullsock = 1; sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; } sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); sock_ops.remaining_opt_len = *remaining; /* tcp_current_mss() does not pass a skb */ if (skb) bpf_skops_init_skb(&sock_ops, skb, 0); err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); if (err || sock_ops.remaining_opt_len == *remaining) return; opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len; /* round up to 4 bytes */ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3; *remaining -= opts->bpf_opt_len; } static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts) { u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len; struct bpf_sock_ops_kern sock_ops; int err; if (likely(!max_opt_len)) return; memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB; if (req) { sock_ops.sk = (struct sock *)req; sock_ops.syn_skb = syn_skb; } else { sock_owned_by_me(sk); sock_ops.is_fullsock = 1; sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; } sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); sock_ops.remaining_opt_len = max_opt_len; first_opt_off = tcp_hdrlen(skb) - max_opt_len; bpf_skops_init_skb(&sock_ops, skb, first_opt_off); err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); if (err) nr_written = 0; else nr_written = max_opt_len - sock_ops.remaining_opt_len; if (nr_written < max_opt_len) memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP, max_opt_len - nr_written); } #else static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts, unsigned int *remaining) { } static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts) { } #endif static __be32 *process_tcp_ao_options(struct tcp_sock *tp, const struct tcp_request_sock *tcprsk, struct tcp_out_options *opts, struct tcp_key *key, __be32 *ptr) { #ifdef CONFIG_TCP_AO u8 maclen = tcp_ao_maclen(key->ao_key); if (tcprsk) { u8 aolen = maclen + sizeof(struct tcp_ao_hdr); *ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) | (tcprsk->ao_keyid << 8) | (tcprsk->ao_rcv_next)); } else { struct tcp_ao_key *rnext_key; struct tcp_ao_info *ao_info; ao_info = rcu_dereference_check(tp->ao_info, lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk)); rnext_key = READ_ONCE(ao_info->rnext_key); if (WARN_ON_ONCE(!rnext_key)) return ptr; *ptr++ = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key->ao_key) << 16) | (key->ao_key->sndid << 8) | (rnext_key->rcvid)); } opts->hash_location = (__u8 *)ptr; ptr += maclen / sizeof(*ptr); if (unlikely(maclen % sizeof(*ptr))) { memset(ptr, TCPOPT_NOP, sizeof(*ptr)); ptr++; } #endif return ptr; } /* Initial values for AccECN option, ordered is based on ECN field bits * similar to received_ecn_bytes. Used for SYN/ACK AccECN option. */ static const u32 synack_ecn_bytes[3] = { 0, 0, 0 }; /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of * TCP options, we learned this through the hard way, so be careful here. * Luckily we can at least blame others for their non-compliance but from * inter-operability perspective it seems that we're somewhat stuck with * the ordering which we have been using if we want to keep working with * those broken things (not that it currently hurts anybody as there isn't * particular reason why the ordering would need to be changed). * * At least SACK_PERM as the first option is known to lead to a disaster * (but it may well be that other scenarios fail similarly). */ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, const struct tcp_request_sock *tcprsk, struct tcp_out_options *opts, struct tcp_key *key) { u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */ u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */ __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ if (tcp_key_is_md5(key)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); /* overload cookie hash location */ opts->hash_location = (__u8 *)ptr; ptr += 4; } else if (tcp_key_is_ao(key)) { ptr = process_tcp_ao_options(tp, tcprsk, opts, key, ptr); } if (unlikely(opts->mss)) { *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | opts->mss); } if (likely(OPTION_TS & options)) { if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); options &= ~OPTION_SACK_ADVERTISE; } else { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); } *ptr++ = htonl(opts->tsval); *ptr++ = htonl(opts->tsecr); } if (OPTION_ACCECN & options) { const u32 *ecn_bytes = opts->use_synack_ecn_bytes ? synack_ecn_bytes : tp->received_ecn_bytes; const u8 ect0_idx = INET_ECN_ECT_0 - 1; const u8 ect1_idx = INET_ECN_ECT_1 - 1; const u8 ce_idx = INET_ECN_CE - 1; u32 e0b; u32 e1b; u32 ceb; u8 len; e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; len = TCPOLEN_ACCECN_BASE + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; if (opts->num_accecn_fields == 2) { *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | ((e1b >> 8) & 0xffff)); *ptr++ = htonl(((e1b & 0xff) << 24) | (ceb & 0xffffff)); } else if (opts->num_accecn_fields == 1) { *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | ((e1b >> 8) & 0xffff)); leftover_highbyte = e1b & 0xff; leftover_lowbyte = TCPOPT_NOP; } else if (opts->num_accecn_fields == 0) { leftover_highbyte = TCPOPT_ACCECN1; leftover_lowbyte = len; } else if (opts->num_accecn_fields == 3) { *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | ((e1b >> 8) & 0xffff)); *ptr++ = htonl(((e1b & 0xff) << 24) | (ceb & 0xffffff)); *ptr++ = htonl(((e0b & 0xffffff) << 8) | TCPOPT_NOP); } if (tp) { tp->accecn_minlen = 0; tp->accecn_opt_tstamp = tp->tcp_mstamp; if (tp->accecn_opt_demand) tp->accecn_opt_demand--; } } if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((leftover_highbyte << 24) | (leftover_lowbyte << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); leftover_highbyte = TCPOPT_NOP; leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_WSCALE & options)) { u8 highbyte = TCPOPT_NOP; /* Do not split the leftover 2-byte to fit into a single * NOP, i.e., replace this NOP only when 1 byte is leftover * within leftover_highbyte. */ if (unlikely(leftover_highbyte != TCPOPT_NOP && leftover_lowbyte == TCPOPT_NOP)) { highbyte = leftover_highbyte; leftover_highbyte = TCPOPT_NOP; } *ptr++ = htonl((highbyte << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws); } if (unlikely(opts->num_sack_blocks)) { struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; int this_sack; *ptr++ = htonl((leftover_highbyte << 24) | (leftover_lowbyte << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); leftover_highbyte = TCPOPT_NOP; leftover_lowbyte = TCPOPT_NOP; for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { *ptr++ = htonl(sp[this_sack].start_seq); *ptr++ = htonl(sp[this_sack].end_seq); } tp->rx_opt.dsack = 0; } else if (unlikely(leftover_highbyte != TCPOPT_NOP || leftover_lowbyte != TCPOPT_NOP)) { *ptr++ = htonl((leftover_highbyte << 24) | (leftover_lowbyte << 16) | (TCPOPT_NOP << 8) | TCPOPT_NOP); leftover_highbyte = TCPOPT_NOP; leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; u8 *p = (u8 *)ptr; u32 len; /* Fast Open option length */ if (foc->exp) { len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) | TCPOPT_FASTOPEN_MAGIC); p += TCPOLEN_EXP_FASTOPEN_BASE; } else { len = TCPOLEN_FASTOPEN_BASE + foc->len; *p++ = TCPOPT_FASTOPEN; *p++ = len; } memcpy(p, foc->val, foc->len); if ((len & 3) == 2) { p[foc->len] = TCPOPT_NOP; p[foc->len + 1] = TCPOPT_NOP; } ptr += (len + 3) >> 2; } smc_options_write(ptr, &options); mptcp_options_write(th, ptr, tp, opts); } static void smc_set_option(struct tcp_sock *tp, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc) { tp->syn_smc = !!smc_call_hsbpf(1, tp, syn_option); /* re-check syn_smc */ if (tp->syn_smc && *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { opts->options |= OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } #endif } static void smc_set_option_cond(const struct tcp_sock *tp, struct inet_request_sock *ireq, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc && ireq->smc_ok) { ireq->smc_ok = !!smc_call_hsbpf(1, tp, synack_option, ireq); /* re-check smc_ok */ if (ireq->smc_ok && *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { opts->options |= OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } #endif } static void mptcp_set_option_cond(const struct request_sock *req, struct tcp_out_options *opts, unsigned int *remaining) { if (rsk_is_mptcp(req)) { unsigned int size; if (mptcp_synack_options(req, &size, &opts->mptcp)) { if (*remaining >= size) { opts->options |= OPTION_MPTCP; *remaining -= size; } } } } static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts) { /* How much there's room for combining with the alignment padding? */ if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) == OPTION_SACK_ADVERTISE) return 2; else if (opts->options & OPTION_WSCALE) return 1; return 0; } /* Calculates how long AccECN option will fit to @remaining option space. * * AccECN option can sometimes replace NOPs used for alignment of other * TCP options (up to @max_combine_saving available). * * Only solutions with at least @required AccECN fields are accepted. * * Returns: The size of the AccECN option excluding space repurposed from * the alignment of the other options. */ static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required, int remaining) { int size = TCP_ACCECN_MAXSIZE; int sack_blocks_reduce = 0; int max_combine_saving; int rem = remaining; int align_size; if (opts->use_synack_ecn_bytes) max_combine_saving = tcp_synack_options_combine_saving(opts); else max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0; opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; while (opts->num_accecn_fields >= required) { /* Pad to dword if cannot combine */ if ((size & 0x3) > max_combine_saving) align_size = ALIGN(size, 4); else align_size = ALIGN_DOWN(size, 4); if (rem >= align_size) { size = align_size; break; } else if (opts->num_accecn_fields == required && opts->num_sack_blocks > 2 && required > 0) { /* Try to fit the option by removing one SACK block */ opts->num_sack_blocks--; sack_blocks_reduce++; rem = rem + TCPOLEN_SACK_PERBLOCK; opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; size = TCP_ACCECN_MAXSIZE; continue; } opts->num_accecn_fields--; size -= TCPOLEN_ACCECN_PERFIELD; } if (sack_blocks_reduce > 0) { if (opts->num_accecn_fields >= required) size -= sack_blocks_reduce * TCPOLEN_SACK_PERBLOCK; else opts->num_sack_blocks += sack_blocks_reduce; } if (opts->num_accecn_fields < required) return 0; opts->options |= OPTION_ACCECN; return size; } /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_key *key) { struct tcp_sock *tp = tcp_sk(sk); unsigned int remaining = MAX_TCP_OPTION_SPACE; struct tcp_fastopen_request *fastopen = tp->fastopen_req; bool timestamps; /* Better than switch (key.type) as it has static branches */ if (tcp_key_is_md5(key)) { timestamps = false; opts->options |= OPTION_MD5; remaining -= TCPOLEN_MD5SIG_ALIGNED; } else { timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps); if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; remaining -= tcp_ao_len_aligned(key->ao_key); } } /* We always get an MSS option. The option bytes which will be seen in * normal data packets should timestamps be used, must be in the MSS * advertised. But we subtract them from tp->mss_cache so that * calculations in tcp_sendmsg are simpler etc. So account for this * fact here if necessary. If we don't do this correctly, as a * receiver we won't recognize data packets as being full sized when we * should, and thus we won't abide by the delayed ACK rules correctly. * SACKs don't matter, we never delay an ACK when we have any of those * going out. */ opts->mss = tcp_advertise_mss(sk); remaining -= TCPOLEN_MSS_ALIGNED; if (likely(timestamps)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) { opts->ws = tp->rx_opt.rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) remaining -= TCPOLEN_SACKPERM_ALIGNED; } if (fastopen && fastopen->cookie.len >= 0) { u32 need = fastopen->cookie.len; need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE : TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; opts->fastopen_cookie = &fastopen->cookie; remaining -= need; tp->syn_fastopen = 1; tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0; } } smc_set_option(tp, opts, &remaining); if (sk_is_mptcp(sk)) { unsigned int size; if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) { if (remaining >= size) { opts->options |= OPTION_MPTCP; remaining -= size; } } } /* Simultaneous open SYN/ACK needs AccECN option but not SYN. * It is attempted to negotiate the use of AccECN also on the first * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs" * of AccECN draft. */ if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && tcp_ecn_mode_accecn(tp) && inet_csk(sk)->icsk_retransmits < 2 && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && remaining >= TCPOLEN_ACCECN_BASE)) { opts->use_synack_ecn_bytes = 1; remaining -= tcp_options_fit_accecn(opts, 0, remaining); } bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ static unsigned int tcp_synack_options(const struct sock *sk, struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_key *key, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; struct tcp_request_sock *treq = tcp_rsk(req); if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; remaining -= TCPOLEN_MD5SIG_ALIGNED; /* We can't fit any SACK blocks in a packet with MD5 + TS * options. There was discussion about disabling SACK * rather than TS in order to fit in better with old, * buggy kernels, but that was deemed to be unnecessary. */ if (synack_type != TCP_SYNACK_COOKIE) ireq->tstamp_ok &= !ireq->sack_ok; } else if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; remaining -= tcp_ao_len_aligned(key->ao_key); ireq->tstamp_ok &= !ireq->sack_ok; } /* We always send an MSS option. */ opts->mss = mss; remaining -= TCPOLEN_MSS_ALIGNED; if (likely(ireq->wscale_ok)) { opts->ws = ireq->rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) + tcp_rsk(req)->ts_off; if (!tcp_rsk(req)->snt_tsval_first) { if (!opts->tsval) opts->tsval = ~0U; tcp_rsk(req)->snt_tsval_first = opts->tsval; } WRITE_ONCE(tcp_rsk(req)->snt_tsval_last, opts->tsval); opts->tsecr = req->ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!ireq->tstamp_ok)) remaining -= TCPOLEN_SACKPERM_ALIGNED; } if (foc != NULL && foc->len >= 0) { u32 need = foc->len; need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE : TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; opts->fastopen_cookie = foc; remaining -= need; } } mptcp_set_option_cond(req, opts, &remaining); smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); if (treq->accecn_ok && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { opts->use_synack_ecn_bytes = 1; remaining -= tcp_options_fit_accecn(opts, 0, remaining); } bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; } /* Compute TCP options for ESTABLISHED sockets. This is not the * final wire format yet. */ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_key *key) { struct tcp_sock *tp = tcp_sk(sk); unsigned int size = 0; unsigned int eff_sacks; opts->options = 0; /* Better than switch (key.type) as it has static branches */ if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; size += TCPOLEN_MD5SIG_ALIGNED; } else if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; size += tcp_ao_len_aligned(key->ao_key); } if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } /* MPTCP options have precedence over SACK for the limited TCP * option space because a MPTCP connection would be forced to * fall back to regular TCP if a required multipath option is * missing. SACK still gets a chance to use whatever space is * left. */ if (sk_is_mptcp(sk)) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; unsigned int opt_size = 0; if (mptcp_established_options(sk, skb, &opt_size, remaining, &opts->mptcp)) { opts->options |= OPTION_MPTCP; size += opt_size; } } eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; if (likely(remaining >= TCPOLEN_SACK_BASE_ALIGNED + TCPOLEN_SACK_PERBLOCK)) { opts->num_sack_blocks = min_t(unsigned int, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); size += TCPOLEN_SACK_BASE_ALIGNED + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } else { opts->num_sack_blocks = 0; } } else { opts->num_sack_blocks = 0; } if (tcp_ecn_mode_accecn(tp)) { int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) && (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk))) { opts->use_synack_ecn_bytes = 0; size += tcp_options_fit_accecn(opts, tp->accecn_minlen, MAX_TCP_OPTION_SPACE - size); } } if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); size = MAX_TCP_OPTION_SPACE - remaining; } return size; } /* TCP SMALL QUEUES (TSQ) * * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) * to reduce RTT and bufferbloat. * We do this using a special skb destructor (tcp_wfree). * * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb * needs to be reallocated in a driver. * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc * * Since transmit from skb destructor is forbidden, we use a BH work item * to process all sockets that eventually need to send more skbs. * We use one work item per cpu, with its own queue of sockets. */ struct tsq_work { struct work_struct work; struct list_head head; /* queue of tcp sockets */ }; static DEFINE_PER_CPU(struct tsq_work, tsq_work); static void tcp_tsq_write(struct sock *sk) { if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) { struct tcp_sock *tp = tcp_sk(sk); if (tp->lost_out > tp->retrans_out && tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) { tcp_mstamp_refresh(tp); tcp_xmit_retransmit_queue(sk); } tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, 0, GFP_ATOMIC); } } static void tcp_tsq_handler(struct sock *sk) { bh_lock_sock(sk); if (!sock_owned_by_user(sk)) tcp_tsq_write(sk); else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); bh_unlock_sock(sk); } /* * One work item per cpu tries to send more skbs. * We run in BH context but need to disable irqs when * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ static void tcp_tsq_workfn(struct work_struct *work) { struct tsq_work *tsq = container_of(work, struct tsq_work, work); LIST_HEAD(list); unsigned long flags; struct list_head *q, *n; struct tcp_sock *tp; struct sock *sk; local_irq_save(flags); list_splice_init(&tsq->head, &list); local_irq_restore(flags); list_for_each_safe(q, n, &list) { tp = list_entry(q, struct tcp_sock, tsq_node); list_del(&tp->tsq_node); sk = (struct sock *)tp; smp_mb__before_atomic(); clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); tcp_tsq_handler(sk); sk_free(sk); } } #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ TCPF_WRITE_TIMER_DEFERRED | \ TCPF_DELACK_TIMER_DEFERRED | \ TCPF_MTU_REDUCED_DEFERRED | \ TCPF_ACK_DEFERRED) /** * tcp_release_cb - tcp release_sock() callback * @sk: socket * * called from release_sock() to perform protocol dependent * actions before socket release. */ void tcp_release_cb(struct sock *sk) { unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags); unsigned long nflags; /* perform an atomic operation only if at least one flag is set */ do { if (!(flags & TCP_DEFERRED_ALL)) return; nflags = flags & ~TCP_DEFERRED_ALL; } while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags)); if (flags & TCPF_TSQ_DEFERRED) { tcp_tsq_write(sk); __sock_put(sk); } if (flags & TCPF_WRITE_TIMER_DEFERRED) { tcp_write_timer_handler(sk); __sock_put(sk); } if (flags & TCPF_DELACK_TIMER_DEFERRED) { tcp_delack_timer_handler(sk); __sock_put(sk); } if (flags & TCPF_MTU_REDUCED_DEFERRED) { inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); __sock_put(sk); } if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk)) tcp_send_ack(sk); } EXPORT_IPV6_MOD(tcp_release_cb); void __init tcp_tsq_work_init(void) { int i; for_each_possible_cpu(i) { struct tsq_work *tsq = &per_cpu(tsq_work, i); INIT_LIST_HEAD(&tsq->head); INIT_WORK(&tsq->work, tcp_tsq_workfn); } } /* * Write buffer destructor automatically called from kfree_skb. * We can't xmit new skbs from this context, as we might already * hold qdisc lock. */ void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); unsigned long flags, nval, oval; struct tsq_work *tsq; bool empty; /* Keep one reference on sk_wmem_alloc. * Will be released by sk_free() from here or tcp_tsq_workfn() */ WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc)); /* If this softirq is serviced by ksoftirqd, we are likely under stress. * Wait until our queues (qdisc + devices) are drained. * This gives : * - less callbacks to tcp_write_xmit(), reducing stress (batches) * - chance for incoming ACK (processed by another cpu maybe) * to migrate this flow (skb->ooo_okay will be eventually set) */ if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) goto out; oval = smp_load_acquire(&sk->sk_tsq_flags); do { if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) goto out; nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED; } while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval)); /* queue this socket to BH workqueue */ local_irq_save(flags); tsq = this_cpu_ptr(&tsq_work); empty = list_empty(&tsq->head); list_add(&tp->tsq_node, &tsq->head); if (empty) queue_work(system_bh_wq, &tsq->work); local_irq_restore(flags); return; out: sk_free(sk); } /* Note: Called under soft irq. * We can call TCP stack right away, unless socket is owned by user. */ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) { struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer); struct sock *sk = (struct sock *)tp; tcp_tsq_handler(sk); sock_put(sk); return HRTIMER_NORESTART; } static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, u64 prior_wstamp) { struct tcp_sock *tp = tcp_sk(sk); if (sk->sk_pacing_status != SK_PACING_NONE) { unsigned long rate = READ_ONCE(sk->sk_pacing_rate); /* Original sch_fq does not pace first 10 MSS * Note that tp->data_segs_out overflows after 2^32 packets, * this is a minor annoyance. */ if (rate != ~0UL && rate && tp->data_segs_out >= 10) { u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate); u64 credit = tp->tcp_wstamp_ns - prior_wstamp; /* take into account OS jitter */ len_ns -= min_t(u64, len_ns / 2, credit); tp->tcp_wstamp_ns += len_ns; } } list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. * All SKB's seen here are completely headerless. It is our * job to build the TCP header, and pass the packet down to * IP so it can do the same plus pass the packet off to the * device. * * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; struct tcp_sock *tp; struct tcp_skb_cb *tcb; struct tcp_out_options opts; unsigned int tcp_options_size, tcp_header_size; struct sk_buff *oskb = NULL; struct tcp_key key; struct tcphdr *th; u64 prior_wstamp; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); if (clone_it) { oskb = skb; tcp_skb_tsorted_save(oskb) { if (unlikely(skb_cloned(oskb))) skb = pskb_copy(oskb, gfp_mask); else skb = skb_clone(oskb, gfp_mask); } tcp_skb_tsorted_restore(oskb); if (unlikely(!skb)) return -ENOBUFS; /* retransmit skbs might have a non zero value in skb->dev * because skb->dev is aliased with skb->rbnode.rb_left */ skb->dev = NULL; } inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); tcp_get_current_key(sk, &key); if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { tcp_options_size = tcp_syn_options(sk, skb, &opts, &key); } else { tcp_options_size = tcp_established_options(sk, skb, &opts, &key); /* Force a PSH flag on all (GSO) packets to expedite GRO flush * at receiver : This slightly improve GRO performance. * Note that we do not force the PSH flag for non GSO packets, * because they might be sent under high congestion events, * and in this case it is better to delay the delivery of 1-MSS * packets and thus the corresponding ACK packet that would * release the following packet. */ if (tcp_skb_pcount(skb) > 1) tcb->tcp_flags |= TCPHDR_PSH; } tcp_header_size = tcp_options_size + sizeof(struct tcphdr); /* We set skb->ooo_okay to one if this packet can select * a different TX queue than prior packets of this flow, * to avoid self inflicted reorders. * The 'other' queue decision is based on current cpu number * if XPS is enabled, or sk->sk_txhash otherwise. * We can switch to another (and better) queue if: * 1) No packet with payload is in qdisc/device queues. * Delays in TX completion can defeat the test * even if packets were already sent. * 2) Or rtx queue is empty. * This mitigates above case if ACK packets for * all prior packets were already processed. */ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) || tcp_rtx_queue_empty(sk); /* If we had to use memory reserve to allocate this skb, * this might cause drops if packet is looped back : * Other socket might not have SOCK_MEMALLOC. * Packets not looped back do not care about pfmemalloc. */ skb->pfmemalloc = 0; skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_orphan(skb); skb->sk = sk; skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm)); /* Build TCP header and checksum it. */ th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | (tcb->tcp_flags & TCPHDR_FLAGS_MASK)); th->check = 0; th->urg_ptr = 0; /* The urg_mode check is necessary during a below snd_una win probe */ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { if (before(tp->snd_up, tcb->seq + 0x10000)) { th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { th->urg_ptr = htons(0xFFFF); th->urg = 1; } } skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { th->window = htons(tcp_select_window(sk)); tcp_ecn_send(sk, skb, th, tcp_header_size); } else { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. */ th->window = htons(min(tp->rcv_wnd, 65535U)); } tcp_options_write(th, tp, NULL, &opts, &key); if (tcp_key_is_md5(&key)) { #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ sk_gso_disable(sk); tp->af_specific->calc_md5_hash(opts.hash_location, key.md5_key, sk, skb); #endif } else if (tcp_key_is_ao(&key)) { int err; err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th, opts.hash_location); if (err) { sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_NOT_SPECIFIED); return -ENOMEM; } } /* BPF prog is the last one writing header option */ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, tcp_v6_send_check, tcp_v4_send_check, sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); tp->bytes_sent += skb->len - tcp_header_size; } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); tp->segs_out += tcp_skb_pcount(skb); skb_set_hash_from_sk(skb, sk); /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */ /* Cleanup our debris for IP stacks */ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); tcp_add_tx_delay(skb, tp); err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, inet6_csk_xmit, ip_queue_xmit, sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { tcp_enter_cwr(sk); err = net_xmit_eval(err); } if (!err && oskb) { tcp_update_skb_after_send(sk, oskb, prior_wstamp); tcp_rate_skb_sent(sk, oskb); } return err; } static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, tcp_sk(sk)->rcv_nxt); } /* This routine just queues the buffer for sending. * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. */ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Advance write_seq and place onto the write_queue. */ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); psp_enqueue_set_decrypted(sk, skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } /* Initialize TSO segments for a packet. */ static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs; if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. */ TCP_SKB_CB(skb)->tcp_gso_size = 0; tcp_skb_pcount_set(skb, 1); return 1; } TCP_SKB_CB(skb)->tcp_gso_size = mss_now; tso_segs = DIV_ROUND_UP(skb->len, mss_now); tcp_skb_pcount_set(skb, tso_segs); return tso_segs; } /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) { struct tcp_sock *tp = tcp_sk(sk); tp->packets_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) tp->lost_out -= decr; /* Reno case is special. Sigh... */ if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); tcp_verify_left_out(tp); } static bool tcp_has_tx_tstamp(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->txstamp_ack || (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP); } static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) { struct skb_shared_info *shinfo = skb_shinfo(skb); if (unlikely(tcp_has_tx_tstamp(skb)) && !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) { struct skb_shared_info *shinfo2 = skb_shinfo(skb2); u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP; shinfo->tx_flags &= ~tsflags; shinfo2->tx_flags |= tsflags; swap(shinfo->tskey, shinfo2->tskey); TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack; TCP_SKB_CB(skb)->txstamp_ack = 0; } } static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) { TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor; TCP_SKB_CB(skb)->eor = 0; } /* Insert buff after skb on the write or rtx queue of sk. */ static void tcp_insert_write_queue_after(struct sk_buff *skb, struct sk_buff *buff, struct sock *sk, enum tcp_queue tcp_queue) { if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) __skb_queue_after(&sk->sk_write_queue, skb, buff); else tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); } /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int old_factor; long limit; u16 flags; int nlen; if (WARN_ON(len > skb->len)) return -EINVAL; DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb)); /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb. * We need some allowance to not penalize applications setting small * SO_SNDBUF values. * Also allow first and last skb in retransmit queue to be split. */ limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE); if (unlikely((sk->sk_wmem_queued >> 1) > limit && tcp_queue != TCP_FRAG_IN_WRITE_QUEUE && skb != tcp_rtx_queue_head(sk) && skb != tcp_rtx_queue_tail(sk))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG); return -ENOMEM; } if (skb_unclone_keeptruesize(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ buff = tcp_stream_alloc_skb(sk, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); nlen = skb->len - len; buff->truesize += nlen; skb->truesize -= nlen; /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; tcp_skb_fragment_eor(skb, buff); skb_split(skb, buff, len); skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC); tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); /* Fix up tso_factor for both original and new SKB. */ tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); /* Update delivered info for the new segment */ TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx; /* If this packet has been sent out already, we must * adjust the various packet counters. */ if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) { int diff = old_factor - tcp_skb_pcount(skb) - tcp_skb_pcount(buff); if (diff) tcp_adjust_pcount(sk, skb, diff); } /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } /* This is similar to __pskb_pull_tail(). The difference is that pulled * data is not copied, but immediately discarded. */ static int __pskb_trim_head(struct sk_buff *skb, int len) { struct skb_shared_info *shinfo; int i, k, eat; DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb)); eat = len; k = 0; shinfo = skb_shinfo(skb); for (i = 0; i < shinfo->nr_frags; i++) { int size = skb_frag_size(&shinfo->frags[i]); if (size <= eat) { skb_frag_unref(skb, i); eat -= size; } else { shinfo->frags[k] = shinfo->frags[i]; if (eat) { skb_frag_off_add(&shinfo->frags[k], eat); skb_frag_size_sub(&shinfo->frags[k], eat); eat = 0; } k++; } } shinfo->nr_frags = k; skb->data_len -= len; skb->len = skb->data_len; return len; } /* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); TCP_SKB_CB(skb)->seq += len; skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); if (!skb_zcopy_pure(skb)) sk_mem_uncharge(sk, delta_truesize); /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); return 0; } /* Calculate MSS not accounting any TCP options. */ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) mss_now = tp->rx_opt.mss_clamp; /* Now subtract optional transport overhead */ mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ mss_now = max(mss_now, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss)); return mss_now; } /* Calculate MSS. Not accounting for SACKs here. */ int tcp_mtu_to_mss(struct sock *sk, int pmtu) { /* Subtract TCP options size, not including SACKs */ return __tcp_mtu_to_mss(sk, pmtu) - (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); } EXPORT_IPV6_MOD(tcp_mtu_to_mss); /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); return mss + tp->tcp_header_len + icsk->icsk_ext_hdr_len + icsk->icsk_af_ops->net_header_len; } EXPORT_SYMBOL(tcp_mss_to_mtu); /* MTU probing init per socket */ void tcp_mtup_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss)); icsk->icsk_mtup.probe_size = 0; if (icsk->icsk_mtup.enabled) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts for TCP options, but includes only bare TCP header. tp->rx_opt.mss_clamp is mss negotiated at connection setup. It is minimum of user_mss and mss received with SYN. It also does not include TCP options. inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. tp->mss_cache is current effective sending mss, including all tcp options except for SACKs. It is evaluated, taking into account current pmtu, but never exceeds tp->rx_opt.mss_clamp. NOTE1. rfc1122 clearly states that advertised MSS DOES NOT include either tcp or ip options. NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache are READ ONLY outside this function. --ANK (980731) */ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; if (icsk->icsk_mtup.search_high > pmtu) icsk->icsk_mtup.search_high = pmtu; mss_now = tcp_mtu_to_mss(sk, pmtu); mss_now = tcp_bound_to_half_wnd(tp, mss_now); /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; if (icsk->icsk_mtup.enabled) mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); tp->mss_cache = mss_now; return mss_now; } EXPORT_IPV6_MOD(tcp_sync_mss); /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. */ unsigned int tcp_current_mss(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct dst_entry *dst = __sk_dst_get(sk); u32 mss_now; unsigned int header_len; struct tcp_out_options opts; struct tcp_key key; mss_now = tp->mss_cache; if (dst) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } tcp_get_current_key(sk, &key); header_len = tcp_established_options(sk, NULL, &opts, &key) + sizeof(struct tcphdr); /* The mss_cache is sized based on tp->tcp_header_len, which assumes * some common options. If this is an odd packet (because we have SACK * blocks etc) then our calculated header_len will be different, and * we have to adjust mss_now correspondingly */ if (header_len != tp->tcp_header_len) { int delta = (int) header_len - tp->tcp_header_len; mss_now -= delta; } return mss_now; } /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. * As additional protections, we do not touch cwnd in retransmission phases, * and if application hit its sndbuf limit recently. */ static void tcp_cwnd_application_limited(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { /* Limited by application or receiver window. */ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 win_used = max(tp->snd_cwnd_used, init_win); if (win_used < tcp_snd_cwnd(tp)) { tp->snd_ssthresh = tcp_current_ssthresh(sk); tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1); } tp->snd_cwnd_used = 0; } tp->snd_cwnd_stamp = tcp_jiffies32; } static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); /* Track the strongest available signal of the degree to which the cwnd * is fully utilized. If cwnd-limited then remember that fact for the * current window. If not cwnd-limited then track the maximum number of * outstanding packets in the current window. (If cwnd-limited then we * chose to not update tp->max_packets_out to avoid an extra else * clause with no functional impact.) */ if (!before(tp->snd_una, tp->cwnd_usage_seq) || is_cwnd_limited || (!tp->is_cwnd_limited && tp->packets_out > tp->max_packets_out)) { tp->is_cwnd_limited = is_cwnd_limited; tp->max_packets_out = tp->packets_out; tp->cwnd_usage_seq = tp->snd_nxt; } if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_jiffies32; } else { /* Network starves. */ if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) && (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); /* The following conditions together indicate the starvation * is caused by insufficient sender buffer: * 1) just sent some data (see tcp_write_xmit) * 2) not cwnd limited (this else condition) * 3) no more data to send (tcp_write_queue_empty()) * 4) application is hitting buffer limit (SOCK_NOSPACE) */ if (tcp_write_queue_empty(sk) && sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); } } /* Minshall's variant of the Nagle send check. */ static bool tcp_minshall_check(const struct tcp_sock *tp) { return after(tp->snd_sml, tp->snd_una) && !after(tp->snd_sml, tp->snd_nxt); } /* Update snd_sml if this skb is under mss * Note that a TSO packet might end with a sub-mss segment * The test is really : * if ((skb->len % mss) != 0) * tp->snd_sml = TCP_SKB_CB(skb)->end_seq; * But we can avoid doing the divide again given we already have * skb_pcount = skb->len / mss_now */ static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, const struct sk_buff *skb) { if (skb->len < tcp_skb_pcount(skb) * mss_now) tp->snd_sml = TCP_SKB_CB(skb)->end_seq; } /* Return false, if packet can be sent now without violation Nagle's rules: * 1. It is full sized. (provided by caller in %partial bool) * 2. Or it contains FIN. (already checked by caller) * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. */ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, int nonagle) { return partial && ((nonagle & TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } /* Return how many segs we'd like on a TSO packet, * depending on current pacing rate, and how close the peer is. * * Rationale is: * - For close peers, we rather send bigger packets to reduce * cpu costs, because occasional losses will be repaired fast. * - For long distance/rtt flows, we would like to get ACK clocking * with 1 ACK per ms. * * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting * in bigger TSO bursts. We we cut the RTT-based allowance in half * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance * is below 1500 bytes after 6 * ~500 usec = 3ms. */ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs) { unsigned long bytes; u32 r; bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log); if (r < BITS_PER_TYPE(sk->sk_gso_max_size)) bytes += sk->sk_gso_max_size >> r; bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size); return max_t(u32, bytes / mss_now, min_tso_segs); } /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. */ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; u32 min_tso, tso_segs; min_tso = ca_ops->min_tso_segs ? ca_ops->min_tso_segs(sk) : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, unsigned int mss_now, unsigned int max_segs, int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); u32 partial, needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; max_len = mss_now * max_segs; if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) return max_len; needed = min(skb->len, window); if (max_len <= needed) return max_len; partial = needed % mss_now; /* If last segment is not a full MSS, check if Nagle rules allow us * to include this last segment in this skb. * Otherwise, we'll split the skb at last MSS boundary */ if (tcp_nagle_check(partial != 0, tp, nonagle)) return needed - partial; return needed; } /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ static u32 tcp_cwnd_test(const struct tcp_sock *tp) { u32 in_flight, cwnd, halfcwnd; in_flight = tcp_packets_in_flight(tp); cwnd = tcp_snd_cwnd(tp); if (in_flight >= cwnd) return 0; /* For better scheduling, ensure we have at least * 2 GSO packets in flight. */ halfcwnd = max(cwnd >> 1, 1U); return min(halfcwnd, cwnd - in_flight); } /* Initialize TSO state of a skb. * This must be invoked the first time we consider transmitting * SKB onto the wire. */ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) return tcp_set_skb_tso_segs(skb, mss_now); return tso_segs; } /* Return true if the Nagle test allows this packet to be * sent now. */ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned int cur_mss, int nonagle) { /* Nagle rule does not apply to frames, which sit in the middle of the * write_queue (they have no chances to get new data). * * This is implemented in the callers, where they modify the 'nonagle' * argument based upon the location of SKB in the send queue. */ if (nonagle & TCP_NAGLE_PUSH) return true; /* Don't use the nagle rule for urgent data (or for the final FIN). */ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) return true; return false; } /* Does at least the first segment of SKB fit into the send window? */ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned int cur_mss) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (skb->len > cur_mss) end_seq = TCP_SKB_CB(skb)->seq + cur_mss; return !after(end_seq, tcp_wnd_end(tp)); } /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. It is very much like * tcp_fragment() except that it may make several kinds of assumptions * in order to speed up the splitting operation. In particular, we * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now, gfp_t gfp) { int nlen = skb->len - len; struct sk_buff *buff; u16 flags; /* All of a TSO frame must be composed of paged data. */ DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len); buff = tcp_stream_alloc_skb(sk, gfp, true); if (unlikely(!buff)) return -ENOMEM; skb_copy_decrypted(buff, skb); mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; tcp_skb_fragment_eor(skb, buff); skb_split(skb, buff, len); tcp_fragment_tstamp(skb, buff); /* Fix up tso_factor for both original and new SKB. */ tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE); return 0; } /* Try to defer sending, if possible, in order to minimize the amount * of TSO splitting we do. View it as a kind of TSO Nagle test. * * This algorithm is from John Heffner. */ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, bool *is_cwnd_limited, bool *is_rwnd_limited, u32 max_segs) { const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight, threshold; u64 srtt_in_ns, expected_ack, how_far_is_the_ack; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *head; int win_divisor; s64 delta; if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; /* Avoid bursty behavior by allowing defer * only if the last write was recent (1 ms). * Note that tp->tcp_wstamp_ns can be in the future if we have * packets waiting in a qdisc or device for EDT delivery. */ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC; if (delta > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); BUG_ON(tcp_skb_pcount(skb) <= 1); BUG_ON(tcp_snd_cwnd(tp) <= in_flight); send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* From in_flight test above, we know that cwnd > in_flight. */ cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache; limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ if (limit >= max_segs * tp->mss_cache) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache); /* If at least some fraction of a window is available, * just use it. */ chunk /= win_divisor; if (limit >= chunk) goto send_now; } else { /* Different approach, try not to defer past a single * ACK. Receiver should ACK every other full sized * frame, so if we have space for more than 3 frames * then send now. */ if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache) goto send_now; } /* TODO : use tsorted_sent_queue ? */ head = tcp_rtx_queue_head(sk); if (!head) goto send_now; srtt_in_ns = (u64)(NSEC_PER_USEC >> 3) * tp->srtt_us; /* When is the ACK expected ? */ expected_ack = head->tstamp + srtt_in_ns; /* How far from now is the ACK expected ? */ how_far_is_the_ack = expected_ack - tp->tcp_clock_cache; /* If next ACK is likely to come too late, * ie in more than min(1ms, half srtt), do not defer. */ threshold = min(srtt_in_ns >> 1, NSEC_PER_MSEC); if ((s64)(how_far_is_the_ack - threshold) > 0) goto send_now; /* Ok, it looks like it is advisable to defer. * Three cases are tracked : * 1) We are cwnd-limited * 2) We are rwnd-limited * 3) We are application limited. */ if (cong_win < send_win) { if (cong_win <= skb->len) { *is_cwnd_limited = true; return true; } } else { if (send_win <= skb->len) { *is_rwnd_limited = true; return true; } } /* If this packet won't get more data, do not wait. */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || TCP_SKB_CB(skb)->eor) goto send_now; return true; send_now: return false; } static inline void tcp_mtu_check_reprobe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); u32 interval; s32 delta; interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval); delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp; if (unlikely(delta >= interval * HZ)) { int mss = tcp_current_mss(sk); /* Update current search range */ icsk->icsk_mtup.probe_size = 0; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); /* Update probe time stamp */ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } } static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) { struct sk_buff *skb, *next; skb = tcp_send_head(sk); tcp_for_write_queue_from_safe(skb, next, sk) { if (len <= skb->len) break; if (tcp_has_tx_tstamp(skb) || !tcp_skb_can_collapse(skb, next)) return false; len -= skb->len; } return true; } static int tcp_clone_payload(struct sock *sk, struct sk_buff *to, int probe_size) { skb_frag_t *lastfrag = NULL, *fragto = skb_shinfo(to)->frags; int i, todo, len = 0, nr_frags = 0; const struct sk_buff *skb; if (!sk_wmem_schedule(sk, to->truesize + probe_size)) return -ENOMEM; skb_queue_walk(&sk->sk_write_queue, skb) { const skb_frag_t *fragfrom = skb_shinfo(skb)->frags; if (skb_headlen(skb)) return -EINVAL; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) { if (len >= probe_size) goto commit; todo = min_t(int, skb_frag_size(fragfrom), probe_size - len); len += todo; if (lastfrag && skb_frag_page(fragfrom) == skb_frag_page(lastfrag) && skb_frag_off(fragfrom) == skb_frag_off(lastfrag) + skb_frag_size(lastfrag)) { skb_frag_size_add(lastfrag, todo); continue; } if (unlikely(nr_frags == MAX_SKB_FRAGS)) return -E2BIG; skb_frag_page_copy(fragto, fragfrom); skb_frag_off_copy(fragto, fragfrom); skb_frag_size_set(fragto, todo); nr_frags++; lastfrag = fragto++; } } commit: WARN_ON_ONCE(len != probe_size); for (i = 0; i < nr_frags; i++) skb_frag_ref(to, i); skb_shinfo(to)->nr_frags = nr_frags; to->truesize += probe_size; to->len += probe_size; to->data_len += probe_size; __skb_header_release(to); return 0; } /* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if * all its payload was moved to another one (dst). * Make sure to transfer tcp_flags, eor, and tstamp. */ static void tcp_eat_one_skb(struct sock *sk, struct sk_buff *dst, struct sk_buff *src) { TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags; TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor; tcp_skb_collapse_tstamp(dst, src); tcp_unlink_write_queue(src, sk); tcp_wmem_free_skb(sk, src); } /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing * changes resulting in larger path MTUs. * * Returns 0 if we should wait to probe (no cwnd available), * 1 if a probe was sent, * -1 otherwise */ static int tcp_mtu_probe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *nskb, *next; struct net *net = sock_net(sk); int probe_size; int size_needed; int copy, len; int mss_now; int interval; /* Not currently probing/verifying, * not in recovery, * have enough cwnd, and * not SACKing (the variable headers throw things off) */ if (likely(!icsk->icsk_mtup.enabled || icsk->icsk_mtup.probe_size || inet_csk(sk)->icsk_ca_state != TCP_CA_Open || tcp_snd_cwnd(tp) < 11 || tp->rx_opt.num_sacks || tp->rx_opt.dsack)) return -1; /* Use binary search for probe_size between tcp_mss_base, * and current mss_clamp. if (search_high - search_low) * smaller than a threshold, backoff from probing. */ mss_now = tcp_current_mss(sk); probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high + icsk->icsk_mtup.search_low) >> 1); size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low; /* When misfortune happens, we are reprobing actively, * and then reprobe timer has expired. We stick with current * probing process by not resetting search range to its orignal. */ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) { /* Check whether enough time has elaplased for * another round of probing. */ tcp_mtu_check_reprobe(sk); return -1; } /* Have enough data in the send queue to probe? */ if (tp->write_seq - tp->snd_nxt < size_needed) return -1; if (tp->snd_wnd < size_needed) return -1; if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp))) return 0; /* Do we need to wait to drain cwnd? With none in flight, don't stall */ if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) { if (!tcp_packets_in_flight(tp)) return -1; else return 0; } if (!tcp_can_coalesce_send_queue_head(sk, probe_size)) return -1; /* We're allowed to probe. Build it now. */ nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, false); if (!nskb) return -1; /* build the payload, and be prepared to abort if this fails. */ if (tcp_clone_payload(sk, nskb, probe_size)) { tcp_skb_tsorted_anchor_cleanup(nskb); consume_skb(nskb); return -1; } sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); skb_copy_decrypted(nskb, skb); mptcp_skb_ext_copy(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; tcp_insert_write_queue_before(nskb, skb, sk); tcp_highest_sack_replace(sk, skb, nskb); len = 0; tcp_for_write_queue_from_safe(skb, next, sk) { copy = min_t(int, skb->len, probe_size - len); if (skb->len <= copy) { tcp_eat_one_skb(sk, nskb, skb); } else { TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH); __pskb_trim_head(skb, copy); tcp_set_skb_tso_segs(skb, mss_now); TCP_SKB_CB(skb)->seq += copy; } len += copy; if (len >= probe_size) break; } tcp_init_tso_segs(nskb, nskb->len); /* We're ready to send. If this fails, the probe will * be resegmented into mss-sized pieces by tcp_write_xmit(). */ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending * effectively two packets. */ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1); tcp_event_new_data_sent(sk, nskb); icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq; return 1; } return -1; } static bool tcp_pacing_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_needs_internal_pacing(sk)) return false; if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache) return false; if (!hrtimer_is_queued(&tp->pacing_timer)) { hrtimer_start(&tp->pacing_timer, ns_to_ktime(tp->tcp_wstamp_ns), HRTIMER_MODE_ABS_PINNED_SOFT); sock_hold(sk); } return true; } static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk) { const struct rb_node *node = sk->tcp_rtx_queue.rb_node; /* No skb in the rtx queue. */ if (!node) return true; /* Only one skb in rtx queue. */ return !node->rb_left && !node->rb_right; } /* TCP Small Queues : * Control number of packets in qdisc/devices to two packets / or ~1 ms. * (These limits are doubled for retransmits) * This allows for : * - better RTT estimation and ACK scheduling * - faster recovery * - high rates * Alas, some drivers / subsystems require a fair amount * of queued bytes to ensure line rate. * One example is wifi aggregation (802.11 AMPDU) */ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, unsigned int factor) { unsigned long limit; limit = max_t(unsigned long, 2 * skb->truesize, READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift)); limit = min_t(unsigned long, limit, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); limit <<= factor; if (static_branch_unlikely(&tcp_tx_delay_enabled) && tcp_sk(sk)->tcp_tx_delay) { u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) * tcp_sk(sk)->tcp_tx_delay; /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we * approximate our needs assuming an ~100% skb->truesize overhead. * USEC_PER_SEC is approximated by 2^20. * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift. */ extra_bytes >>= (20 - 1); limit += extra_bytes; } if (refcount_read(&sk->sk_wmem_alloc) > limit) { /* Always send skb if rtx queue is empty or has one skb. * No need to wait for TX completion to call us back, * after softirq schedule. * This helps when TX completions are delayed too much. */ if (tcp_rtx_queue_empty_or_single_skb(sk)) return false; set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); /* It is possible TX completion already happened * before we set TSQ_THROTTLED, so we must * test again the condition. */ smp_mb__after_atomic(); if (refcount_read(&sk->sk_wmem_alloc) > limit) return true; } return false; } static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) { const u32 now = tcp_jiffies32; enum tcp_chrono old = tp->chrono_type; if (old > TCP_CHRONO_UNSPEC) tp->chrono_stat[old - 1] += now - tp->chrono_start; tp->chrono_start = now; tp->chrono_type = new; } void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) { struct tcp_sock *tp = tcp_sk(sk); /* If there are multiple conditions worthy of tracking in a * chronograph then the highest priority enum takes precedence * over the other conditions. So that if something "more interesting" * starts happening, stop the previous chrono and start a new one. */ if (type > tp->chrono_type) tcp_chrono_set(tp, type); } void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) { struct tcp_sock *tp = tcp_sk(sk); /* There are multiple conditions worthy of tracking in a * chronograph, so that the highest priority enum takes * precedence over the other conditions (see tcp_chrono_start). * If a condition stops, we only stop chrono tracking if * it's the "most interesting" or current chrono we are * tracking and starts busy chrono if we have pending data. */ if (tcp_rtx_and_write_queues_empty(sk)) tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); else if (type == tp->chrono_type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); } /* First skb in the write queue is smaller than ideal packet size. * Check if we can move payload from the second skb in the queue. */ static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount) { struct sk_buff *next_skb = skb->next; unsigned int nlen; if (tcp_skb_is_last(sk, skb)) return; if (!tcp_skb_can_collapse(skb, next_skb)) return; nlen = min_t(u32, amount, next_skb->len); if (!nlen || !skb_shift(skb, next_skb, nlen)) return; TCP_SKB_CB(skb)->end_seq += nlen; TCP_SKB_CB(next_skb)->seq += nlen; if (!next_skb->len) { /* In case FIN is set, we need to update end_seq */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; tcp_eat_one_skb(sk, skb, next_skb); } } /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. * * LARGESEND note: !tcp_urg_mode is overkill, only frames between * snd_up-64k-mss .. snd_up cannot be large. However, taking into * account rare use of URG, this is not a big flaw. * * Send at most one packet when push_one > 0. Temporarily ignore * cwnd limit to force at most one packet out when push_one == 2. * Returns true, if no segments are in flight and we have queued segments, * but cannot send anything now because of SWS or another problem. */ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int tso_segs, sent_pkts; u32 cwnd_quota, max_segs; int result; bool is_cwnd_limited = false, is_rwnd_limited = false; sent_pkts = 0; tcp_mstamp_refresh(tp); /* AccECN option beacon depends on mstamp, it may change mss */ if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk)) mss_now = tcp_current_mss(sk); if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk); if (!result) { return false; } else if (result > 0) { sent_pkts = 1; } } max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; int missing_bytes; if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ tp->tcp_wstamp_ns = tp->tcp_clock_cache; skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ } if (tcp_pacing_check(sk)) break; cwnd_quota = tcp_cwnd_test(tp); if (!cwnd_quota) { if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; else break; } cwnd_quota = min(cwnd_quota, max_segs); missing_bytes = cwnd_quota * mss_now - skb->len; if (missing_bytes > 0) tcp_grow_skb(sk, skb, missing_bytes); tso_segs = tcp_set_skb_tso_segs(skb, mss_now); if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { is_rwnd_limited = true; break; } if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)))) break; } else { if (!push_one && tcp_tso_should_defer(sk, skb, &is_cwnd_limited, &is_rwnd_limited, max_segs)) break; } limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, cwnd_quota, nonagle); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; if (tcp_small_queue_check(sk, skb, 0)) break; /* Argh, we hit an empty skb(), presumably a thread * is sleeping in sendmsg()/sk_stream_wait_memory(). * We do not want to send a pure-ack packet and have * a strange looking rtx queue with empty packet(s). */ if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) break; if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; repair: /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ tcp_event_new_data_sent(sk, skb); tcp_minshall_update(tp, mss_now, skb); sent_pkts += tcp_skb_pcount(skb); if (push_one) break; } if (is_rwnd_limited) tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED); else tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp)); if (likely(sent_pkts || is_cwnd_limited)) tcp_cwnd_validate(sk, is_cwnd_limited); if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk, false); return false; } return !tp->packets_out && !tcp_write_queue_empty(sk); } bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, timeout_us, rto_delta_us; int early_retrans; /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ if (rcu_access_pointer(tp->fastopen_rsk)) return false; early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans); /* Schedule a loss probe in 2*RTT for SACK capable connections * not in loss recovery, that are either limited by cwnd or application. */ if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || (icsk->icsk_ca_state != TCP_CA_Open && icsk->icsk_ca_state != TCP_CA_CWR)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account * for delayed ack when there's one outstanding packet. If no RTT * sample is available then probe after TCP_TIMEOUT_INIT. */ if (tp->srtt_us) { timeout_us = tp->srtt_us >> 2; if (tp->packets_out == 1) timeout_us += tcp_rto_min_us(sk); else timeout_us += TCP_TIMEOUT_MIN_US; timeout = usecs_to_jiffies(timeout_us); } else { timeout = TCP_TIMEOUT_INIT; } /* If the RTO formula yields an earlier time, then use that time. */ rto_delta_us = advancing_rto ? jiffies_to_usecs(inet_csk(sk)->icsk_rto) : tcp_rto_delta_us(sk); /* How far in future is RTO? */ if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, true); return true; } /* Thanks to skb fast clones, we can detect if a prior transmit of * a packet is still in a qdisc or driver queue. * In this case, there is very little point doing a retransmit ! */ static bool skb_still_in_host_queue(struct sock *sk, const struct sk_buff *skb) { if (unlikely(skb_fclone_busy(sk, skb))) { set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); smp_mb__after_atomic(); if (skb_fclone_busy(sk, skb)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; } } return false; } /* When probe timeout (PTO) fires, try send a new segment if possible, else * retransmit the last segment. */ void tcp_send_loss_probe(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int pcount; int mss = tcp_current_mss(sk); /* At most one outstanding TLP */ if (tp->tlp_high_seq) goto rearm_timer; tp->tlp_retrans = 0; skb = tcp_send_head(sk); if (skb && tcp_snd_wnd_test(tp, skb, mss)) { pcount = tp->packets_out; tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); if (tp->packets_out > pcount) goto probe_sent; goto rearm_timer; } skb = skb_rb_last(&sk->tcp_rtx_queue); if (unlikely(!skb)) { tcp_warn_once(sk, tp->packets_out, "invalid inflight: "); smp_store_release(&inet_csk(sk)->icsk_pending, 0); return; } if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; pcount = tcp_skb_pcount(skb); if (WARN_ON(!pcount)) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; skb = skb_rb_next(skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; if (__tcp_retransmit_skb(sk, skb, 1)) goto rearm_timer; tp->tlp_retrans = 1; probe_sent: /* Record snd_nxt for loss detection. */ tp->tlp_high_seq = tp->snd_nxt; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. tcp_rearm_rto will restart timer from now */ smp_store_release(&inet_csk(sk)->icsk_pending, 0); rearm_timer: tcp_rearm_rto(sk); } /* Push out any pending frames which were held back due to * TCP_CORK or attempt at coalescing tiny packets. * The socket must be locked by the caller. */ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle) { /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and * all will be happy. */ if (unlikely(sk->sk_state == TCP_CLOSE)) return; if (tcp_write_xmit(sk, cur_mss, nonagle, 0, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. */ void tcp_push_one(struct sock *sk, unsigned int mss_now) { struct sk_buff *skb = tcp_send_head(sk); BUG_ON(!skb || skb->len < mss_now); tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); } /* This function returns the amount that we can raise the * usable window based on the following constraints * * 1. The window can never be shrunk once it is offered (RFC 793) * 2. We limit memory per socket * * RFC 1122: * "the suggested [SWS] avoidance algorithm for the receiver is to keep * RECV.NEXT + RCV.WIN fixed until: * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" * * i.e. don't raise the right edge of the window until you can raise * it at least MSS bytes. * * Unfortunately, the recommended algorithm breaks header prediction, * since header prediction assumes th->window stays fixed. * * Strictly speaking, keeping th->window fixed violates the receiver * side SWS prevention criteria. The problem is that under this rule * a stream of single byte packets will cause the right side of the * window to always advance by a single byte. * * Of course, if the sender implements sender side SWS prevention * then this will not be a problem. * * BSD seems to make the following compromise: * * If the free space is less than the 1/4 of the maximum * space available and the free space is less than 1/2 mss, * then set the window to 0. * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ] * Otherwise, just prevent the window from shrinking * and from being larger than the largest representable value. * * This prevents incremental opening of the window in the regime * where TCP is limited by the speed of the reader side taking * data out of the TCP receive queue. It does nothing about * those cases where the window is constrained on the sender side * because the pipeline is full. * * BSD also seems to "accidentally" limit itself to windows that are a * multiple of MSS, at least until the free space gets quite small. * This would appear to be a side effect of the mbuf implementation. * Combining these two algorithms results in the observed behavior * of having a fixed window size at almost all times. * * Below we obtain similar behavior by forcing the offered window to * a multiple of the mss when it is feasible to do so. * * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. * Regular options like TIMESTAMP are taken into account. */ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); /* MSS for the peer's data. Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct * but may be worse for the performance because of rcv_mss * fluctuations. --SAW 1998/11/1 */ int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int allowed_space = tcp_full_space(sk); int full_space, window; if (sk_is_mptcp(sk)) mptcp_space(sk, &free_space, &allowed_space); full_space = min_t(int, tp->window_clamp, allowed_space); if (unlikely(mss > full_space)) { mss = full_space; if (mss <= 0) return 0; } /* Only allow window shrink if the sysctl is enabled and we have * a non-zero scaling factor in effect. */ if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale) goto shrink_window_allowed; /* do not allow window to shrink */ if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; if (tcp_under_memory_pressure(sk)) tcp_adjust_rcv_ssthresh(sk); /* free_space might become our new window, make sure we don't * increase it due to wscale. */ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); /* if free space is less than mss estimate, or is below 1/16th * of the maximum allowed, try to move to zero-window, else * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and * new incoming data is dropped due to memory limits. * With large window, mss test triggers way too late in order * to announce zero window in time before rmem limit kicks in. */ if (free_space < (allowed_space >> 4) || free_space < mss) return 0; } if (free_space > tp->rcv_ssthresh) free_space = tp->rcv_ssthresh; /* Don't do rounding if we are using window scaling, since the * scaled window will not line up with the MSS boundary anyway. */ if (tp->rx_opt.rcv_wscale) { window = free_space; /* Advertise enough space so that it won't get scaled away. * Import case: prevent zero window announcement if * 1<<rcv_wscale > mss. */ window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale)); } else { window = tp->rcv_wnd; /* Get the largest window that is a nice multiple of mss. * Window clamp already applied above. * If our current window offering is within 1 mss of the * free space we just keep it. This prevents the divide * and multiply from happening most of the time. * We also don't do any window rounding when the free space * is too small. */ if (window <= free_space - mss || window > free_space) window = rounddown(free_space, mss); else if (mss == full_space && free_space > window + (full_space >> 1)) window = free_space; } return window; shrink_window_allowed: /* new window should always be an exact multiple of scaling factor */ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; if (tcp_under_memory_pressure(sk)) tcp_adjust_rcv_ssthresh(sk); /* if free space is too low, return a zero window */ if (free_space < (allowed_space >> 4) || free_space < mss || free_space < (1 << tp->rx_opt.rcv_wscale)) return 0; } if (free_space > tp->rcv_ssthresh) { free_space = tp->rcv_ssthresh; /* new window should always be an exact multiple of scaling factor * * For this case, we ALIGN "up" (increase free_space) because * we know free_space is not zero here, it has been reduced from * the memory-based limit, and rcv_ssthresh is not a hard limit * (unlike sk_rcvbuf). */ free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale)); } return free_space; } void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb) { if (unlikely(tcp_has_tx_tstamp(next_skb))) { const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb); struct skb_shared_info *shinfo = skb_shinfo(skb); shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; shinfo->tskey = next_shinfo->tskey; TCP_SKB_CB(skb)->txstamp_ack |= TCP_SKB_CB(next_skb)->txstamp_ack; } } /* Collapses two adjacent SKB's during retransmission. */ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = skb_rb_next(skb); int next_skb_size; next_skb_size = next_skb->len; BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size)) return false; tcp_highest_sack_replace(sk, next_skb, skb); /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; /* Merge over control information. This moves PSH/FIN etc. over */ TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; /* All done, get rid of second SKB and account for it so * packet counting does not break. */ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; /* changed transmit queue under us so clear hints */ if (next_skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = skb; tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb)); tcp_skb_collapse_tstamp(skb, next_skb); tcp_rtx_queue_unlink_and_free(next_skb, sk); return true; } /* Check if coalescing SKBs is legal. */ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) return false; if (skb_cloned(skb)) return false; if (!skb_frags_readable(skb)) return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; return true; } /* Collapse packets in the retransmit queue to make to create * less packets on the wire. This is only done on retransmission. */ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, int space) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = to, *tmp; bool first = true; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)) return; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; skb_rbtree_walk_from_safe(skb, tmp) { if (!tcp_can_collapse(sk, skb)) break; if (!tcp_skb_can_collapse(to, skb)) break; space -= skb->len; if (first) { first = false; continue; } if (space < 0) break; if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) break; if (!tcp_collapse_retrans(sk, to)) break; } } /* This retransmits one SKB. Policy decisions and retransmit queue * state updates are done by the caller. Returns non-zero if an * error occurred which prevented the send. */ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int cur_mss; int diff, len, err; int avail_wnd; /* Inconclusive MTU probe */ if (icsk->icsk_mtup.probe_size) icsk->icsk_mtup.probe_size = 0; if (skb_still_in_host_queue(sk, skb)) { err = -EBUSY; goto out; } start: if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; TCP_SKB_CB(skb)->seq++; goto start; } if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) { WARN_ON_ONCE(1); err = -EINVAL; goto out; } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) { err = -ENOMEM; goto out; } } if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) { err = -EHOSTUNREACH; /* Routing failure or similar. */ goto out; } cur_mss = tcp_current_mss(sk); avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the * case, when window is shrunk to zero. In this case * our retransmit of one segment serves as a zero window probe. */ if (avail_wnd <= 0) { if (TCP_SKB_CB(skb)->seq != tp->snd_una) { err = -EAGAIN; goto out; } avail_wnd = cur_mss; } len = cur_mss * segs; if (len > avail_wnd) { len = rounddown(avail_wnd, cur_mss); if (!len) len = avail_wnd; } if (skb->len > len) { if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, cur_mss, GFP_ATOMIC)) { err = -ENOMEM; /* We'll try again later. */ goto out; } } else { if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) { err = -ENOMEM; goto out; } diff = tcp_skb_pcount(skb); tcp_set_skb_tso_segs(skb, cur_mss); diff -= tcp_skb_pcount(skb); if (diff) tcp_adjust_pcount(sk, skb, diff); avail_wnd = min_t(int, avail_wnd, cur_mss); if (skb->len < avail_wnd) tcp_retrans_try_collapse(sk, skb, avail_wnd); } /* RFC3168, section 6.1.1.1. ECN fallback * As AccECN uses the same SYN flags (+ AE), this check covers both * cases. */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); /* Update global and local TCP statistics. */ segs = tcp_skb_pcount(skb); TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); tp->total_retrans += segs; tp->bytes_retrans += skb->len; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom * beyond what csum_start can cover. */ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; tcp_skb_tsorted_save(skb) { nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); if (nskb) { nskb->dev = NULL; err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC); } else { err = -ENOBUFS; } } tcp_skb_tsorted_restore(skb); if (!err) { tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns); tcp_rate_skb_sent(sk, skb); } } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); if (unlikely(err) && err != -EBUSY) NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); /* To avoid taking spuriously low RTT samples based on a timestamp * for a transmit that never happened, always mark EVER_RETRANS */ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; out: trace_tcp_retransmit_skb(sk, skb, err); return err; } int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { struct tcp_sock *tp = tcp_sk(sk); int err = __tcp_retransmit_skb(sk, skb, segs); if (err == 0) { #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { net_dbg_ratelimited("retrans_out leaked\n"); } #endif TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); } /* Save stamp of the first (attempted) retransmit. */ if (!tp->retrans_stamp) tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb); if (tp->undo_retrans < 0) tp->undo_retrans = 0; tp->undo_retrans += tcp_skb_pcount(skb); return err; } /* This gets called after a retransmit timeout, and the initially * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either * we've sent it all or the congestion window limit is reached. */ void tcp_xmit_retransmit_queue(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); bool rearm_timer = false; u32 max_segs; int mib_idx; if (!tp->packets_out) return; rtx_head = tcp_rtx_queue_head(sk); skb = tp->retransmit_skb_hint ?: rtx_head; max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); skb_rbtree_walk_from(skb) { __u8 sacked; int segs; if (tcp_pacing_check(sk)) break; /* we could do better than to assign each time */ if (!hole) tp->retransmit_skb_hint = skb; segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp); if (segs <= 0) break; sacked = TCP_SKB_CB(skb)->sacked; /* In case tcp_shift_skb_data() have aggregated large skbs, * we need to make sure not sending too bigs TSO packets */ segs = min_t(int, segs, max_segs); if (tp->retrans_out >= tp->lost_out) { break; } else if (!(sacked & TCPCB_LOST)) { if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; } else { if (icsk->icsk_ca_state != TCP_CA_Loss) mib_idx = LINUX_MIB_TCPFASTRETRANS; else mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; } if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; if (tcp_small_queue_check(sk, skb, 1)) break; if (tcp_retransmit_skb(sk, skb, segs)) break; NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) rearm_timer = true; } if (rearm_timer) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, true); } /* We allow to exceed memory limits for FIN packets to expedite * connection tear down and (memory) recovery. * Otherwise tcp_send_fin() could be tempted to either delay FIN * or even be forced to close flow without any FIN. * In general, we want to allow one skb per socket to avoid hangs * with edge trigger epoll() */ void sk_forced_mem_schedule(struct sock *sk, int size) { int delta, amt; delta = size - sk->sk_forward_alloc; if (delta <= 0) return; amt = sk_mem_pages(delta); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); if (mem_cgroup_sk_enabled(sk)) mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); if (sk->sk_bypass_prot_mem) return; sk_memory_allocated_add(sk, amt); } /* Send a FIN. The caller locks the socket for us. * We should try to send a FIN packet really hard, but eventually give up. */ void tcp_send_fin(struct sock *sk) { struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); /* Optimization, tack on the FIN if we have one skb in write queue and * this skb was not yet sent, or we are under memory pressure. * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ tskb = tail; if (!tskb && tcp_under_memory_pressure(sk)) tskb = skb_rb_last(&sk->tcp_rtx_queue); if (tskb) { TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; if (!tail) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have * if FIN had been sent. This is because retransmit path * does not change tp->snd_nxt. */ WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1); return; } } else { skb = alloc_skb_fclone(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (unlikely(!skb)) return; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, sk, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); } /* We get here when a process closes a file descriptor (either due to * an explicit close() or as a byproduct of exit()'ing) and there * was unread data in the receive queue. This behavior is recommended * by RFC 2525, section 2.17. -DaveM */ void tcp_send_active_reset(struct sock *sk, gfp_t priority, enum sk_rst_reason reason) { struct sk_buff *skb; TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); /* NOTE: No TCP options attached and we never retransmit this. */ skb = alloc_skb(MAX_TCP_HEADER, priority); if (!skb) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, sk, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); tcp_mstamp_refresh(tcp_sk(sk)); /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); /* skb of trace_tcp_send_reset() keeps the skb that caused RST, * skb here is different to the troublesome skb, so use NULL */ trace_tcp_send_reset(sk, NULL, reason); } /* Send a crossed SYN-ACK during socket establishment. * WARNING: This routine must only be called when we have already sent * a SYN packet that crossed the incoming SYN that caused this routine * to get called. If this assumption fails then the initial rcv_wnd * and rcv_wscale values will not be correct. */ int tcp_send_synack(struct sock *sk) { struct sk_buff *skb; skb = tcp_rtx_queue_head(sk); if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb; tcp_skb_tsorted_save(skb) { nskb = skb_copy(skb, GFP_ATOMIC); } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); tcp_highest_sack_replace(sk, skb, nskb); tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = nskb; } TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; tcp_ecn_send_synack(sk, skb); } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } /** * tcp_make_synack - Allocate one skb and build a SYNACK packet. * @sk: listener socket * @dst: dst entry attached to the SYNACK. It is consumed and caller * should not use it again. * @req: request_sock pointer * @foc: cookie for tcp fast open * @synack_type: Type of synack to prepare * @syn_skb: SYN packet just received. It could be NULL for rtx case. */ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); struct tcp_out_options opts; struct tcp_key key = {}; struct sk_buff *skb; int tcp_header_size; struct tcphdr *th; int mss; u64 now; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; } /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); switch (synack_type) { case TCP_SYNACK_NORMAL: skb_set_owner_edemux(skb, req_to_sk(req)); break; case TCP_SYNACK_COOKIE: /* Under synflood, we do not attach skb to a socket, * to avoid false sharing. */ break; case TCP_SYNACK_FASTOPEN: /* sk is a const pointer, because we want to express multiple * cpu might call us concurrently. * sk->sk_wmem_alloc in an atomic, we can promote to rw. */ skb_set_owner_w(skb, (struct sock *)sk); break; } skb_dst_set(skb, dst); mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) skb_set_delivery_time(skb, cookie_init_timestamp(req, now), SKB_CLOCK_MONOTONIC); else #endif { skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_lock(); #endif if (tcp_rsk_used_ao(req)) { #ifdef CONFIG_TCP_AO struct tcp_ao_key *ao_key = NULL; u8 keyid = tcp_rsk(req)->ao_keyid; u8 rnext = tcp_rsk(req)->ao_rcv_next; ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req), keyid, -1); /* If there is no matching key - avoid sending anything, * especially usigned segments. It could try harder and lookup * for another peer-matching key, but the peer has requested * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here. */ if (unlikely(!ao_key)) { trace_tcp_ao_synack_no_key(sk, keyid, rnext); rcu_read_unlock(); kfree_skb(skb); net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n", keyid); return NULL; } key.ao_key = ao_key; key.type = TCP_KEY_AO; #endif } else { #ifdef CONFIG_TCP_MD5SIG key.md5_key = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4); /* bpf program will be interested in the tcp_flags */ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &key, foc, synack_type, syn_skb) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); th = (struct tcphdr *)skb->data; memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; tcp_ecn_make_synack(req, th); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; skb->mark = ireq->ir_mark; skb->ip_summed = CHECKSUM_PARTIAL; th->seq = htonl(tcp_rsk(req)->snt_isn); /* XXX data is queued and acked as is. No buffer/window check */ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rsk_rcv_wnd, 65535U)); tcp_options_write(th, NULL, tcp_rsk(req), &opts, &key); th->doff = (tcp_header_size >> 2); TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); /* Okay, we have all we need - do the md5 hash if needed */ if (tcp_key_is_md5(&key)) { #ifdef CONFIG_TCP_MD5SIG tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, key.md5_key, req_to_sk(req), skb); #endif } else if (tcp_key_is_ao(&key)) { #ifdef CONFIG_TCP_AO tcp_rsk(req)->af_specific->ao_synack_hash(opts.hash_location, key.ao_key, req, skb, opts.hash_location - (u8 *)th, 0); #endif } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_unlock(); #endif bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, synack_type, &opts); skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); tcp_add_tx_delay(skb, tp); return skb; } EXPORT_IPV6_MOD(tcp_make_synack); static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); if (ca_key == TCP_CA_UNSPEC) return; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && bpf_try_module_get(ca, ca->owner))) { bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; } rcu_read_unlock(); } /* Do all connect socket setups that can be done AF independent. */ static void tcp_connect_init(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; u16 user_mss; u32 rcv_wnd; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. */ tp->tcp_header_len = sizeof(struct tcphdr); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; tcp_ao_connect_init(sk); /* If user gave his TCP_MAXSEG, record it to clamp */ user_mss = READ_ONCE(tp->rx_opt.user_mss); if (user_mss) tp->rx_opt.mss_clamp = user_mss; tp->max_window = 0; tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); tcp_ca_dst_init(sk, dst); if (!tp->window_clamp) WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW)); tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); tcp_initialize_rcv_mss(sk); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) WRITE_ONCE(tp->window_clamp, tcp_full_space(sk)); rcv_wnd = tcp_rwnd_init_bpf(sk); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); tcp_select_initial_window(sk, tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling), &rcv_wscale, rcv_wnd); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; WRITE_ONCE(sk->sk_err, 0); sock_reset_flag(sk, SOCK_DONE); tp->snd_wnd = 0; tcp_init_wl(tp, 0); tcp_write_queue_purge(sk); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; WRITE_ONCE(tp->snd_nxt, tp->write_seq); if (likely(!tp->repair)) tp->rcv_nxt = 0; else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); tcp_clear_retrans(tp); } static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->end_seq += skb->len; __skb_header_release(skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); WRITE_ONCE(tp->write_seq, tcb->end_seq); tp->packets_out += tcp_skb_pcount(skb); } /* Build and send a SYN with data and (cached) Fast Open cookie. However, * queue a data-only packet after the regular SYN, such that regular SYNs * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges * only the SYN sequence, the data are retransmitted in the first ACK. * If cookie is not cached or other error occurs, falls back to send a * regular SYN with Fast Open cookie request option. */ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; struct page_frag *pfrag = sk_page_frag(sk); struct sk_buff *syn_data; int space, err = 0; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie)) goto fallback; /* MSS for SYN-data is based on cached MSS and bounded by PMTU and * user-MSS. Reserve maximum option space for middleboxes that add * private TCP options. The cost is reduced data space in SYN :( */ tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp); /* Sync mss_cache after updating the mss_clamp */ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; space = min_t(size_t, space, fo->size); if (space && !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE), pfrag, sk->sk_allocation)) goto fallback; syn_data = tcp_stream_alloc_skb(sk, sk->sk_allocation, false); if (!syn_data) goto fallback; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); if (space) { space = min_t(size_t, space, pfrag->size - pfrag->offset); space = tcp_wmem_schedule(sk, space); } if (space) { space = copy_page_from_iter(pfrag->page, pfrag->offset, space, &fo->data->msg_iter); if (unlikely(!space)) { tcp_skb_tsorted_anchor_cleanup(syn_data); kfree_skb(syn_data); goto fallback; } skb_fill_page_desc(syn_data, 0, pfrag->page, pfrag->offset, space); page_ref_inc(pfrag->page); pfrag->offset += space; skb_len_add(syn_data, space); skb_zcopy_set(syn_data, fo->uarg, NULL); } /* No more data pending in inet_wait_for_connect() */ if (space == fo->size) fo->data = NULL; fo->copied = space; tcp_connect_queue_skb(sk, syn_data); if (syn_data->len) tcp_chrono_start(sk, TCP_CHRONO_BUSY); err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC); /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) * we keep in write queue in case of a retransmit, as we * also have the SYN packet (with no data) in the same queue. */ TCP_SKB_CB(syn_data)->seq++; TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } /* data was not sent, put it in write_queue */ __skb_queue_tail(&sk->sk_write_queue, syn_data); tp->packets_out -= tcp_skb_pcount(syn_data); fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) fo->cookie.len = 0; err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); if (err) tp->syn_fastopen = 0; done: fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ return err; } /* Build a SYN and send it off. */ int tcp_connect(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int err; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL); #if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO) /* Has to be checked late, after setting daddr/saddr/ops. * Return error if the peer has both a md5 and a tcp-ao key * configured as this is ambiguous. */ if (unlikely(rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)))) { bool needs_ao = !!tp->af_specific->ao_lookup(sk, sk, -1, -1); bool needs_md5 = !!tp->af_specific->md5_lookup(sk, sk); struct tcp_ao_info *ao_info; ao_info = rcu_dereference_check(tp->ao_info, lockdep_sock_is_held(sk)); if (ao_info) { /* This is an extra check: tcp_ao_required() in * tcp_v{4,6}_parse_md5_keys() should prevent adding * md5 keys on ao_required socket. */ needs_ao |= ao_info->ao_required; WARN_ON_ONCE(ao_info->ao_required && needs_md5); } if (needs_md5 && needs_ao) return -EKEYREJECTED; /* If we have a matching md5 key and no matching tcp-ao key * then free up ao_info if allocated. */ if (needs_md5) { tcp_ao_destroy_sock(sk, false); } else if (needs_ao) { tcp_clear_md5_list(sk); kfree(rcu_replace_pointer(tp->md5sig_info, NULL, lockdep_sock_is_held(sk))); } } #endif #ifdef CONFIG_TCP_AO if (unlikely(rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held(sk)))) { /* Don't allow connecting if ao is configured but no * matching key is found. */ if (!tp->af_specific->ao_lookup(sk, sk, -1, -1)) return -EKEYREJECTED; } #endif if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ tcp_connect_init(sk); if (unlikely(tp->repair)) { tcp_finish_connect(sk, NULL); return 0; } buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true); if (unlikely(!buff)) return -ENOBUFS; /* SYN eats a sequence byte, write_seq updated by * tcp_connect_queue_skb(). */ tcp_init_nondata_skb(buff, sk, tp->write_seq, TCPHDR_SYN); tcp_mstamp_refresh(tp); tp->retrans_stamp = tcp_time_stamp_ts(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); if (err == -ECONNREFUSED) return err; /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. */ WRITE_ONCE(tp->snd_nxt, tp->write_seq); tp->pushed_seq = tp->write_seq; buff = tcp_send_head(sk); if (unlikely(buff)) { WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq); tp->pushed_seq = TCP_SKB_CB(buff)->seq; } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, false); return 0; } EXPORT_SYMBOL(tcp_connect); u32 tcp_delack_max(const struct sock *sk) { u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1; return min(READ_ONCE(inet_csk(sk)->icsk_delack_max), delack_from_rto_min); } /* Send out a delayed ack, the caller does the policy checking * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() * for details. */ void tcp_send_delayed_ack(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int ato = icsk->icsk_ack.ato; unsigned long timeout; if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; if (inet_csk_in_pingpong_mode(sk) || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; /* Slow path, intersegment interval is "high". */ /* If some rtt estimate is known, use it to bound delayed ack. * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. */ if (tp->srtt_us) { int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), TCP_DELACK_MIN); if (rtt < max_ato) max_ato = rtt; } ato = min(ato, max_ato); } ato = min_t(u32, ato, tcp_delack_max(sk)); /* Stay within the limit we were given */ timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { /* If delack timer is about to expire, send ACK now. */ if (time_before_eq(icsk_delack_timeout(icsk), jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } if (!time_before(timeout, icsk_delack_timeout(icsk))) timeout = icsk_delack_timeout(icsk); } smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } /* This routine sends an ack and also updates the window. */ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags) { struct sk_buff *buff; /* If we have been reset, we may not send again. */ if (sk->sk_state == TCP_CLOSE) return; /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. */ buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (unlikely(!buff)) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned long delay; delay = TCP_DELACK_MAX << icsk->icsk_ack.retry; if (delay < tcp_rto_max(sk)) icsk->icsk_ack.retry++; inet_csk_schedule_ack(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; tcp_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, false); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); tcp_init_nondata_skb(buff, sk, tcp_acceptable_seq(sk), TCPHDR_ACK | flags); /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784 */ skb_set_tcp_pure_ack(buff); /* Send it off, this clears delayed acks for us. */ __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt); } EXPORT_SYMBOL_GPL(__tcp_send_ack); void tcp_send_ack(struct sock *sk) { __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt, 0); } /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. * * Question: what should we make while urgent mode? * 4.4BSD forces sending single byte of data. We cannot send * out of window data, because we have SND.NXT==SND.MAX... * * Current solution: to send TWO zero-length segments in urgent mode: * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is * out-of-date with SND.UNA-1 to probe window. */ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; /* We don't queue it, tcp_transmit_skb() sets ownership. */ skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (!skb) return -1; /* Reserve space for headers and set control bits. */ skb_reserve(skb, MAX_TCP_HEADER); /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just * send it. */ tcp_init_nondata_skb(skb, sk, tp->snd_una - !urgent, TCPHDR_ACK); NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } /* Called from setsockopt( ... TCP_REPAIR ) */ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; tcp_mstamp_refresh(tcp_sk(sk)); tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } /* Initiate keepalive or window probe from timer. */ int tcp_write_wakeup(struct sock *sk, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; if (sk->sk_state == TCP_CLOSE) return -1; skb = tcp_send_head(sk); if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { int err; unsigned int mss = tcp_current_mss(sk); unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; /* We are probing the opening of a window * but the window size is != 0 * must have been a result SWS avoidance ( sender ) */ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) tcp_event_new_data_sent(sk, skb); return err; } else { if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) tcp_xmit_probe_skb(sk, 1, mib); return tcp_xmit_probe_skb(sk, 0, mib); } } /* A window probe timeout has occurred. If window is not closed send * a partial packet else a zero probe. */ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); unsigned long timeout; int err; err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. */ WRITE_ONCE(icsk->icsk_probes_out, 0); icsk->icsk_backoff = 0; icsk->icsk_probes_tstamp = 0; return; } WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1); if (err <= 0) { if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; timeout = tcp_probe0_when(sk, tcp_rto_max(sk)); } else { /* If packet was not sent due to local congestion, * Let senders fight for local resources conservatively. */ timeout = TCP_RESOURCE_PROBE_INTERVAL; } timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, true); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) { const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; struct flowi fl; int res; /* Paired with WRITE_ONCE() in sock_setsockopt() */ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash()); res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, NULL); if (!res) { TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); if (unlikely(tcp_passive_fastopen(sk))) { /* sk has const attribute because listeners are lockless. * However in this case, we are dealing with a passive fastopen * socket thus we can change total_retrans value. */ tcp_sk_rw(sk)->total_retrans++; } trace_tcp_retransmit_synack(sk, req); WRITE_ONCE(req->num_retrans, req->num_retrans + 1); } return res; } EXPORT_IPV6_MOD(tcp_rtx_synack);
132 70 109 6 99 340 340 677 662 39 676 676 661 40 605 162 621 65 106 83 83 201 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 // SPDX-License-Identifier: GPL-2.0-or-later /* * fs/inotify_user.c - inotify support for userspace * * Authors: * John McCutchan <ttb@tentacle.dhs.org> * Robert Love <rml@novell.com> * * Copyright (C) 2005 John McCutchan * Copyright 2006 Hewlett-Packard Development Company, L.P. * * Copyright (C) 2009 Eric Paris <Red Hat Inc> * inotify was largely rewritten to make use of the fsnotify infrastructure */ #include <linux/dcache.h> /* d_unlinked */ #include <linux/fs.h> /* struct inode */ #include <linux/fsnotify_backend.h> #include <linux/inotify.h> #include <linux/path.h> /* struct path */ #include <linux/slab.h> /* kmem_* */ #include <linux/types.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/sched/mm.h> #include "inotify.h" /* * Check if 2 events contain the same information. */ static bool event_compare(struct fsnotify_event *old_fsn, struct fsnotify_event *new_fsn) { struct inotify_event_info *old, *new; old = INOTIFY_E(old_fsn); new = INOTIFY_E(new_fsn); if (old->mask & FS_IN_IGNORED) return false; if ((old->mask == new->mask) && (old->wd == new->wd) && (old->name_len == new->name_len) && (!old->name_len || !strcmp(old->name, new->name))) return true; return false; } static int inotify_merge(struct fsnotify_group *group, struct fsnotify_event *event) { struct list_head *list = &group->notification_list; struct fsnotify_event *last_event; last_event = list_entry(list->prev, struct fsnotify_event, list); return event_compare(last_event, event); } int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *name, u32 cookie) { struct inotify_inode_mark *i_mark; struct inotify_event_info *event; struct fsnotify_event *fsn_event; struct fsnotify_group *group = inode_mark->group; int ret; int len = 0, wd; int alloc_len = sizeof(struct inotify_event_info); struct mem_cgroup *old_memcg; if (name) { len = name->len; alloc_len += len + 1; } pr_debug("%s: group=%p mark=%p mask=%x\n", __func__, group, inode_mark, mask); i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); /* * We can be racing with mark being detached. Don't report event with * invalid wd. */ wd = READ_ONCE(i_mark->wd); if (wd == -1) return 0; /* * Whoever is interested in the event, pays for the allocation. Do not * trigger OOM killer in the target monitoring memcg as it may have * security repercussion. */ old_memcg = set_active_memcg(group->memcg); event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); set_active_memcg(old_memcg); if (unlikely(!event)) { /* * Treat lost event due to ENOMEM the same way as queue * overflow to let userspace know event was lost. */ fsnotify_queue_overflow(group); return -ENOMEM; } /* * We now report FS_ISDIR flag with MOVE_SELF and DELETE_SELF events * for fanotify. inotify never reported IN_ISDIR with those events. * It looks like an oversight, but to avoid the risk of breaking * existing inotify programs, mask the flag out from those events. */ if (mask & (IN_MOVE_SELF | IN_DELETE_SELF)) mask &= ~IN_ISDIR; fsn_event = &event->fse; fsnotify_init_event(fsn_event); event->mask = mask; event->wd = wd; event->sync_cookie = cookie; event->name_len = len; if (len) strscpy(event->name, name->name, event->name_len + 1); ret = fsnotify_add_event(group, fsn_event, inotify_merge); if (ret) { /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); } if (inode_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT) fsnotify_destroy_mark(inode_mark, group); return 0; } static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { inotify_ignored_and_remove_idr(fsn_mark, group); } /* * This is NEVER supposed to be called. Inotify marks should either have been * removed from the idr when the watch was removed or in the * fsnotify_destroy_mark_by_group() call when the inotify instance was being * torn down. This is only called if the idr is about to be freed but there * are still marks in it. */ static int idr_callback(int id, void *p, void *data) { struct fsnotify_mark *fsn_mark; struct inotify_inode_mark *i_mark; static bool warned = false; if (warned) return 0; warned = true; fsn_mark = p; i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in " "idr. Probably leaking memory\n", id, p, data); /* * I'm taking the liberty of assuming that the mark in question is a * valid address and I'm dereferencing it. This might help to figure * out why we got here and the panic is no worse than the original * BUG() that was here. */ if (fsn_mark) printk(KERN_WARNING "fsn_mark->group=%p wd=%d\n", fsn_mark->group, i_mark->wd); return 0; } static void inotify_free_group_priv(struct fsnotify_group *group) { /* ideally the idr is empty and we won't hit the BUG in the callback */ idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_destroy(&group->inotify_data.idr); if (group->inotify_data.ucounts) dec_inotify_instances(group->inotify_data.ucounts); } static void inotify_free_event(struct fsnotify_group *group, struct fsnotify_event *fsn_event) { kfree(INOTIFY_E(fsn_event)); } /* ding dong the mark is dead */ static void inotify_free_mark(struct fsnotify_mark *fsn_mark) { struct inotify_inode_mark *i_mark; i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); kmem_cache_free(inotify_inode_mark_cachep, i_mark); } const struct fsnotify_ops inotify_fsnotify_ops = { .handle_inode_event = inotify_handle_inode_event, .free_group_priv = inotify_free_group_priv, .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, .free_mark = inotify_free_mark, };
102 2 2 31 31 29 28 12 25 25 19 12 9 1 1 12 12 2815 206 1702 160 2 2 1 1 31 2 15 3 11 15 14 15 16 12 23 8 12 23 11 12 11 15 1 2 15 1 13 11 11 10 1 33 6 24 11 7 7 5 11 15 1 1 34 34 3 2 1 1 1 2 2 2 1 1 1 2 1 1 29 1154 538 4570 4574 1702 2815 1154 1295 538 538 2018 547 2016 60 60 48 48 1 3 48 34 34 33 3 3 48 48 48 48 48 48 48 48 48 48 54 53 15 14 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Generic netlink support functions to configure an SMC-R PNET table * * Copyright IBM Corp. 2016 * * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com> */ #include <linux/module.h> #include <linux/list.h> #include <linux/ctype.h> #include <linux/mutex.h> #include <net/netlink.h> #include <net/genetlink.h> #include <uapi/linux/if.h> #include <uapi/linux/smc.h> #include <rdma/ib_verbs.h> #include <net/netns/generic.h> #include "smc_netns.h" #include "smc_pnet.h" #include "smc_ib.h" #include "smc_ism.h" #include "smc_core.h" static struct net_device *__pnet_find_base_ndev(struct net_device *ndev); static struct net_device *pnet_find_base_ndev(struct net_device *ndev); static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, .len = SMC_MAX_PNETID_LEN }, [SMC_PNETID_ETHNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [SMC_PNETID_IBNAME] = { .type = NLA_NUL_STRING, .len = IB_DEVICE_NAME_MAX - 1 }, [SMC_PNETID_IBPORT] = { .type = NLA_U8 } }; static struct genl_family smc_pnet_nl_family; enum smc_pnet_nametype { SMC_PNET_ETH = 1, SMC_PNET_IB = 2, }; /* pnet entry stored in pnet table */ struct smc_pnetentry { struct list_head list; char pnet_name[SMC_MAX_PNETID_LEN + 1]; enum smc_pnet_nametype type; union { struct { char eth_name[IFNAMSIZ + 1]; struct net_device *ndev; netdevice_tracker dev_tracker; }; struct { char ib_name[IB_DEVICE_NAME_MAX + 1]; u8 ib_port; }; }; }; /* Check if the pnetid is set */ bool smc_pnet_is_pnetid_set(u8 *pnetid) { if (pnetid[0] == 0 || pnetid[0] == _S) return false; return true; } /* Check if two given pnetids match */ static bool smc_pnet_match(u8 *pnetid1, u8 *pnetid2) { int i; for (i = 0; i < SMC_MAX_PNETID_LEN; i++) { if ((pnetid1[i] == 0 || pnetid1[i] == _S) && (pnetid2[i] == 0 || pnetid2[i] == _S)) break; if (pnetid1[i] != pnetid2[i]) return false; } return true; } /* Remove a pnetid from the pnet table. */ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct smc_ib_device *ibdev; struct smcd_dev *smcd; struct smc_net *sn; int rc = -ENOENT; int ibport; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; /* remove table entry */ mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) { netdev_put(pnetelem->ndev, &pnetelem->dev_tracker); pr_warn_ratelimited("smc: net device %s " "erased user defined " "pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); } kfree(pnetelem); rc = 0; } } mutex_unlock(&pnettable->lock); /* if this is not the initial namespace, stop here */ if (net != &init_net) return rc; /* remove ib devices */ mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { if (ibdev->pnetid_by_user[ibport] && (!pnet_name || smc_pnet_match(pnet_name, ibdev->pnetid[ibport]))) { pr_warn_ratelimited("smc: ib device %s ibport " "%d erased user defined " "pnetid %.16s\n", ibdev->ibdev->name, ibport + 1, ibdev->pnetid[ibport]); memset(ibdev->pnetid[ibport], 0, SMC_MAX_PNETID_LEN); ibdev->pnetid_by_user[ibport] = false; rc = 0; } } } mutex_unlock(&smc_ib_devices.mutex); /* remove smcd devices */ mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) { if (smcd->pnetid_by_user && (!pnet_name || smc_pnet_match(pnet_name, smcd->pnetid))) { pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " "%.16s\n", dev_name(&smcd->dibs->dev), smcd->pnetid); memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); smcd->pnetid_by_user = false; rc = 0; } } mutex_unlock(&smcd_dev_list.mutex); return rc; } /* Add the reference to a given network device to the pnet table. */ static int smc_pnet_add_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { netdev_hold(ndev, &pnetelem->dev_tracker, GFP_ATOMIC); pnetelem->ndev = ndev; rc = 0; pr_warn_ratelimited("smc: adding net device %s with " "user defined pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); break; } } mutex_unlock(&pnettable->lock); return rc; } /* Remove the reference to a given network device from the pnet table. */ static int smc_pnet_remove_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { netdev_put(pnetelem->ndev, &pnetelem->dev_tracker); pnetelem->ndev = NULL; rc = 0; pr_warn_ratelimited("smc: removing net device %s with " "user defined pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); break; } } mutex_unlock(&pnettable->lock); return rc; } /* Apply pnetid to ib device when no pnetid is set. */ static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, char *pnet_name) { bool applied = false; mutex_lock(&smc_ib_devices.mutex); if (!smc_pnet_is_pnetid_set(ib_dev->pnetid[ib_port - 1])) { memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, SMC_MAX_PNETID_LEN); ib_dev->pnetid_by_user[ib_port - 1] = true; applied = true; } mutex_unlock(&smc_ib_devices.mutex); return applied; } /* Apply pnetid to smcd device when no pnetid is set. */ static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) { bool applied = false; mutex_lock(&smcd_dev_list.mutex); if (!smc_pnet_is_pnetid_set(smcd_dev->pnetid)) { memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); smcd_dev->pnetid_by_user = true; applied = true; } mutex_unlock(&smcd_dev_list.mutex); return applied; } /* The limit for pnetid is 16 characters. * Valid characters should be (single-byte character set) a-z, A-Z, 0-9. * Lower case letters are converted to upper case. * Interior blanks should not be used. */ static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) { char *bf = skip_spaces(pnet_name); size_t len = strlen(bf); char *end = bf + len; if (!len) return false; while (--end >= bf && isspace(*end)) ; if (end - bf >= SMC_MAX_PNETID_LEN) return false; while (bf <= end) { if (!isalnum(*bf)) return false; *pnetid++ = islower(*bf) ? toupper(*bf) : *bf; bf++; } *pnetid = '\0'; return true; } /* Find an infiniband device by a given name. The device might not exist. */ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name) { struct smc_ib_device *ibdev; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (!strncmp(ibdev->ibdev->name, ib_name, sizeof(ibdev->ibdev->name)) || (ibdev->ibdev->dev.parent && !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name, IB_DEVICE_NAME_MAX - 1))) { goto out; } } ibdev = NULL; out: mutex_unlock(&smc_ib_devices.mutex); return ibdev; } /* Find an smcd device by a given name. The device might not exist. */ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) { struct smcd_dev *smcd_dev; mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { if (!strncmp(dev_name(&smcd_dev->dibs->dev), smcd_name, IB_DEVICE_NAME_MAX - 1) || (smcd_dev->dibs->dev.parent && !strncmp(dev_name(smcd_dev->dibs->dev.parent), smcd_name, IB_DEVICE_NAME_MAX - 1))) goto out; } smcd_dev = NULL; out: mutex_unlock(&smcd_dev_list.mutex); return smcd_dev; } static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, char *eth_name, char *pnet_name) { struct smc_pnetentry *tmp_pe, *new_pe; struct net_device *ndev, *base_ndev; u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; bool new_netdev; int rc; /* check if (base) netdev already has a pnetid. If there is one, we do * not want to add a pnet table entry */ rc = -EEXIST; ndev = dev_get_by_name(net, eth_name); /* dev_hold() */ if (ndev) { base_ndev = pnet_find_base_ndev(ndev); if (!smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid)) goto out_put; } /* add a new netdev entry to the pnet table if there isn't one */ rc = -ENOMEM; new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); if (!new_pe) goto out_put; new_pe->type = SMC_PNET_ETH; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); strscpy(new_pe->eth_name, eth_name); rc = -EEXIST; new_netdev = true; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_ETH && !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { new_netdev = false; break; } } if (new_netdev) { if (ndev) { new_pe->ndev = ndev; netdev_tracker_alloc(ndev, &new_pe->dev_tracker, GFP_ATOMIC); } list_add_tail(&new_pe->list, &pnettable->pnetlist); mutex_unlock(&pnettable->lock); } else { mutex_unlock(&pnettable->lock); kfree(new_pe); goto out_put; } if (ndev) pr_warn_ratelimited("smc: net device %s " "applied user defined pnetid %.16s\n", new_pe->eth_name, new_pe->pnet_name); return 0; out_put: dev_put(ndev); return rc; } static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, u8 ib_port, char *pnet_name) { struct smc_pnetentry *tmp_pe, *new_pe; struct smc_ib_device *ib_dev; bool smcddev_applied = true; bool ibdev_applied = true; struct smcd_dev *smcd; bool new_ibdev; /* try to apply the pnetid to active devices */ ib_dev = smc_pnet_find_ib(ib_name); if (ib_dev) { ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); if (ibdev_applied) pr_warn_ratelimited("smc: ib device %s ibport %d " "applied user defined pnetid " "%.16s\n", ib_dev->ibdev->name, ib_port, ib_dev->pnetid[ib_port - 1]); } smcd = smc_pnet_find_smcd(ib_name); if (smcd) { smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); if (smcddev_applied) { pr_warn_ratelimited("smc: smcd device %s applied user defined pnetid %.16s\n", dev_name(&smcd->dibs->dev), smcd->pnetid); } } /* Apply fails when a device has a hardware-defined pnetid set, do not * add a pnet table entry in that case. */ if (!ibdev_applied || !smcddev_applied) return -EEXIST; /* add a new ib entry to the pnet table if there isn't one */ new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); if (!new_pe) return -ENOMEM; new_pe->type = SMC_PNET_IB; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); strscpy(new_pe->ib_name, ib_name); new_pe->ib_port = ib_port; new_ibdev = true; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { new_ibdev = false; break; } } if (new_ibdev) { list_add_tail(&new_pe->list, &pnettable->pnetlist); mutex_unlock(&pnettable->lock); } else { mutex_unlock(&pnettable->lock); kfree(new_pe); } return (new_ibdev) ? 0 : -EEXIST; } /* Append a pnetid to the end of the pnet table if not already on this list. */ static int smc_pnet_enter(struct net *net, struct nlattr *tb[]) { char pnet_name[SMC_MAX_PNETID_LEN + 1]; struct smc_pnettable *pnettable; bool new_netdev = false; bool new_ibdev = false; struct smc_net *sn; u8 ibport = 1; char *string; int rc; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; rc = -EINVAL; if (!tb[SMC_PNETID_NAME]) goto error; string = (char *)nla_data(tb[SMC_PNETID_NAME]); if (!smc_pnetid_valid(string, pnet_name)) goto error; if (tb[SMC_PNETID_ETHNAME]) { string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); rc = smc_pnet_add_eth(pnettable, net, string, pnet_name); if (!rc) new_netdev = true; else if (rc != -EEXIST) goto error; } /* if this is not the initial namespace, stop here */ if (net != &init_net) return new_netdev ? 0 : -EEXIST; rc = -EINVAL; if (tb[SMC_PNETID_IBNAME]) { string = (char *)nla_data(tb[SMC_PNETID_IBNAME]); string = strim(string); if (tb[SMC_PNETID_IBPORT]) { ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]); if (ibport < 1 || ibport > SMC_MAX_PORTS) goto error; } rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name); if (!rc) new_ibdev = true; else if (rc != -EEXIST) goto error; } return (new_netdev || new_ibdev) ? 0 : -EEXIST; error: return rc; } /* Convert an smc_pnetentry to a netlink attribute sequence */ static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem) { if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name)) return -1; if (pnetelem->type == SMC_PNET_ETH) { if (nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->eth_name)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a")) return -1; } if (pnetelem->type == SMC_PNET_IB) { if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) || nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") || nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff)) return -1; } return 0; } static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); return smc_pnet_enter(net, info->attrs); } static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); if (!info->attrs[SMC_PNETID_NAME]) return -EINVAL; return smc_pnet_remove_by_pnetid(net, (char *)nla_data(info->attrs[SMC_PNETID_NAME])); } static int smc_pnet_dump_start(struct netlink_callback *cb) { cb->args[0] = 0; return 0; } static int smc_pnet_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, u32 flags, struct smc_pnetentry *pnetelem) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family, flags, SMC_PNETID_GET); if (!hdr) return -ENOMEM; if (smc_pnet_set_nla(skb, pnetelem) < 0) { genlmsg_cancel(skb, hdr); return -EMSGSIZE; } genlmsg_end(skb, hdr); return 0; } static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u8 *pnetid, int start_idx) { struct smc_pnettable *pnettable; struct smc_pnetentry *pnetelem; struct smc_net *sn; int idx = 0; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; /* dump pnettable entries */ mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; if (idx++ < start_idx) continue; /* if this is not the initial namespace, dump only netdev */ if (net != &init_net && pnetelem->type != SMC_PNET_ETH) continue; if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, pnetelem)) { --idx; break; } } mutex_unlock(&pnettable->lock); return idx; } static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int idx; idx = _smc_pnet_dump(net, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NULL, cb->args[0]); cb->args[0] = idx; return skb->len; } /* Retrieve one PNETID entry */ static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct sk_buff *msg; void *hdr; if (!info->attrs[SMC_PNETID_NAME]) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; _smc_pnet_dump(net, msg, info->snd_portid, info->snd_seq, nla_data(info->attrs[SMC_PNETID_NAME]), 0); /* finish multi part message and send it */ hdr = nlmsg_put(msg, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, NLM_F_MULTI); if (!hdr) { nlmsg_free(msg); return -EMSGSIZE; } return genlmsg_reply(msg, info); } /* Remove and delete all pnetids from pnet table. */ static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); smc_pnet_remove_by_pnetid(net, NULL); return 0; } /* SMC_PNETID generic netlink operation definition */ static const struct genl_ops smc_pnet_ops[] = { { .cmd = SMC_PNETID_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .doit = smc_pnet_get, .dumpit = smc_pnet_dump, .start = smc_pnet_dump_start }, { .cmd = SMC_PNETID_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_add }, { .cmd = SMC_PNETID_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_del }, { .cmd = SMC_PNETID_FLUSH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_flush } }; /* SMC_PNETID family definition */ static struct genl_family smc_pnet_nl_family __ro_after_init = { .hdrsize = 0, .name = SMCR_GENL_FAMILY_NAME, .version = SMCR_GENL_FAMILY_VERSION, .maxattr = SMC_PNETID_MAX, .policy = smc_pnet_policy, .netnsok = true, .module = THIS_MODULE, .ops = smc_pnet_ops, .n_ops = ARRAY_SIZE(smc_pnet_ops), .resv_start_op = SMC_PNETID_FLUSH + 1, }; bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe; bool rc = false; read_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pe, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pe->pnetid)) { rc = true; goto unlock; } } unlock: read_unlock(&sn->pnetids_ndev.lock); return rc; } static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *pi; pe = kzalloc(sizeof(*pe), GFP_KERNEL); if (!pe) return -ENOMEM; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pi, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pi->pnetid)) { refcount_inc(&pi->refcnt); kfree(pe); goto unlock; } } refcount_set(&pe->refcnt, 1); memcpy(pe->pnetid, pnetid, SMC_MAX_PNETID_LEN); list_add_tail(&pe->list, &sn->pnetids_ndev.list); unlock: write_unlock(&sn->pnetids_ndev.lock); return 0; } static void smc_pnet_remove_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *pe2; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry_safe(pe, pe2, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pe->pnetid)) { if (refcount_dec_and_test(&pe->refcnt)) { list_del(&pe->list); kfree(pe); } break; } } write_unlock(&sn->pnetids_ndev.lock); } static void smc_pnet_add_base_pnetid(struct net *net, struct net_device *dev, u8 *ndev_pnetid) { struct net_device *base_dev; base_dev = __pnet_find_base_ndev(dev); if (base_dev->flags & IFF_UP && !smc_pnetid_by_dev_port(base_dev->dev.parent, base_dev->dev_port, ndev_pnetid)) { /* add to PNETIDs list */ smc_pnet_add_pnetid(net, ndev_pnetid); } } /* create initial list of netdevice pnetids */ static void smc_pnet_create_pnetids_list(struct net *net) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct net_device *dev; /* Newly created netns do not have devices. * Do not even acquire rtnl. */ if (list_empty(&net->dev_base_head)) return; /* Note: This might not be needed, because smc_pnet_netdev_event() * is also calling smc_pnet_add_base_pnetid() when handling * NETDEV_UP event. */ rtnl_lock(); for_each_netdev(net, dev) smc_pnet_add_base_pnetid(net, dev, ndev_pnetid); rtnl_unlock(); } /* clean up list of netdevice pnetids */ static void smc_pnet_destroy_pnetids_list(struct net *net) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *temp_pe; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry_safe(pe, temp_pe, &sn->pnetids_ndev.list, list) { list_del(&pe->list); kfree(pe); } write_unlock(&sn->pnetids_ndev.lock); } static int smc_pnet_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(event_dev); u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; switch (event) { case NETDEV_REBOOT: case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_REGISTER: smc_pnet_add_by_ndev(event_dev); smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_UP: smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid); return NOTIFY_OK; case NETDEV_DOWN: event_dev = __pnet_find_base_ndev(event_dev); if (!smc_pnetid_by_dev_port(event_dev->dev.parent, event_dev->dev_port, ndev_pnetid)) { /* remove from PNETIDs list */ smc_pnet_remove_pnetid(net, ndev_pnetid); } return NOTIFY_OK; default: return NOTIFY_DONE; } } static struct notifier_block smc_netdev_notifier = { .notifier_call = smc_pnet_netdev_event }; /* init network namespace */ int smc_pnet_net_init(struct net *net) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnettable *pnettable = &sn->pnettable; struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev; INIT_LIST_HEAD(&pnettable->pnetlist); mutex_init(&pnettable->lock); INIT_LIST_HEAD(&pnetids_ndev->list); rwlock_init(&pnetids_ndev->lock); smc_pnet_create_pnetids_list(net); return 0; } int __init smc_pnet_init(void) { int rc; rc = genl_register_family(&smc_pnet_nl_family); if (rc) return rc; rc = register_netdevice_notifier(&smc_netdev_notifier); if (rc) genl_unregister_family(&smc_pnet_nl_family); return rc; } /* exit network namespace */ void smc_pnet_net_exit(struct net *net) { /* flush pnet table */ smc_pnet_remove_by_pnetid(net, NULL); smc_pnet_destroy_pnetids_list(net); } void smc_pnet_exit(void) { unregister_netdevice_notifier(&smc_netdev_notifier); genl_unregister_family(&smc_pnet_nl_family); } static struct net_device *__pnet_find_base_ndev(struct net_device *ndev) { int i, nest_lvl; ASSERT_RTNL(); nest_lvl = ndev->lower_level; for (i = 0; i < nest_lvl; i++) { struct list_head *lower = &ndev->adj_list.lower; if (list_empty(lower)) break; lower = lower->next; ndev = netdev_lower_get_next(ndev, &lower); } return ndev; } /* Determine one base device for stacked net devices. * If the lower device level contains more than one devices * (for instance with bonding slaves), just the first device * is used to reach a base device. */ static struct net_device *pnet_find_base_ndev(struct net_device *ndev) { rtnl_lock(); ndev = __pnet_find_base_ndev(ndev); rtnl_unlock(); return ndev; } static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, u8 *pnetid) { struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_pnetentry *pnetelem; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; } static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i, struct smc_init_info *ini) { if (!ini->check_smcrv2 && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL, NULL)) { ini->ib_dev = ibdev; ini->ib_port = i; return 0; } if (ini->check_smcrv2 && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->smcrv2.ib_gid_v2, NULL, &ini->smcrv2)) { ini->smcrv2.ib_dev_v2 = ibdev; ini->smcrv2.ib_port_v2 = i; return 0; } return -ENODEV; } /* find a roce device for the given pnetid */ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, struct smc_init_info *ini, struct smc_ib_device *known_dev, struct net *net) { struct smc_ib_device *ibdev; int i; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (ibdev == known_dev || !rdma_dev_access_netns(ibdev->ibdev, net)) continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && smc_ib_port_active(ibdev, i) && !test_bit(i - 1, ibdev->ports_going_away)) { if (!smc_pnet_determine_gid(ibdev, i, ini)) goto out; } } } out: mutex_unlock(&smc_ib_devices.mutex); } /* find alternate roce device with same pnet_id, vlan_id and net namespace */ void smc_pnet_find_alt_roce(struct smc_link_group *lgr, struct smc_init_info *ini, struct smc_ib_device *known_dev) { struct net *net = lgr->net; _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, net); } /* if handshake network device belongs to a roce device, return its * IB device and port */ static void smc_pnet_find_rdma_dev(struct net_device *netdev, struct smc_init_info *ini) { struct net *net = dev_net(netdev); struct smc_ib_device *ibdev; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { struct net_device *ndev; int i; /* check rdma net namespace */ if (!rdma_dev_access_netns(ibdev->ibdev, net)) continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; ndev = ib_device_get_netdev(ibdev->ibdev, i); if (!ndev) continue; dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && !test_bit(i - 1, ibdev->ports_going_away)) { if (!smc_pnet_determine_gid(ibdev, i, ini)) break; } } } mutex_unlock(&smc_ib_devices.mutex); } /* Determine the corresponding IB device port based on the hardware PNETID. * Searching stops at the first matching active IB device port with vlan_id * configured. * If nothing found, check pnetid table. * If nothing found, try to use handshake device */ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct net_device *base_ndev; struct net *net; base_ndev = pnet_find_base_ndev(ndev); net = dev_net(ndev); if (smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(base_ndev, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { smc_pnet_find_rdma_dev(base_ndev, ini); return; /* pnetid could not be determined */ } _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smcd_dev *ismdev; ndev = pnet_find_base_ndev(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) return; /* pnetid could not be determined */ mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && !ismdev->going_away && (!ini->ism_peer_gid[0].gid || !smc_ism_cantalk(&ini->ism_peer_gid[0], ini->vlan_id, ismdev))) { ini->ism_dev[0] = ismdev; break; } } mutex_unlock(&smcd_dev_list.mutex); } /* PNET table analysis for a given sock: * determine ib_device and port belonging to used internal TCP socket * ethernet interface. */ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { struct net_device *dev; struct dst_entry *dst; rcu_read_lock(); dst = __sk_dst_get(sk); dev = dst ? dst_dev_rcu(dst) : NULL; dev_hold(dev); rcu_read_unlock(); if (dev) { smc_pnet_find_roce_by_pnetid(dev, ini); dev_put(dev); } } void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { struct net_device *dev; struct dst_entry *dst; ini->ism_dev[0] = NULL; rcu_read_lock(); dst = __sk_dst_get(sk); dev = dst ? dst_dev_rcu(dst) : NULL; dev_hold(dev); rcu_read_unlock(); if (dev) { smc_pnet_find_ism_by_pnetid(dev, ini); dev_put(dev); } } /* Lookup and apply a pnet table entry to the given ib device. */ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) { char *ib_name = smcibdev->ibdev->name; struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for init namespace */ sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && tmp_pe->ib_port == ib_port) { smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; } /* Lookup and apply a pnet table entry to the given smcd device. */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for init namespace */ sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && (!strncmp(tmp_pe->ib_name, dev_name(&smcddev->dibs->dev), sizeof(tmp_pe->ib_name)) || (smcddev->dibs->dev.parent && !strncmp(tmp_pe->ib_name, dev_name(smcddev->dibs->dev.parent), sizeof(tmp_pe->ib_name))))) { smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; }
1716 1717 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */ #include <linux/jump_label.h> #include <linux/uaccess.h> #include <linux/export.h> #include <linux/instrumented.h> #include <linux/string.h> #include <linux/types.h> #include <asm/mce.h> #ifdef CONFIG_X86_MCE static DEFINE_STATIC_KEY_FALSE(copy_mc_fragile_key); void enable_copy_mc_fragile(void) { static_branch_inc(&copy_mc_fragile_key); } #define copy_mc_fragile_enabled (static_branch_unlikely(&copy_mc_fragile_key)) /* * Similar to copy_user_handle_tail, probe for the write fault point, or * source exception point. */ __visible notrace unsigned long copy_mc_fragile_handle_tail(char *to, char *from, unsigned len) { for (; len; --len, to++, from++) if (copy_mc_fragile(to, from, 1)) break; return len; } #else /* * No point in doing careful copying, or consulting a static key when * there is no #MC handler in the CONFIG_X86_MCE=n case. */ void enable_copy_mc_fragile(void) { } #define copy_mc_fragile_enabled (0) #endif unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned len); /** * copy_mc_to_kernel - memory copy that handles source exceptions * * @dst: destination address * @src: source address * @len: number of bytes to copy * * Call into the 'fragile' version on systems that benefit from avoiding * corner case poison consumption scenarios, For example, accessing * poison across 2 cachelines with a single instruction. Almost all * other uses case can use copy_mc_enhanced_fast_string() for a fast * recoverable copy, or fallback to plain memcpy. * * Return 0 for success, or number of bytes not copied if there was an * exception. */ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigned len) { unsigned long ret; if (copy_mc_fragile_enabled) { instrument_memcpy_before(dst, src, len); ret = copy_mc_fragile(dst, src, len); instrument_memcpy_after(dst, src, len, ret); return ret; } if (static_cpu_has(X86_FEATURE_ERMS)) { instrument_memcpy_before(dst, src, len); ret = copy_mc_enhanced_fast_string(dst, src, len); instrument_memcpy_after(dst, src, len, ret); return ret; } memcpy(dst, src, len); return 0; } EXPORT_SYMBOL_GPL(copy_mc_to_kernel); unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, unsigned len) { unsigned long ret; if (copy_mc_fragile_enabled) { instrument_copy_to_user(dst, src, len); __uaccess_begin(); ret = copy_mc_fragile((__force void *)dst, src, len); __uaccess_end(); return ret; } if (static_cpu_has(X86_FEATURE_ERMS)) { instrument_copy_to_user(dst, src, len); __uaccess_begin(); ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len); __uaccess_end(); return ret; } return copy_user_generic((__force void *)dst, src, len); }
3 3 2 1 2 2 2 1 1 538 538 538 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 // SPDX-License-Identifier: GPL-2.0 /* * xfrm6_policy.c: based on xfrm4_policy.c * * Authors: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * YOSHIFUJI Hideaki * Split up af-specific portion * */ #include <linux/err.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/addrconf.h> #include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/l3mdev.h> static struct dst_entry *xfrm6_dst_lookup(const struct xfrm_dst_lookup_params *params) { struct flowi6 fl6; struct dst_entry *dst; int err; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl6.flowi6_mark = params->mark; memcpy(&fl6.daddr, params->daddr, sizeof(fl6.daddr)); if (params->saddr) memcpy(&fl6.saddr, params->saddr, sizeof(fl6.saddr)); fl6.flowi4_proto = params->ipproto; fl6.uli = params->uli; dst = ip6_route_output(params->net, NULL, &fl6); err = dst->error; if (dst->error) { dst_release(dst); dst = ERR_PTR(err); } return dst; } static int xfrm6_get_saddr(xfrm_address_t *saddr, const struct xfrm_dst_lookup_params *params) { struct dst_entry *dst; struct net_device *dev; struct inet6_dev *idev; dst = xfrm6_dst_lookup(params); if (IS_ERR(dst)) return -EHOSTUNREACH; idev = ip6_dst_idev(dst); if (!idev) { dst_release(dst); return -EHOSTUNREACH; } dev = idev->dev; ipv6_dev_get_saddr(dev_net(dev), dev, &params->daddr->in6, 0, &saddr->in6); dst_release(dst); return 0; } static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { struct rt6_info *rt = dst_rt6_info(xdst->route); xdst->u.dst.dev = dev; netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC); xdst->u.rt6.rt6i_idev = in6_dev_get(dev); if (!xdst->u.rt6.rt6i_idev) { netdev_put(dev, &xdst->u.dst.dev_tracker); return -ENODEV; } /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | RTF_LOCAL); xdst->route_cookie = rt6_get_cookie(rt); xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; xdst->u.rt6.rt6i_src = rt->rt6i_src; rt6_uncached_list_add(&xdst->u.rt6); return 0; } static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh); } static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->redirect(path, sk, skb); } static void xfrm6_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; dst_destroy_metrics_generic(dst); rt6_uncached_list_del(&xdst->u.rt6); if (likely(xdst->u.rt6.rt6i_idev)) in6_dev_put(xdst->u.rt6.rt6i_idev); xfrm_dst_destroy(xdst); } static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct xfrm_dst *xdst; xdst = (struct xfrm_dst *)dst; if (xdst->u.rt6.rt6i_idev->dev == dev) { struct inet6_dev *loopback_idev = in6_dev_get(dev_net(dev)->loopback_dev); do { in6_dev_put(xdst->u.rt6.rt6i_idev); xdst->u.rt6.rt6i_idev = loopback_idev; in6_dev_hold(loopback_idev); xdst = (struct xfrm_dst *)xfrm_dst_child(&xdst->u.dst); } while (xdst->u.dst.xfrm); __in6_dev_put(loopback_idev); } xfrm_dst_ifdown(dst, dev); } static struct dst_ops xfrm6_dst_ops_template = { .family = AF_INET6, .update_pmtu = xfrm6_update_pmtu, .redirect = xfrm6_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, .gc_thresh = 32768, }; static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .dst_ops = &xfrm6_dst_ops_template, .dst_lookup = xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, }; static int __init xfrm6_policy_init(void) { return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo, AF_INET6); } static void xfrm6_policy_fini(void) { xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo); } #ifdef CONFIG_SYSCTL static struct ctl_table xfrm6_policy_table[] = { { .procname = "xfrm6_gc_thresh", .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static int __net_init xfrm6_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = xfrm6_policy_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(xfrm6_policy_table), GFP_KERNEL); if (!table) goto err_alloc; table[0].data = &net->xfrm.xfrm6_dst_ops.gc_thresh; } hdr = register_net_sysctl_sz(net, "net/ipv6", table, ARRAY_SIZE(xfrm6_policy_table)); if (!hdr) goto err_reg; net->ipv6.sysctl.xfrm6_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit xfrm6_net_sysctl_exit(struct net *net) { const struct ctl_table *table; if (!net->ipv6.sysctl.xfrm6_hdr) return; table = net->ipv6.sysctl.xfrm6_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.xfrm6_hdr); if (!net_eq(net, &init_net)) kfree(table); } #else /* CONFIG_SYSCTL */ static inline int xfrm6_net_sysctl_init(struct net *net) { return 0; } static inline void xfrm6_net_sysctl_exit(struct net *net) { } #endif static int __net_init xfrm6_net_init(struct net *net) { int ret; memcpy(&net->xfrm.xfrm6_dst_ops, &xfrm6_dst_ops_template, sizeof(xfrm6_dst_ops_template)); ret = dst_entries_init(&net->xfrm.xfrm6_dst_ops); if (ret) return ret; ret = xfrm6_net_sysctl_init(net); if (ret) dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); return ret; } static void __net_exit xfrm6_net_exit(struct net *net) { xfrm6_net_sysctl_exit(net); dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); } static struct pernet_operations xfrm6_net_ops = { .init = xfrm6_net_init, .exit = xfrm6_net_exit, }; int __init xfrm6_init(void) { int ret; ret = xfrm6_policy_init(); if (ret) goto out; ret = xfrm6_state_init(); if (ret) goto out_policy; ret = xfrm6_protocol_init(); if (ret) goto out_state; ret = register_pernet_subsys(&xfrm6_net_ops); if (ret) goto out_protocol; ret = xfrm_nat_keepalive_init(AF_INET6); if (ret) goto out_nat_keepalive; out: return ret; out_nat_keepalive: unregister_pernet_subsys(&xfrm6_net_ops); out_protocol: xfrm6_protocol_fini(); out_state: xfrm6_state_fini(); out_policy: xfrm6_policy_fini(); goto out; } void xfrm6_fini(void) { xfrm_nat_keepalive_fini(AF_INET6); unregister_pernet_subsys(&xfrm6_net_ops); xfrm6_protocol_fini(); xfrm6_policy_fini(); xfrm6_state_fini(); }
3173 2051 7082 6698 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_USER_NAMESPACE_H #define _LINUX_USER_NAMESPACE_H #include <linux/kref.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/rculist_nulls.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/rcuref.h> #include <linux/rwsem.h> #include <linux/sysctl.h> #include <linux/err.h> #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ union { struct { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; u32 nr_extents; }; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; #define USERNS_SETGROUPS_ALLOWED 1UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED struct ucounts; enum ucount_type { UCOUNT_USER_NAMESPACES, UCOUNT_PID_NAMESPACES, UCOUNT_UTS_NAMESPACES, UCOUNT_IPC_NAMESPACES, UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, #endif #ifdef CONFIG_FANOTIFY UCOUNT_FANOTIFY_GROUPS, UCOUNT_FANOTIFY_MARKS, #endif UCOUNT_COUNTS, }; enum rlimit_type { UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_RLIMIT_SIGPENDING, UCOUNT_RLIMIT_MEMLOCK, UCOUNT_RLIMIT_COUNTS, }; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc; #endif struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; struct uid_gid_map projid_map; struct user_namespace *parent; int level; kuid_t owner; kgid_t group; struct ns_common ns; unsigned long flags; /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP * in its effective capability set at the child ns creation time. */ bool parent_could_setfcap; #ifdef CONFIG_KEYS /* List of joinable keyrings in this namespace. Modification access of * these pointers is controlled by keyring_sem. Once * user_keyring_register is set, it won't be changed, so it can be * accessed directly with READ_ONCE(). */ struct list_head keyring_name_list; struct key *user_keyring_register; struct rw_semaphore keyring_sem; #endif /* Register of per-UID persistent keyrings for this namespace */ #ifdef CONFIG_PERSISTENT_KEYRINGS struct key *persistent_keyring_register; #endif struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #endif struct ucounts *ucounts; long ucount_max[UCOUNT_COUNTS]; long rlimit_max[UCOUNT_RLIMIT_COUNTS]; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc *binfmt_misc; #endif } __randomize_layout; struct ucounts { struct hlist_nulls_node node; struct user_namespace *ns; kuid_t uid; struct rcu_head rcu; rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; extern struct user_namespace init_user_ns; extern struct ucounts init_ucounts; bool setup_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); void put_ucounts(struct ucounts *ucounts); static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) { if (rcuref_get(&ucounts->count)) return ucounts; return NULL; } static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { return atomic_long_read(&ucounts->rlimit[type]); } long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, bool override_rlimit); void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type) { return READ_ONCE(ns->rlimit_max[type]); } static inline void set_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type, unsigned long max) { ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX; } static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) ns_ref_inc(ns); return ns; } extern int create_user_ns(struct cred *new); extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { if (ns && ns_ref_put(ns)) __put_user_ns(ns); } struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); extern bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child); extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; } static inline int create_user_ns(struct cred *new) { return -EINVAL; } static inline int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { if (unshare_flags & CLONE_NEWUSER) return -EINVAL; return 0; } static inline void put_user_ns(struct user_namespace *ns) { } static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } static inline bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { return true; } static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; } static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } #endif #endif /* _LINUX_USER_H */
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* Copyright (c) 2019 Facebook */ #ifdef __KERNEL__ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/string.h> #include <linux/bpf_verifier.h> #include "relo_core.h" static const char *btf_kind_str(const struct btf_type *t) { return btf_type_str(t); } static bool is_ldimm64_insn(struct bpf_insn *insn) { return insn->code == (BPF_LD | BPF_IMM | BPF_DW); } static const struct btf_type * skip_mods_and_typedefs(const struct btf *btf, u32 id, u32 *res_id) { return btf_type_skip_modifiers(btf, id, res_id); } static const char *btf__name_by_offset(const struct btf *btf, u32 offset) { return btf_name_by_offset(btf, offset); } static s64 btf__resolve_size(const struct btf *btf, u32 type_id) { const struct btf_type *t; int size; t = btf_type_by_id(btf, type_id); t = btf_resolve_size(btf, t, &size); if (IS_ERR(t)) return PTR_ERR(t); return size; } enum libbpf_print_level { LIBBPF_WARN, LIBBPF_INFO, LIBBPF_DEBUG, }; #undef pr_warn #undef pr_info #undef pr_debug #define pr_warn(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) #define pr_info(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) #define pr_debug(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) #define libbpf_print(level, fmt, ...) bpf_log((void *)prog_name, fmt, ##__VA_ARGS__) #else #include <stdio.h> #include <string.h> #include <errno.h> #include <ctype.h> #include <linux/err.h> #include "libbpf.h" #include "bpf.h" #include "btf.h" #include "libbpf_internal.h" #endif static bool is_flex_arr(const struct btf *btf, const struct bpf_core_accessor *acc, const struct btf_array *arr) { const struct btf_type *t; /* not a flexible array, if not inside a struct or has non-zero size */ if (!acc->name || arr->nelems > 0) return false; /* has to be the last member of enclosing struct */ t = btf_type_by_id(btf, acc->type_id); return acc->idx == btf_vlen(t) - 1; } static const char *core_relo_kind_str(enum bpf_core_relo_kind kind) { switch (kind) { case BPF_CORE_FIELD_BYTE_OFFSET: return "byte_off"; case BPF_CORE_FIELD_BYTE_SIZE: return "byte_sz"; case BPF_CORE_FIELD_EXISTS: return "field_exists"; case BPF_CORE_FIELD_SIGNED: return "signed"; case BPF_CORE_FIELD_LSHIFT_U64: return "lshift_u64"; case BPF_CORE_FIELD_RSHIFT_U64: return "rshift_u64"; case BPF_CORE_TYPE_ID_LOCAL: return "local_type_id"; case BPF_CORE_TYPE_ID_TARGET: return "target_type_id"; case BPF_CORE_TYPE_EXISTS: return "type_exists"; case BPF_CORE_TYPE_MATCHES: return "type_matches"; case BPF_CORE_TYPE_SIZE: return "type_size"; case BPF_CORE_ENUMVAL_EXISTS: return "enumval_exists"; case BPF_CORE_ENUMVAL_VALUE: return "enumval_value"; default: return "unknown"; } } static bool core_relo_is_field_based(enum bpf_core_relo_kind kind) { switch (kind) { case BPF_CORE_FIELD_BYTE_OFFSET: case BPF_CORE_FIELD_BYTE_SIZE: case BPF_CORE_FIELD_EXISTS: case BPF_CORE_FIELD_SIGNED: case BPF_CORE_FIELD_LSHIFT_U64: case BPF_CORE_FIELD_RSHIFT_U64: return true; default: return false; } } static bool core_relo_is_type_based(enum bpf_core_relo_kind kind) { switch (kind) { case BPF_CORE_TYPE_ID_LOCAL: case BPF_CORE_TYPE_ID_TARGET: case BPF_CORE_TYPE_EXISTS: case BPF_CORE_TYPE_MATCHES: case BPF_CORE_TYPE_SIZE: return true; default: return false; } } static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) { switch (kind) { case BPF_CORE_ENUMVAL_EXISTS: case BPF_CORE_ENUMVAL_VALUE: return true; default: return false; } } int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id, int level) { const struct btf_type *local_type, *targ_type; int depth = 32; /* max recursion depth */ /* caller made sure that names match (ignoring flavor suffix) */ local_type = btf_type_by_id(local_btf, local_id); targ_type = btf_type_by_id(targ_btf, targ_id); if (!btf_kind_core_compat(local_type, targ_type)) return 0; recur: depth--; if (depth < 0) return -EINVAL; local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); if (!local_type || !targ_type) return -EINVAL; if (!btf_kind_core_compat(local_type, targ_type)) return 0; switch (btf_kind(local_type)) { case BTF_KIND_UNKN: case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FWD: case BTF_KIND_ENUM64: return 1; case BTF_KIND_INT: /* just reject deprecated bitfield-like integers; all other * integers are by default compatible between each other */ return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; case BTF_KIND_PTR: local_id = local_type->type; targ_id = targ_type->type; goto recur; case BTF_KIND_ARRAY: local_id = btf_array(local_type)->type; targ_id = btf_array(targ_type)->type; goto recur; case BTF_KIND_FUNC_PROTO: { struct btf_param *local_p = btf_params(local_type); struct btf_param *targ_p = btf_params(targ_type); __u16 local_vlen = btf_vlen(local_type); __u16 targ_vlen = btf_vlen(targ_type); int i, err; if (local_vlen != targ_vlen) return 0; for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { if (level <= 0) return -EINVAL; skip_mods_and_typedefs(local_btf, local_p->type, &local_id); skip_mods_and_typedefs(targ_btf, targ_p->type, &targ_id); err = __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, level - 1); if (err <= 0) return err; } /* tail recurse for return type check */ skip_mods_and_typedefs(local_btf, local_type->type, &local_id); skip_mods_and_typedefs(targ_btf, targ_type->type, &targ_id); goto recur; } default: pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", btf_kind_str(local_type), local_id, targ_id); return 0; } } /* * Turn bpf_core_relo into a low- and high-level spec representation, * validating correctness along the way, as well as calculating resulting * field bit offset, specified by accessor string. Low-level spec captures * every single level of nestedness, including traversing anonymous * struct/union members. High-level one only captures semantically meaningful * "turning points": named fields and array indicies. * E.g., for this case: * * struct sample { * int __unimportant; * struct { * int __1; * int __2; * int a[7]; * }; * }; * * struct sample *s = ...; * * int x = &s->a[3]; // access string = '0:1:2:3' * * Low-level spec has 1:1 mapping with each element of access string (it's * just a parsed access string representation): [0, 1, 2, 3]. * * High-level spec will capture only 3 points: * - initial zero-index access by pointer (&s->... is the same as &s[0]...); * - field 'a' access (corresponds to '2' in low-level spec); * - array element #3 access (corresponds to '3' in low-level spec). * * Type-based relocations (TYPE_EXISTS/TYPE_MATCHES/TYPE_SIZE, * TYPE_ID_LOCAL/TYPE_ID_TARGET) don't capture any field information. Their * spec and raw_spec are kept empty. * * Enum value-based relocations (ENUMVAL_EXISTS/ENUMVAL_VALUE) use access * string to specify enumerator's value index that need to be relocated. */ int bpf_core_parse_spec(const char *prog_name, const struct btf *btf, const struct bpf_core_relo *relo, struct bpf_core_spec *spec) { int access_idx, parsed_len, i; struct bpf_core_accessor *acc; const struct btf_type *t; const char *name, *spec_str; __u32 id, name_off; __s64 sz; spec_str = btf__name_by_offset(btf, relo->access_str_off); if (str_is_empty(spec_str) || *spec_str == ':') return -EINVAL; memset(spec, 0, sizeof(*spec)); spec->btf = btf; spec->root_type_id = relo->type_id; spec->relo_kind = relo->kind; /* type-based relocations don't have a field access string */ if (core_relo_is_type_based(relo->kind)) { if (strcmp(spec_str, "0")) return -EINVAL; return 0; } /* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */ while (*spec_str) { if (*spec_str == ':') ++spec_str; if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1) return -EINVAL; if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) return -E2BIG; spec_str += parsed_len; spec->raw_spec[spec->raw_len++] = access_idx; } if (spec->raw_len == 0) return -EINVAL; t = skip_mods_and_typedefs(btf, relo->type_id, &id); if (!t) return -EINVAL; access_idx = spec->raw_spec[0]; acc = &spec->spec[0]; acc->type_id = id; acc->idx = access_idx; spec->len++; if (core_relo_is_enumval_based(relo->kind)) { if (!btf_is_any_enum(t) || spec->raw_len > 1 || access_idx >= btf_vlen(t)) return -EINVAL; /* record enumerator name in a first accessor */ name_off = btf_is_enum(t) ? btf_enum(t)[access_idx].name_off : btf_enum64(t)[access_idx].name_off; acc->name = btf__name_by_offset(btf, name_off); return 0; } if (!core_relo_is_field_based(relo->kind)) return -EINVAL; sz = btf__resolve_size(btf, id); if (sz < 0) return sz; spec->bit_offset = access_idx * sz * 8; for (i = 1; i < spec->raw_len; i++) { t = skip_mods_and_typedefs(btf, id, &id); if (!t) return -EINVAL; access_idx = spec->raw_spec[i]; acc = &spec->spec[spec->len]; if (btf_is_composite(t)) { const struct btf_member *m; __u32 bit_offset; if (access_idx >= btf_vlen(t)) return -EINVAL; bit_offset = btf_member_bit_offset(t, access_idx); spec->bit_offset += bit_offset; m = btf_members(t) + access_idx; if (m->name_off) { name = btf__name_by_offset(btf, m->name_off); if (str_is_empty(name)) return -EINVAL; acc->type_id = id; acc->idx = access_idx; acc->name = name; spec->len++; } id = m->type; } else if (btf_is_array(t)) { const struct btf_array *a = btf_array(t); bool flex; t = skip_mods_and_typedefs(btf, a->type, &id); if (!t) return -EINVAL; flex = is_flex_arr(btf, acc - 1, a); if (!flex && access_idx >= a->nelems) return -EINVAL; spec->spec[spec->len].type_id = id; spec->spec[spec->len].idx = access_idx; spec->len++; sz = btf__resolve_size(btf, id); if (sz < 0) return sz; spec->bit_offset += access_idx * sz * 8; } else { pr_warn("prog '%s': relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n", prog_name, relo->type_id, spec_str, i, id, btf_kind_str(t)); return -EINVAL; } } return 0; } /* Check two types for compatibility for the purpose of field access * relocation. const/volatile/restrict and typedefs are skipped to ensure we * are relocating semantically compatible entities: * - any two STRUCTs/UNIONs are compatible and can be mixed; * - any two FWDs are compatible, if their names match (modulo flavor suffix); * - any two PTRs are always compatible; * - for ENUMs, names should be the same (ignoring flavor suffix) or at * least one of enums should be anonymous; * - for ENUMs, check sizes, names are ignored; * - for INT, size and signedness are ignored; * - any two FLOATs are always compatible; * - for ARRAY, dimensionality is ignored, element types are checked for * compatibility recursively; * - everything else shouldn't be ever a target of relocation. * These rules are not set in stone and probably will be adjusted as we get * more experience with using BPF CO-RE relocations. */ static int bpf_core_fields_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { const struct btf_type *local_type, *targ_type; recur: local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); if (!local_type || !targ_type) return -EINVAL; if (btf_is_composite(local_type) && btf_is_composite(targ_type)) return 1; if (!btf_kind_core_compat(local_type, targ_type)) return 0; switch (btf_kind(local_type)) { case BTF_KIND_PTR: case BTF_KIND_FLOAT: return 1; case BTF_KIND_FWD: case BTF_KIND_ENUM64: case BTF_KIND_ENUM: { const char *local_name, *targ_name; size_t local_len, targ_len; local_name = btf__name_by_offset(local_btf, local_type->name_off); targ_name = btf__name_by_offset(targ_btf, targ_type->name_off); local_len = bpf_core_essential_name_len(local_name); targ_len = bpf_core_essential_name_len(targ_name); /* one of them is anonymous or both w/ same flavor-less names */ return local_len == 0 || targ_len == 0 || (local_len == targ_len && strncmp(local_name, targ_name, local_len) == 0); } case BTF_KIND_INT: /* just reject deprecated bitfield-like integers; all other * integers are by default compatible between each other */ return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; case BTF_KIND_ARRAY: local_id = btf_array(local_type)->type; targ_id = btf_array(targ_type)->type; goto recur; default: return 0; } } /* * Given single high-level named field accessor in local type, find * corresponding high-level accessor for a target type. Along the way, * maintain low-level spec for target as well. Also keep updating target * bit offset. * * Searching is performed through recursive exhaustive enumeration of all * fields of a struct/union. If there are any anonymous (embedded) * structs/unions, they are recursively searched as well. If field with * desired name is found, check compatibility between local and target types, * before returning result. * * 1 is returned, if field is found. * 0 is returned if no compatible field is found. * <0 is returned on error. */ static int bpf_core_match_member(const struct btf *local_btf, const struct bpf_core_accessor *local_acc, const struct btf *targ_btf, __u32 targ_id, struct bpf_core_spec *spec, __u32 *next_targ_id) { const struct btf_type *local_type, *targ_type; const struct btf_member *local_member, *m; const char *local_name, *targ_name; __u32 local_id; int i, n, found; targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); if (!targ_type) return -EINVAL; if (!btf_is_composite(targ_type)) return 0; local_id = local_acc->type_id; local_type = btf_type_by_id(local_btf, local_id); local_member = btf_members(local_type) + local_acc->idx; local_name = btf__name_by_offset(local_btf, local_member->name_off); n = btf_vlen(targ_type); m = btf_members(targ_type); for (i = 0; i < n; i++, m++) { __u32 bit_offset; bit_offset = btf_member_bit_offset(targ_type, i); /* too deep struct/union/array nesting */ if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) return -E2BIG; /* speculate this member will be the good one */ spec->bit_offset += bit_offset; spec->raw_spec[spec->raw_len++] = i; targ_name = btf__name_by_offset(targ_btf, m->name_off); if (str_is_empty(targ_name)) { /* embedded struct/union, we need to go deeper */ found = bpf_core_match_member(local_btf, local_acc, targ_btf, m->type, spec, next_targ_id); if (found) /* either found or error */ return found; } else if (strcmp(local_name, targ_name) == 0) { /* matching named field */ struct bpf_core_accessor *targ_acc; targ_acc = &spec->spec[spec->len++]; targ_acc->type_id = targ_id; targ_acc->idx = i; targ_acc->name = targ_name; *next_targ_id = m->type; found = bpf_core_fields_are_compat(local_btf, local_member->type, targ_btf, m->type); if (!found) spec->len--; /* pop accessor */ return found; } /* member turned out not to be what we looked for */ spec->bit_offset -= bit_offset; spec->raw_len--; } return 0; } /* * Try to match local spec to a target type and, if successful, produce full * target spec (high-level, low-level + bit offset). */ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, const struct btf *targ_btf, __u32 targ_id, struct bpf_core_spec *targ_spec) { const struct btf_type *targ_type; const struct bpf_core_accessor *local_acc; struct bpf_core_accessor *targ_acc; int i, sz, matched; __u32 name_off; memset(targ_spec, 0, sizeof(*targ_spec)); targ_spec->btf = targ_btf; targ_spec->root_type_id = targ_id; targ_spec->relo_kind = local_spec->relo_kind; if (core_relo_is_type_based(local_spec->relo_kind)) { if (local_spec->relo_kind == BPF_CORE_TYPE_MATCHES) return bpf_core_types_match(local_spec->btf, local_spec->root_type_id, targ_btf, targ_id); else return bpf_core_types_are_compat(local_spec->btf, local_spec->root_type_id, targ_btf, targ_id); } local_acc = &local_spec->spec[0]; targ_acc = &targ_spec->spec[0]; if (core_relo_is_enumval_based(local_spec->relo_kind)) { size_t local_essent_len, targ_essent_len; const char *targ_name; /* has to resolve to an enum */ targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, &targ_id); if (!btf_is_any_enum(targ_type)) return 0; local_essent_len = bpf_core_essential_name_len(local_acc->name); for (i = 0; i < btf_vlen(targ_type); i++) { if (btf_is_enum(targ_type)) name_off = btf_enum(targ_type)[i].name_off; else name_off = btf_enum64(targ_type)[i].name_off; targ_name = btf__name_by_offset(targ_spec->btf, name_off); targ_essent_len = bpf_core_essential_name_len(targ_name); if (targ_essent_len != local_essent_len) continue; if (strncmp(local_acc->name, targ_name, local_essent_len) == 0) { targ_acc->type_id = targ_id; targ_acc->idx = i; targ_acc->name = targ_name; targ_spec->len++; targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; targ_spec->raw_len++; return 1; } } return 0; } if (!core_relo_is_field_based(local_spec->relo_kind)) return -EINVAL; for (i = 0; i < local_spec->len; i++, local_acc++, targ_acc++) { targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, &targ_id); if (!targ_type) return -EINVAL; if (local_acc->name) { matched = bpf_core_match_member(local_spec->btf, local_acc, targ_btf, targ_id, targ_spec, &targ_id); if (matched <= 0) return matched; } else { /* for i=0, targ_id is already treated as array element * type (because it's the original struct), for others * we should find array element type first */ if (i > 0) { const struct btf_array *a; bool flex; if (!btf_is_array(targ_type)) return 0; a = btf_array(targ_type); flex = is_flex_arr(targ_btf, targ_acc - 1, a); if (!flex && local_acc->idx >= a->nelems) return 0; if (!skip_mods_and_typedefs(targ_btf, a->type, &targ_id)) return -EINVAL; } /* too deep struct/union/array nesting */ if (targ_spec->raw_len == BPF_CORE_SPEC_MAX_LEN) return -E2BIG; targ_acc->type_id = targ_id; targ_acc->idx = local_acc->idx; targ_acc->name = NULL; targ_spec->len++; targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; targ_spec->raw_len++; sz = btf__resolve_size(targ_btf, targ_id); if (sz < 0) return sz; targ_spec->bit_offset += local_acc->idx * sz * 8; } } return 1; } static int bpf_core_calc_field_relo(const char *prog_name, const struct bpf_core_relo *relo, const struct bpf_core_spec *spec, __u64 *val, __u32 *field_sz, __u32 *type_id, bool *validate) { const struct bpf_core_accessor *acc; const struct btf_type *t; __u32 byte_off, byte_sz, bit_off, bit_sz, field_type_id, elem_id; const struct btf_member *m; const struct btf_type *mt; bool bitfield; __s64 sz; *field_sz = 0; if (relo->kind == BPF_CORE_FIELD_EXISTS) { *val = spec ? 1 : 0; return 0; } if (!spec) return -EUCLEAN; /* request instruction poisoning */ acc = &spec->spec[spec->len - 1]; t = btf_type_by_id(spec->btf, acc->type_id); /* a[n] accessor needs special handling */ if (!acc->name) { if (relo->kind == BPF_CORE_FIELD_BYTE_OFFSET) { *val = spec->bit_offset / 8; /* remember field size for load/store mem size; * note, for arrays we care about individual element * sizes, not the overall array size */ t = skip_mods_and_typedefs(spec->btf, acc->type_id, &elem_id); while (btf_is_array(t)) t = skip_mods_and_typedefs(spec->btf, btf_array(t)->type, &elem_id); sz = btf__resolve_size(spec->btf, elem_id); if (sz < 0) return -EINVAL; *field_sz = sz; *type_id = acc->type_id; } else if (relo->kind == BPF_CORE_FIELD_BYTE_SIZE) { sz = btf__resolve_size(spec->btf, acc->type_id); if (sz < 0) return -EINVAL; *val = sz; } else { pr_warn("prog '%s': relo %d at insn #%d can't be applied to array access\n", prog_name, relo->kind, relo->insn_off / 8); return -EINVAL; } if (validate) *validate = true; return 0; } m = btf_members(t) + acc->idx; mt = skip_mods_and_typedefs(spec->btf, m->type, &field_type_id); bit_off = spec->bit_offset; bit_sz = btf_member_bitfield_size(t, acc->idx); bitfield = bit_sz > 0; if (bitfield) { byte_sz = mt->size; byte_off = bit_off / 8 / byte_sz * byte_sz; /* figure out smallest int size necessary for bitfield load */ while (bit_off + bit_sz - byte_off * 8 > byte_sz * 8) { if (byte_sz >= 8) { /* bitfield can't be read with 64-bit read */ pr_warn("prog '%s': relo %d at insn #%d can't be satisfied for bitfield\n", prog_name, relo->kind, relo->insn_off / 8); return -E2BIG; } byte_sz *= 2; byte_off = bit_off / 8 / byte_sz * byte_sz; } } else { sz = btf__resolve_size(spec->btf, field_type_id); if (sz < 0) return -EINVAL; byte_sz = sz; byte_off = spec->bit_offset / 8; bit_sz = byte_sz * 8; } /* for bitfields, all the relocatable aspects are ambiguous and we * might disagree with compiler, so turn off validation of expected * value, except for signedness */ if (validate) *validate = !bitfield; switch (relo->kind) { case BPF_CORE_FIELD_BYTE_OFFSET: *val = byte_off; if (!bitfield) { /* remember field size for load/store mem size; * note, for arrays we care about individual element * sizes, not the overall array size */ t = skip_mods_and_typedefs(spec->btf, field_type_id, &elem_id); while (btf_is_array(t)) t = skip_mods_and_typedefs(spec->btf, btf_array(t)->type, &elem_id); sz = btf__resolve_size(spec->btf, elem_id); if (sz < 0) return -EINVAL; *field_sz = sz; *type_id = field_type_id; } break; case BPF_CORE_FIELD_BYTE_SIZE: *val = byte_sz; break; case BPF_CORE_FIELD_SIGNED: *val = (btf_is_any_enum(mt) && BTF_INFO_KFLAG(mt->info)) || (btf_is_int(mt) && (btf_int_encoding(mt) & BTF_INT_SIGNED)); if (validate) *validate = true; /* signedness is never ambiguous */ break; case BPF_CORE_FIELD_LSHIFT_U64: #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ *val = 64 - (bit_off + bit_sz - byte_off * 8); #else *val = (8 - byte_sz) * 8 + (bit_off - byte_off * 8); #endif break; case BPF_CORE_FIELD_RSHIFT_U64: *val = 64 - bit_sz; if (validate) *validate = true; /* right shift is never ambiguous */ break; case BPF_CORE_FIELD_EXISTS: default: return -EOPNOTSUPP; } return 0; } static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, const struct bpf_core_spec *spec, __u64 *val, bool *validate) { __s64 sz; /* by default, always check expected value in bpf_insn */ if (validate) *validate = true; /* type-based relos return zero when target type is not found */ if (!spec) { *val = 0; return 0; } switch (relo->kind) { case BPF_CORE_TYPE_ID_TARGET: *val = spec->root_type_id; /* type ID, embedded in bpf_insn, might change during linking, * so enforcing it is pointless */ if (validate) *validate = false; break; case BPF_CORE_TYPE_EXISTS: case BPF_CORE_TYPE_MATCHES: *val = 1; break; case BPF_CORE_TYPE_SIZE: sz = btf__resolve_size(spec->btf, spec->root_type_id); if (sz < 0) return -EINVAL; *val = sz; break; case BPF_CORE_TYPE_ID_LOCAL: /* BPF_CORE_TYPE_ID_LOCAL is handled specially and shouldn't get here */ default: return -EOPNOTSUPP; } return 0; } static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, const struct bpf_core_spec *spec, __u64 *val) { const struct btf_type *t; switch (relo->kind) { case BPF_CORE_ENUMVAL_EXISTS: *val = spec ? 1 : 0; break; case BPF_CORE_ENUMVAL_VALUE: if (!spec) return -EUCLEAN; /* request instruction poisoning */ t = btf_type_by_id(spec->btf, spec->spec[0].type_id); if (btf_is_enum(t)) *val = btf_enum(t)[spec->spec[0].idx].val; else *val = btf_enum64_value(btf_enum64(t) + spec->spec[0].idx); break; default: return -EOPNOTSUPP; } return 0; } /* Calculate original and target relocation values, given local and target * specs and relocation kind. These values are calculated for each candidate. * If there are multiple candidates, resulting values should all be consistent * with each other. Otherwise, libbpf will refuse to proceed due to ambiguity. * If instruction has to be poisoned, *poison will be set to true. */ static int bpf_core_calc_relo(const char *prog_name, const struct bpf_core_relo *relo, int relo_idx, const struct bpf_core_spec *local_spec, const struct bpf_core_spec *targ_spec, struct bpf_core_relo_res *res) { int err = -EOPNOTSUPP; res->orig_val = 0; res->new_val = 0; res->poison = false; res->validate = true; res->fail_memsz_adjust = false; res->orig_sz = res->new_sz = 0; res->orig_type_id = res->new_type_id = 0; if (core_relo_is_field_based(relo->kind)) { err = bpf_core_calc_field_relo(prog_name, relo, local_spec, &res->orig_val, &res->orig_sz, &res->orig_type_id, &res->validate); err = err ?: bpf_core_calc_field_relo(prog_name, relo, targ_spec, &res->new_val, &res->new_sz, &res->new_type_id, NULL); if (err) goto done; /* Validate if it's safe to adjust load/store memory size. * Adjustments are performed only if original and new memory * sizes differ. */ res->fail_memsz_adjust = false; if (res->orig_sz != res->new_sz) { const struct btf_type *orig_t, *new_t; orig_t = btf_type_by_id(local_spec->btf, res->orig_type_id); new_t = btf_type_by_id(targ_spec->btf, res->new_type_id); /* There are two use cases in which it's safe to * adjust load/store's mem size: * - reading a 32-bit kernel pointer, while on BPF * size pointers are always 64-bit; in this case * it's safe to "downsize" instruction size due to * pointer being treated as unsigned integer with * zero-extended upper 32-bits; * - reading unsigned integers, again due to * zero-extension is preserving the value correctly. * * In all other cases it's incorrect to attempt to * load/store field because read value will be * incorrect, so we poison relocated instruction. */ if (btf_is_ptr(orig_t) && btf_is_ptr(new_t)) goto done; if (btf_is_int(orig_t) && btf_is_int(new_t) && btf_int_encoding(orig_t) != BTF_INT_SIGNED && btf_int_encoding(new_t) != BTF_INT_SIGNED) goto done; /* mark as invalid mem size adjustment, but this will * only be checked for LDX/STX/ST insns */ res->fail_memsz_adjust = true; } } else if (core_relo_is_type_based(relo->kind)) { err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val, &res->validate); err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val, NULL); } else if (core_relo_is_enumval_based(relo->kind)) { err = bpf_core_calc_enumval_relo(relo, local_spec, &res->orig_val); err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val); } done: if (err == -EUCLEAN) { /* EUCLEAN is used to signal instruction poisoning request */ res->poison = true; err = 0; } else if (err == -EOPNOTSUPP) { /* EOPNOTSUPP means unknown/unsupported relocation */ pr_warn("prog '%s': relo #%d: unrecognized CO-RE relocation %s (%d) at insn #%d\n", prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind, relo->insn_off / 8); } return err; } /* * Turn instruction for which CO_RE relocation failed into invalid one with * distinct signature. */ static void bpf_core_poison_insn(const char *prog_name, int relo_idx, int insn_idx, struct bpf_insn *insn) { pr_debug("prog '%s': relo #%d: substituting insn #%d w/ invalid insn\n", prog_name, relo_idx, insn_idx); insn->code = BPF_JMP | BPF_CALL; insn->dst_reg = 0; insn->src_reg = 0; insn->off = 0; /* if this instruction is reachable (not a dead code), * verifier will complain with the following message: * invalid func unknown#195896080 */ insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */ } static int insn_bpf_size_to_bytes(struct bpf_insn *insn) { switch (BPF_SIZE(insn->code)) { case BPF_DW: return 8; case BPF_W: return 4; case BPF_H: return 2; case BPF_B: return 1; default: return -1; } } static int insn_bytes_to_bpf_size(__u32 sz) { switch (sz) { case 8: return BPF_DW; case 4: return BPF_W; case 2: return BPF_H; case 1: return BPF_B; default: return -1; } } /* * Patch relocatable BPF instruction. * * Patched value is determined by relocation kind and target specification. * For existence relocations target spec will be NULL if field/type is not found. * Expected insn->imm value is determined using relocation kind and local * spec, and is checked before patching instruction. If actual insn->imm value * is wrong, bail out with error. * * Currently supported classes of BPF instruction are: * 1. rX = <imm> (assignment with immediate operand); * 2. rX += <imm> (arithmetic operations with immediate operand); * 3. rX = <imm64> (load with 64-bit immediate value); * 4. rX = *(T *)(rY + <off>), where T is one of {u8, u16, u32, u64}; * 5. *(T *)(rX + <off>) = rY, where T is one of {u8, u16, u32, u64}; * 6. *(T *)(rX + <off>) = <imm>, where T is one of {u8, u16, u32, u64}. */ int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, int insn_idx, const struct bpf_core_relo *relo, int relo_idx, const struct bpf_core_relo_res *res) { __u64 orig_val, new_val; __u8 class; class = BPF_CLASS(insn->code); if (res->poison) { poison: /* poison second part of ldimm64 to avoid confusing error from * verifier about "unknown opcode 00" */ if (is_ldimm64_insn(insn)) bpf_core_poison_insn(prog_name, relo_idx, insn_idx + 1, insn + 1); bpf_core_poison_insn(prog_name, relo_idx, insn_idx, insn); return 0; } orig_val = res->orig_val; new_val = res->new_val; switch (class) { case BPF_ALU: case BPF_ALU64: if (BPF_SRC(insn->code) != BPF_K) return -EINVAL; if (res->validate && insn->imm != orig_val) { pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %llu -> %llu\n", prog_name, relo_idx, insn_idx, insn->imm, (unsigned long long)orig_val, (unsigned long long)new_val); return -EINVAL; } orig_val = insn->imm; insn->imm = new_val; pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %llu -> %llu\n", prog_name, relo_idx, insn_idx, (unsigned long long)orig_val, (unsigned long long)new_val); break; case BPF_LDX: case BPF_ST: case BPF_STX: if (res->validate && insn->off != orig_val) { pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %llu -> %llu\n", prog_name, relo_idx, insn_idx, insn->off, (unsigned long long)orig_val, (unsigned long long)new_val); return -EINVAL; } if (new_val > SHRT_MAX) { pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %llu\n", prog_name, relo_idx, insn_idx, (unsigned long long)new_val); return -ERANGE; } if (res->fail_memsz_adjust) { pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. " "Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n", prog_name, relo_idx, insn_idx); goto poison; } orig_val = insn->off; insn->off = new_val; pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %llu -> %llu\n", prog_name, relo_idx, insn_idx, (unsigned long long)orig_val, (unsigned long long)new_val); if (res->new_sz != res->orig_sz) { int insn_bytes_sz, insn_bpf_sz; insn_bytes_sz = insn_bpf_size_to_bytes(insn); if (insn_bytes_sz != res->orig_sz) { pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n", prog_name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz); return -EINVAL; } insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz); if (insn_bpf_sz < 0) { pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n", prog_name, relo_idx, insn_idx, res->new_sz); return -EINVAL; } insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code); pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n", prog_name, relo_idx, insn_idx, res->orig_sz, res->new_sz); } break; case BPF_LD: { __u64 imm; if (!is_ldimm64_insn(insn) || insn[0].src_reg != 0 || insn[0].off != 0 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { pr_warn("prog '%s': relo #%d: insn #%d (LDIMM64) has unexpected form\n", prog_name, relo_idx, insn_idx); return -EINVAL; } imm = (__u32)insn[0].imm | ((__u64)insn[1].imm << 32); if (res->validate && imm != orig_val) { pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %llu -> %llu\n", prog_name, relo_idx, insn_idx, (unsigned long long)imm, (unsigned long long)orig_val, (unsigned long long)new_val); return -EINVAL; } insn[0].imm = new_val; insn[1].imm = new_val >> 32; pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %llu\n", prog_name, relo_idx, insn_idx, (unsigned long long)imm, (unsigned long long)new_val); break; } default: pr_warn("prog '%s': relo #%d: trying to relocate unrecognized insn #%d, code:0x%x, src:0x%x, dst:0x%x, off:0x%x, imm:0x%x\n", prog_name, relo_idx, insn_idx, insn->code, insn->src_reg, insn->dst_reg, insn->off, insn->imm); return -EINVAL; } return 0; } /* Output spec definition in the format: * [<type-id>] (<type-name>) + <raw-spec> => <offset>@<spec>, * where <spec> is a C-syntax view of recorded field access, e.g.: x.a[3].b */ int bpf_core_format_spec(char *buf, size_t buf_sz, const struct bpf_core_spec *spec) { const struct btf_type *t; const char *s; __u32 type_id; int i, len = 0; #define append_buf(fmt, args...) \ ({ \ int r; \ r = snprintf(buf, buf_sz, fmt, ##args); \ len += r; \ if (r >= buf_sz) \ r = buf_sz; \ buf += r; \ buf_sz -= r; \ }) type_id = spec->root_type_id; t = btf_type_by_id(spec->btf, type_id); s = btf__name_by_offset(spec->btf, t->name_off); append_buf("<%s> [%u] %s %s", core_relo_kind_str(spec->relo_kind), type_id, btf_kind_str(t), str_is_empty(s) ? "<anon>" : s); if (core_relo_is_type_based(spec->relo_kind)) return len; if (core_relo_is_enumval_based(spec->relo_kind)) { t = skip_mods_and_typedefs(spec->btf, type_id, NULL); if (btf_is_enum(t)) { const struct btf_enum *e; const char *fmt_str; e = btf_enum(t) + spec->raw_spec[0]; s = btf__name_by_offset(spec->btf, e->name_off); fmt_str = BTF_INFO_KFLAG(t->info) ? "::%s = %d" : "::%s = %u"; append_buf(fmt_str, s, e->val); } else { const struct btf_enum64 *e; const char *fmt_str; e = btf_enum64(t) + spec->raw_spec[0]; s = btf__name_by_offset(spec->btf, e->name_off); fmt_str = BTF_INFO_KFLAG(t->info) ? "::%s = %lld" : "::%s = %llu"; append_buf(fmt_str, s, (unsigned long long)btf_enum64_value(e)); } return len; } if (core_relo_is_field_based(spec->relo_kind)) { for (i = 0; i < spec->len; i++) { if (spec->spec[i].name) append_buf(".%s", spec->spec[i].name); else if (i > 0 || spec->spec[i].idx > 0) append_buf("[%u]", spec->spec[i].idx); } append_buf(" ("); for (i = 0; i < spec->raw_len; i++) append_buf("%s%d", i == 0 ? "" : ":", spec->raw_spec[i]); if (spec->bit_offset % 8) append_buf(" @ offset %u.%u)", spec->bit_offset / 8, spec->bit_offset % 8); else append_buf(" @ offset %u)", spec->bit_offset / 8); return len; } return len; #undef append_buf } /* * Calculate CO-RE relocation target result. * * The outline and important points of the algorithm: * 1. For given local type, find corresponding candidate target types. * Candidate type is a type with the same "essential" name, ignoring * everything after last triple underscore (___). E.g., `sample`, * `sample___flavor_one`, `sample___flavor_another_one`, are all candidates * for each other. Names with triple underscore are referred to as * "flavors" and are useful, among other things, to allow to * specify/support incompatible variations of the same kernel struct, which * might differ between different kernel versions and/or build * configurations. * * N.B. Struct "flavors" could be generated by bpftool's BTF-to-C * converter, when deduplicated BTF of a kernel still contains more than * one different types with the same name. In that case, ___2, ___3, etc * are appended starting from second name conflict. But start flavors are * also useful to be defined "locally", in BPF program, to extract same * data from incompatible changes between different kernel * versions/configurations. For instance, to handle field renames between * kernel versions, one can use two flavors of the struct name with the * same common name and use conditional relocations to extract that field, * depending on target kernel version. * 2. For each candidate type, try to match local specification to this * candidate target type. Matching involves finding corresponding * high-level spec accessors, meaning that all named fields should match, * as well as all array accesses should be within the actual bounds. Also, * types should be compatible (see bpf_core_fields_are_compat for details). * 3. It is supported and expected that there might be multiple flavors * matching the spec. As long as all the specs resolve to the same set of * offsets across all candidates, there is no error. If there is any * ambiguity, CO-RE relocation will fail. This is necessary to accommodate * imperfection of BTF deduplication, which can cause slight duplication of * the same BTF type, if some directly or indirectly referenced (by * pointer) type gets resolved to different actual types in different * object files. If such a situation occurs, deduplicated BTF will end up * with two (or more) structurally identical types, which differ only in * types they refer to through pointer. This should be OK in most cases and * is not an error. * 4. Candidate types search is performed by linearly scanning through all * types in target BTF. It is anticipated that this is overall more * efficient memory-wise and not significantly worse (if not better) * CPU-wise compared to prebuilding a map from all local type names to * a list of candidate type names. It's also sped up by caching resolved * list of matching candidates per each local "root" type ID, that has at * least one bpf_core_relo associated with it. This list is shared * between multiple relocations for the same type ID and is updated as some * of the candidates are pruned due to structural incompatibility. */ int bpf_core_calc_relo_insn(const char *prog_name, const struct bpf_core_relo *relo, int relo_idx, const struct btf *local_btf, struct bpf_core_cand_list *cands, struct bpf_core_spec *specs_scratch, struct bpf_core_relo_res *targ_res) { struct bpf_core_spec *local_spec = &specs_scratch[0]; struct bpf_core_spec *cand_spec = &specs_scratch[1]; struct bpf_core_spec *targ_spec = &specs_scratch[2]; struct bpf_core_relo_res cand_res; const struct btf_type *local_type; const char *local_name; __u32 local_id; char spec_buf[256]; int i, j, err; local_id = relo->type_id; local_type = btf_type_by_id(local_btf, local_id); local_name = btf__name_by_offset(local_btf, local_type->name_off); if (!local_name) return -EINVAL; err = bpf_core_parse_spec(prog_name, local_btf, relo, local_spec); if (err) { const char *spec_str; spec_str = btf__name_by_offset(local_btf, relo->access_str_off); pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", prog_name, relo_idx, local_id, btf_kind_str(local_type), str_is_empty(local_name) ? "<anon>" : local_name, spec_str ?: "<?>", err); return -EINVAL; } bpf_core_format_spec(spec_buf, sizeof(spec_buf), local_spec); pr_debug("prog '%s': relo #%d: %s\n", prog_name, relo_idx, spec_buf); /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ if (relo->kind == BPF_CORE_TYPE_ID_LOCAL) { /* bpf_insn's imm value could get out of sync during linking */ memset(targ_res, 0, sizeof(*targ_res)); targ_res->validate = false; targ_res->poison = false; targ_res->orig_val = local_spec->root_type_id; targ_res->new_val = local_spec->root_type_id; return 0; } /* libbpf doesn't support candidate search for anonymous types */ if (str_is_empty(local_name)) { pr_warn("prog '%s': relo #%d: <%s> (%d) relocation doesn't support anonymous types\n", prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); return -EOPNOTSUPP; } for (i = 0, j = 0; i < cands->len; i++) { err = bpf_core_spec_match(local_spec, cands->cands[i].btf, cands->cands[i].id, cand_spec); if (err < 0) { bpf_core_format_spec(spec_buf, sizeof(spec_buf), cand_spec); pr_warn("prog '%s': relo #%d: error matching candidate #%d %s: %d\n", prog_name, relo_idx, i, spec_buf, err); return err; } bpf_core_format_spec(spec_buf, sizeof(spec_buf), cand_spec); pr_debug("prog '%s': relo #%d: %s candidate #%d %s\n", prog_name, relo_idx, err == 0 ? "non-matching" : "matching", i, spec_buf); if (err == 0) continue; err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, cand_spec, &cand_res); if (err) return err; if (j == 0) { *targ_res = cand_res; *targ_spec = *cand_spec; } else if (cand_spec->bit_offset != targ_spec->bit_offset) { /* if there are many field relo candidates, they * should all resolve to the same bit offset */ pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n", prog_name, relo_idx, cand_spec->bit_offset, targ_spec->bit_offset); return -EINVAL; } else if (cand_res.poison != targ_res->poison || cand_res.new_val != targ_res->new_val) { /* all candidates should result in the same relocation * decision and value, otherwise it's dangerous to * proceed due to ambiguity */ pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %llu != %s %llu\n", prog_name, relo_idx, cand_res.poison ? "failure" : "success", (unsigned long long)cand_res.new_val, targ_res->poison ? "failure" : "success", (unsigned long long)targ_res->new_val); return -EINVAL; } cands->cands[j++] = cands->cands[i]; } /* * For BPF_CORE_FIELD_EXISTS relo or when used BPF program has field * existence checks or kernel version/config checks, it's expected * that we might not find any candidates. In this case, if field * wasn't found in any candidate, the list of candidates shouldn't * change at all, we'll just handle relocating appropriately, * depending on relo's kind. */ if (j > 0) cands->len = j; /* * If no candidates were found, it might be both a programmer error, * as well as expected case, depending whether instruction w/ * relocation is guarded in some way that makes it unreachable (dead * code) if relocation can't be resolved. This is handled in * bpf_core_patch_insn() uniformly by replacing that instruction with * BPF helper call insn (using invalid helper ID). If that instruction * is indeed unreachable, then it will be ignored and eliminated by * verifier. If it was an error, then verifier will complain and point * to a specific instruction number in its log. */ if (j == 0) { pr_debug("prog '%s': relo #%d: no matching targets found\n", prog_name, relo_idx); /* calculate single target relo result explicitly */ err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, NULL, targ_res); if (err) return err; } return 0; } static bool bpf_core_names_match(const struct btf *local_btf, size_t local_name_off, const struct btf *targ_btf, size_t targ_name_off) { const char *local_n, *targ_n; size_t local_len, targ_len; local_n = btf__name_by_offset(local_btf, local_name_off); targ_n = btf__name_by_offset(targ_btf, targ_name_off); if (str_is_empty(targ_n)) return str_is_empty(local_n); targ_len = bpf_core_essential_name_len(targ_n); local_len = bpf_core_essential_name_len(local_n); return targ_len == local_len && strncmp(local_n, targ_n, local_len) == 0; } static int bpf_core_enums_match(const struct btf *local_btf, const struct btf_type *local_t, const struct btf *targ_btf, const struct btf_type *targ_t) { __u16 local_vlen = btf_vlen(local_t); __u16 targ_vlen = btf_vlen(targ_t); int i, j; if (local_t->size != targ_t->size) return 0; if (local_vlen > targ_vlen) return 0; /* iterate over the local enum's variants and make sure each has * a symbolic name correspondent in the target */ for (i = 0; i < local_vlen; i++) { bool matched = false; __u32 local_n_off, targ_n_off; local_n_off = btf_is_enum(local_t) ? btf_enum(local_t)[i].name_off : btf_enum64(local_t)[i].name_off; for (j = 0; j < targ_vlen; j++) { targ_n_off = btf_is_enum(targ_t) ? btf_enum(targ_t)[j].name_off : btf_enum64(targ_t)[j].name_off; if (bpf_core_names_match(local_btf, local_n_off, targ_btf, targ_n_off)) { matched = true; break; } } if (!matched) return 0; } return 1; } static int bpf_core_composites_match(const struct btf *local_btf, const struct btf_type *local_t, const struct btf *targ_btf, const struct btf_type *targ_t, bool behind_ptr, int level) { const struct btf_member *local_m = btf_members(local_t); __u16 local_vlen = btf_vlen(local_t); __u16 targ_vlen = btf_vlen(targ_t); int i, j, err; if (local_vlen > targ_vlen) return 0; /* check that all local members have a match in the target */ for (i = 0; i < local_vlen; i++, local_m++) { const struct btf_member *targ_m = btf_members(targ_t); bool matched = false; for (j = 0; j < targ_vlen; j++, targ_m++) { if (!bpf_core_names_match(local_btf, local_m->name_off, targ_btf, targ_m->name_off)) continue; err = __bpf_core_types_match(local_btf, local_m->type, targ_btf, targ_m->type, behind_ptr, level - 1); if (err < 0) return err; if (err > 0) { matched = true; break; } } if (!matched) return 0; } return 1; } /* Check that two types "match". This function assumes that root types were * already checked for name match. * * The matching relation is defined as follows: * - modifiers and typedefs are stripped (and, hence, effectively ignored) * - generally speaking types need to be of same kind (struct vs. struct, union * vs. union, etc.) * - exceptions are struct/union behind a pointer which could also match a * forward declaration of a struct or union, respectively, and enum vs. * enum64 (see below) * Then, depending on type: * - integers: * - match if size and signedness match * - arrays & pointers: * - target types are recursively matched * - structs & unions: * - local members need to exist in target with the same name * - for each member we recursively check match unless it is already behind a * pointer, in which case we only check matching names and compatible kind * - enums: * - local variants have to have a match in target by symbolic name (but not * numeric value) * - size has to match (but enum may match enum64 and vice versa) * - function pointers: * - number and position of arguments in local type has to match target * - for each argument and the return value we recursively check match */ int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id, bool behind_ptr, int level) { const struct btf_type *local_t, *targ_t; int depth = 32; /* max recursion depth */ __u16 local_k, targ_k; if (level <= 0) return -EINVAL; recur: depth--; if (depth < 0) return -EINVAL; local_t = skip_mods_and_typedefs(local_btf, local_id, &local_id); targ_t = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); if (!local_t || !targ_t) return -EINVAL; /* While the name check happens after typedefs are skipped, root-level * typedefs would still be name-matched as that's the contract with * callers. */ if (!bpf_core_names_match(local_btf, local_t->name_off, targ_btf, targ_t->name_off)) return 0; local_k = btf_kind(local_t); targ_k = btf_kind(targ_t); switch (local_k) { case BTF_KIND_UNKN: return local_k == targ_k; case BTF_KIND_FWD: { bool local_f = BTF_INFO_KFLAG(local_t->info); if (behind_ptr) { if (local_k == targ_k) return local_f == BTF_INFO_KFLAG(targ_t->info); /* for forward declarations kflag dictates whether the * target is a struct (0) or union (1) */ return (targ_k == BTF_KIND_STRUCT && !local_f) || (targ_k == BTF_KIND_UNION && local_f); } else { if (local_k != targ_k) return 0; /* match if the forward declaration is for the same kind */ return local_f == BTF_INFO_KFLAG(targ_t->info); } } case BTF_KIND_ENUM: case BTF_KIND_ENUM64: if (!btf_is_any_enum(targ_t)) return 0; return bpf_core_enums_match(local_btf, local_t, targ_btf, targ_t); case BTF_KIND_STRUCT: case BTF_KIND_UNION: if (behind_ptr) { bool targ_f = BTF_INFO_KFLAG(targ_t->info); if (local_k == targ_k) return 1; if (targ_k != BTF_KIND_FWD) return 0; return (local_k == BTF_KIND_UNION) == targ_f; } else { if (local_k != targ_k) return 0; return bpf_core_composites_match(local_btf, local_t, targ_btf, targ_t, behind_ptr, level); } case BTF_KIND_INT: { __u8 local_sgn; __u8 targ_sgn; if (local_k != targ_k) return 0; local_sgn = btf_int_encoding(local_t) & BTF_INT_SIGNED; targ_sgn = btf_int_encoding(targ_t) & BTF_INT_SIGNED; return local_t->size == targ_t->size && local_sgn == targ_sgn; } case BTF_KIND_PTR: if (local_k != targ_k) return 0; behind_ptr = true; local_id = local_t->type; targ_id = targ_t->type; goto recur; case BTF_KIND_ARRAY: { const struct btf_array *local_array = btf_array(local_t); const struct btf_array *targ_array = btf_array(targ_t); if (local_k != targ_k) return 0; if (local_array->nelems != targ_array->nelems) return 0; local_id = local_array->type; targ_id = targ_array->type; goto recur; } case BTF_KIND_FUNC_PROTO: { struct btf_param *local_p = btf_params(local_t); struct btf_param *targ_p = btf_params(targ_t); __u16 local_vlen = btf_vlen(local_t); __u16 targ_vlen = btf_vlen(targ_t); int i, err; if (local_k != targ_k) return 0; if (local_vlen != targ_vlen) return 0; for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { err = __bpf_core_types_match(local_btf, local_p->type, targ_btf, targ_p->type, behind_ptr, level - 1); if (err <= 0) return err; } /* tail recurse for return type check */ local_id = local_t->type; targ_id = targ_t->type; goto recur; } default: pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", btf_kind_str(local_t), local_id, targ_id); return 0; } }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DELAY_H #define _LINUX_DELAY_H /* * Copyright (C) 1993 Linus Torvalds * * Delay routines, using a pre-computed "loops_per_jiffy" value. * Sleep routines using timer list timers or hrtimers. */ #include <linux/math.h> #include <linux/sched.h> #include <linux/jiffies.h> extern unsigned long loops_per_jiffy; #include <asm/delay.h> /* * Using udelay() for intervals greater than a few milliseconds can * risk overflow for high loops_per_jiffy (high bogomips) machines. The * mdelay() provides a wrapper to prevent this. For delays greater * than MAX_UDELAY_MS milliseconds, the wrapper is used. Architecture * specific values can be defined in asm-???/delay.h as an override. * The 2nd mdelay() definition ensures GCC will optimize away the * while loop for the common cases where n <= MAX_UDELAY_MS -- Paul G. */ #ifndef MAX_UDELAY_MS #define MAX_UDELAY_MS 5 #endif #ifndef mdelay /** * mdelay - Inserting a delay based on milliseconds with busy waiting * @n: requested delay in milliseconds * * See udelay() for basic information about mdelay() and it's variants. * * Please double check, whether mdelay() is the right way to go or whether a * refactoring of the code is the better variant to be able to use msleep() * instead. */ #define mdelay(n) (\ (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \ ({unsigned long __ms=(n); while (__ms--) udelay(1000);})) #endif #ifndef ndelay static inline void ndelay(unsigned long x) { udelay(DIV_ROUND_UP(x, 1000)); } #define ndelay(x) ndelay(x) #endif extern unsigned long lpj_fine; void calibrate_delay(void); unsigned long calibrate_delay_is_known(void); void __attribute__((weak)) calibration_delay_done(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); void usleep_range_state(unsigned long min, unsigned long max, unsigned int state); /** * usleep_range - Sleep for an approximate time * @min: Minimum time in microseconds to sleep * @max: Maximum time in microseconds to sleep * * For basic information please refer to usleep_range_state(). * * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep. */ static inline void usleep_range(unsigned long min, unsigned long max) { usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); } /** * usleep_range_idle - Sleep for an approximate time with idle time accounting * @min: Minimum time in microseconds to sleep * @max: Maximum time in microseconds to sleep * * For basic information please refer to usleep_range_state(). * * The sleeping task has the state TASK_IDLE during the sleep to prevent * contribution to the load average. */ static inline void usleep_range_idle(unsigned long min, unsigned long max) { usleep_range_state(min, max, TASK_IDLE); } /** * ssleep - wrapper for seconds around msleep * @seconds: Requested sleep duration in seconds * * Please refer to msleep() for detailed information. */ static inline void ssleep(unsigned int seconds) { msleep(seconds * 1000); } static const unsigned int max_slack_shift = 2; #define USLEEP_RANGE_UPPER_BOUND ((TICK_NSEC << max_slack_shift) / NSEC_PER_USEC) /** * fsleep - flexible sleep which autoselects the best mechanism * @usecs: requested sleep duration in microseconds * * flseep() selects the best mechanism that will provide maximum 25% slack * to the requested sleep duration. Therefore it uses: * * * udelay() loop for sleep durations <= 10 microseconds to avoid hrtimer * overhead for really short sleep durations. * * usleep_range() for sleep durations which would lead with the usage of * msleep() to a slack larger than 25%. This depends on the granularity of * jiffies. * * msleep() for all other sleep durations. * * Note: When %CONFIG_HIGH_RES_TIMERS is not set, all sleeps are processed with * the granularity of jiffies and the slack might exceed 25% especially for * short sleep durations. */ static inline void fsleep(unsigned long usecs) { if (usecs <= 10) udelay(usecs); else if (usecs < USLEEP_RANGE_UPPER_BOUND) usleep_range(usecs, usecs + (usecs >> max_slack_shift)); else msleep(DIV_ROUND_UP(usecs, USEC_PER_MSEC)); } #endif /* defined(_LINUX_DELAY_H) */
2 2 1 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/em_ipset.c ipset ematch * * Copyright (c) 2012 Florian Westphal <fw@strlen.de> */ #include <linux/gfp.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/netfilter/xt_set.h> #include <linux/ipv6.h> #include <net/ip.h> #include <net/pkt_cls.h> static int em_ipset_change(struct net *net, void *data, int data_len, struct tcf_ematch *em) { struct xt_set_info *set = data; ip_set_id_t index; if (data_len != sizeof(*set)) return -EINVAL; index = ip_set_nfnl_get_byindex(net, set->index); if (index == IPSET_INVALID_ID) return -ENOENT; em->datalen = sizeof(*set); em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL); if (em->data) return 0; ip_set_nfnl_put(net, index); return -ENOMEM; } static void em_ipset_destroy(struct tcf_ematch *em) { const struct xt_set_info *set = (const void *) em->data; if (set) { ip_set_nfnl_put(em->net, set->index); kfree((void *) em->data); } } static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { struct ip_set_adt_opt opt; struct xt_action_param acpar; const struct xt_set_info *set = (const void *) em->data; struct net_device *dev, *indev = NULL; struct nf_hook_state state = { .net = em->net, }; int ret, network_offset; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): state.pf = NFPROTO_IPV4; if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) return 0; acpar.thoff = ip_hdrlen(skb); break; case htons(ETH_P_IPV6): state.pf = NFPROTO_IPV6; if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) return 0; /* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */ acpar.thoff = sizeof(struct ipv6hdr); break; default: return 0; } opt.family = state.pf; opt.dim = set->dim; opt.flags = set->flags; opt.cmdflags = 0; opt.ext.timeout = ~0u; network_offset = skb_network_offset(skb); skb_pull(skb, network_offset); dev = skb->dev; rcu_read_lock(); if (skb->skb_iif) indev = dev_get_by_index_rcu(em->net, skb->skb_iif); state.in = indev ? indev : dev; state.out = dev; acpar.state = &state; ret = ip_set_test(set->index, skb, &acpar, &opt); rcu_read_unlock(); skb_push(skb, network_offset); return ret; } static struct tcf_ematch_ops em_ipset_ops = { .kind = TCF_EM_IPSET, .change = em_ipset_change, .destroy = em_ipset_destroy, .match = em_ipset_match, .owner = THIS_MODULE, .link = LIST_HEAD_INIT(em_ipset_ops.link) }; static int __init init_em_ipset(void) { return tcf_em_register(&em_ipset_ops); } static void __exit exit_em_ipset(void) { tcf_em_unregister(&em_ipset_ops); } MODULE_LICENSE("GPL"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_DESCRIPTION("TC extended match for IP sets"); module_init(init_em_ipset); module_exit(exit_em_ipset); MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPSET);
91 227 4 222 222 206 206 93 93 35 57 36 57 57 91 93 91 7 88 93 93 92 92 92 93 88 89 88 89 88 89 11 11 11 11 11 11 213 212 167 67 64 2 62 62 67 67 7 6 1 5 5 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 // SPDX-License-Identifier: GPL-2.0-only /* * GCM: Galois/Counter Mode. * * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> */ #include <crypto/gf128mul.h> #include <crypto/internal/aead.h> #include <crypto/internal/skcipher.h> #include <crypto/internal/hash.h> #include <crypto/scatterwalk.h> #include <crypto/gcm.h> #include <crypto/hash.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> struct gcm_instance_ctx { struct crypto_skcipher_spawn ctr; struct crypto_ahash_spawn ghash; }; struct crypto_gcm_ctx { struct crypto_skcipher *ctr; struct crypto_ahash *ghash; }; struct crypto_rfc4106_ctx { struct crypto_aead *child; u8 nonce[4]; }; struct crypto_rfc4106_req_ctx { struct scatterlist src[3]; struct scatterlist dst[3]; struct aead_request subreq; }; struct crypto_rfc4543_instance_ctx { struct crypto_aead_spawn aead; }; struct crypto_rfc4543_ctx { struct crypto_aead *child; u8 nonce[4]; }; struct crypto_rfc4543_req_ctx { struct aead_request subreq; }; struct crypto_gcm_ghash_ctx { unsigned int cryptlen; struct scatterlist *src; int (*complete)(struct aead_request *req, u32 flags); }; struct crypto_gcm_req_priv_ctx { u8 iv[16]; u8 auth_tag[16]; u8 iauth_tag[16]; struct scatterlist src[3]; struct scatterlist dst[3]; struct scatterlist sg; struct crypto_gcm_ghash_ctx ghash_ctx; union { struct ahash_request ahreq; struct skcipher_request skreq; } u; }; static struct { u8 buf[16]; struct scatterlist sg; } *gcm_zeroes; static inline struct crypto_gcm_req_priv_ctx *crypto_gcm_reqctx( struct aead_request *req) { unsigned long align = crypto_aead_alignmask(crypto_aead_reqtfm(req)); return (void *)PTR_ALIGN((u8 *)aead_request_ctx(req), align + 1); } static int crypto_gcm_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { struct crypto_gcm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_ahash *ghash = ctx->ghash; struct crypto_skcipher *ctr = ctx->ctr; struct { be128 hash; u8 iv[16]; struct crypto_wait wait; struct scatterlist sg[1]; struct skcipher_request req; } *data; int err; crypto_skcipher_clear_flags(ctr, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(ctr, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(ctr, key, keylen); if (err) return err; data = kzalloc(sizeof(*data) + crypto_skcipher_reqsize(ctr), GFP_KERNEL); if (!data) return -ENOMEM; crypto_init_wait(&data->wait); sg_init_one(data->sg, &data->hash, sizeof(data->hash)); skcipher_request_set_tfm(&data->req, ctr); skcipher_request_set_callback(&data->req, CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &data->wait); skcipher_request_set_crypt(&data->req, data->sg, data->sg, sizeof(data->hash), data->iv); err = crypto_wait_req(crypto_skcipher_encrypt(&data->req), &data->wait); if (err) goto out; crypto_ahash_clear_flags(ghash, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(ghash, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); err = crypto_ahash_setkey(ghash, (u8 *)&data->hash, sizeof(be128)); out: kfree_sensitive(data); return err; } static int crypto_gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { return crypto_gcm_check_authsize(authsize); } static void crypto_gcm_init_common(struct aead_request *req) { struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); __be32 counter = cpu_to_be32(1); struct scatterlist *sg; memset(pctx->auth_tag, 0, sizeof(pctx->auth_tag)); memcpy(pctx->iv, req->iv, GCM_AES_IV_SIZE); memcpy(pctx->iv + GCM_AES_IV_SIZE, &counter, 4); sg_init_table(pctx->src, 3); sg_set_buf(pctx->src, pctx->auth_tag, sizeof(pctx->auth_tag)); sg = scatterwalk_ffwd(pctx->src + 1, req->src, req->assoclen); if (sg != pctx->src + 1) sg_chain(pctx->src, 2, sg); if (req->src != req->dst) { sg_init_table(pctx->dst, 3); sg_set_buf(pctx->dst, pctx->auth_tag, sizeof(pctx->auth_tag)); sg = scatterwalk_ffwd(pctx->dst + 1, req->dst, req->assoclen); if (sg != pctx->dst + 1) sg_chain(pctx->dst, 2, sg); } } static void crypto_gcm_init_crypt(struct aead_request *req, unsigned int cryptlen) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_gcm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); struct skcipher_request *skreq = &pctx->u.skreq; struct scatterlist *dst; dst = req->src == req->dst ? pctx->src : pctx->dst; skcipher_request_set_tfm(skreq, ctx->ctr); skcipher_request_set_crypt(skreq, pctx->src, dst, cryptlen + sizeof(pctx->auth_tag), pctx->iv); } static inline unsigned int gcm_remain(unsigned int len) { len &= 0xfU; return len ? 16 - len : 0; } static void gcm_hash_len_done(void *data, int err); static int gcm_hash_update(struct aead_request *req, crypto_completion_t compl, struct scatterlist *src, unsigned int len, u32 flags) { struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); struct ahash_request *ahreq = &pctx->u.ahreq; ahash_request_set_callback(ahreq, flags, compl, req); ahash_request_set_crypt(ahreq, src, NULL, len); return crypto_ahash_update(ahreq); } static int gcm_hash_remain(struct aead_request *req, unsigned int remain, crypto_completion_t compl, u32 flags) { return gcm_hash_update(req, compl, &gcm_zeroes->sg, remain, flags); } static int gcm_hash_len(struct aead_request *req, u32 flags) { struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); struct ahash_request *ahreq = &pctx->u.ahreq; struct crypto_gcm_ghash_ctx *gctx = &pctx->ghash_ctx; be128 lengths; lengths.a = cpu_to_be64(req->assoclen * 8); lengths.b = cpu_to_be64(gctx->cryptlen * 8); memcpy(pctx->iauth_tag, &lengths, 16); sg_init_one(&pctx->sg, pctx->iauth_tag, 16); ahash_request_set_callback(ahreq, flags, gcm_hash_len_done, req); ahash_request_set_crypt(ahreq, &pctx->sg, pctx->iauth_tag, sizeof(lengths)); return crypto_ahash_finup(ahreq); } static int gcm_hash_len_continue(struct aead_request *req, u32 flags) { struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); struct crypto_gcm_ghash_ctx *gctx = &pctx->ghash_ctx; return gctx->complete(req, flags); } static void gcm_hash_len_done(void *data, int err) { struct aead_request *req = data; if (err) goto out; err = gcm_hash_len_continue(req, 0); if (err == -EINPROGRESS) return; out: aead_request_complete(req, err); } static int gcm_hash_crypt_remain_continue(struct aead_request *req, u32 flags) { return gcm_hash_len(req, flags) ?: gcm_hash_len_continue(req, flags); } static void gcm_hash_crypt_remain_done(void *data, int err) { struct aead_request *req = data; if (err) goto out; err = gcm_hash_crypt_remain_continue(req, 0); if (err == -EINPROGRESS) return; out: aead_request_complete(req, err); } static int gcm_hash_crypt_continue(struct aead_request *req, u32 flags) { struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); struct crypto_gcm_ghash_ctx *gctx = &pctx->ghash_ctx; unsigned int remain; remain = gcm_remain(gctx->cryptlen); if (remain) return gcm_hash_remain(req, remain, gcm_hash_crypt_remain_done, flags) ?: gcm_hash_crypt_remain_continue(req, flags); return gcm_hash_crypt_remain_continue(req, flags); } static void gcm_hash_crypt_done(void *data, int err) { struct aead_request *req = data; if (err) goto out; err = gcm_hash_crypt_continue(req, 0); if (err == -EINPROGRESS) return; out: aead_request_complete(req, err); } static int gcm_hash_assoc_remain_continue(struct aead_request *req, u32 flags) { struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); struct crypto_gcm_ghash_ctx *gctx = &pctx->ghash_ctx; if (gctx->cryptlen) return gcm_hash_update(req, gcm_hash_crypt_done, gctx->src, gctx->cryptlen, flags) ?: gcm_hash_crypt_continue(req, flags); return gcm_hash_crypt_remain_continue(req, flags); } static void gcm_hash_assoc_remain_done(void *data, int err) { struct aead_request *req = data; if (err) goto out; err = gcm_hash_assoc_remain_continue(req, 0); if (err == -EINPROGRESS) return; out: aead_request_complete(req, err); } static int gcm_hash_assoc_continue(struct aead_request *req, u32 flags) { unsigned int remain; remain = gcm_remain(req->assoclen); if (remain) return gcm_hash_remain(req, remain, gcm_hash_assoc_remain_done, flags) ?: gcm_hash_assoc_remain_continue(req, flags); return gcm_hash_assoc_remain_continue(req, flags); } static void gcm_hash_assoc_done(void *data, int err) { struct aead_request *req = data; if (err) goto out; err = gcm_hash_assoc_continue(req, 0); if (err == -EINPROGRESS) return; out: aead_request_complete(req, err); } static int gcm_hash_init_continue(struct aead_request *req, u32 flags) { if (req->assoclen) return gcm_hash_update(req, gcm_hash_assoc_done, req->src, req->assoclen, flags) ?: gcm_hash_assoc_continue(req, flags); return gcm_hash_assoc_remain_continue(req, flags); } static void gcm_hash_init_done(void *data, int err) { struct aead_request *req = data; if (err) goto out; err = gcm_hash_init_continue(req, 0); if (err == -EINPROGRESS) return; out: aead_request_complete(req, err); } static int gcm_hash(struct aead_request *req, u32 flags) { struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); struct ahash_request *ahreq = &pctx->u.ahreq; struct crypto_gcm_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); ahash_request_set_tfm(ahreq, ctx->ghash); ahash_request_set_callback(ahreq, flags, gcm_hash_init_done, req); return crypto_ahash_init(ahreq) ?: gcm_hash_init_continue(req, flags); } static int gcm_enc_copy_hash(struct aead_request *req, u32 flags) { struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); struct crypto_aead *aead = crypto_aead_reqtfm(req); u8 *auth_tag = pctx->auth_tag; crypto_xor(auth_tag, pctx->iauth_tag, 16); scatterwalk_map_and_copy(auth_tag, req->dst, req->assoclen + req->cryptlen, crypto_aead_authsize(aead), 1); return 0; } static int gcm_encrypt_continue(struct aead_request *req, u32 flags) { struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); struct crypto_gcm_ghash_ctx *gctx = &pctx->ghash_ctx; gctx->src = sg_next(req->src == req->dst ? pctx->src : pctx->dst); gctx->cryptlen = req->cryptlen; gctx->complete = gcm_enc_copy_hash; return gcm_hash(req, flags); } static void gcm_encrypt_done(void *data, int err) { struct aead_request *req = data; if (err) goto out; err = gcm_encrypt_continue(req, 0); if (err == -EINPROGRESS) return; out: aead_request_complete(req, err); } static int crypto_gcm_encrypt(struct aead_request *req) { struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); struct skcipher_request *skreq = &pctx->u.skreq; u32 flags = aead_request_flags(req); crypto_gcm_init_common(req); crypto_gcm_init_crypt(req, req->cryptlen); skcipher_request_set_callback(skreq, flags, gcm_encrypt_done, req); return crypto_skcipher_encrypt(skreq) ?: gcm_encrypt_continue(req, flags); } static int crypto_gcm_verify(struct aead_request *req) { struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); struct crypto_aead *aead = crypto_aead_reqtfm(req); u8 *auth_tag = pctx->auth_tag; u8 *iauth_tag = pctx->iauth_tag; unsigned int authsize = crypto_aead_authsize(aead); unsigned int cryptlen = req->cryptlen - authsize; crypto_xor(auth_tag, iauth_tag, 16); scatterwalk_map_and_copy(iauth_tag, req->src, req->assoclen + cryptlen, authsize, 0); return crypto_memneq(iauth_tag, auth_tag, authsize) ? -EBADMSG : 0; } static void gcm_decrypt_done(void *data, int err) { struct aead_request *req = data; if (!err) err = crypto_gcm_verify(req); aead_request_complete(req, err); } static int gcm_dec_hash_continue(struct aead_request *req, u32 flags) { struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); struct skcipher_request *skreq = &pctx->u.skreq; struct crypto_gcm_ghash_ctx *gctx = &pctx->ghash_ctx; crypto_gcm_init_crypt(req, gctx->cryptlen); skcipher_request_set_callback(skreq, flags, gcm_decrypt_done, req); return crypto_skcipher_decrypt(skreq) ?: crypto_gcm_verify(req); } static int crypto_gcm_decrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req); struct crypto_gcm_ghash_ctx *gctx = &pctx->ghash_ctx; unsigned int authsize = crypto_aead_authsize(aead); unsigned int cryptlen = req->cryptlen; u32 flags = aead_request_flags(req); cryptlen -= authsize; crypto_gcm_init_common(req); gctx->src = sg_next(pctx->src); gctx->cryptlen = cryptlen; gctx->complete = gcm_dec_hash_continue; return gcm_hash(req, flags); } static int crypto_gcm_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct gcm_instance_ctx *ictx = aead_instance_ctx(inst); struct crypto_gcm_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_skcipher *ctr; struct crypto_ahash *ghash; unsigned long align; int err; ghash = crypto_spawn_ahash(&ictx->ghash); if (IS_ERR(ghash)) return PTR_ERR(ghash); ctr = crypto_spawn_skcipher(&ictx->ctr); err = PTR_ERR(ctr); if (IS_ERR(ctr)) goto err_free_hash; ctx->ctr = ctr; ctx->ghash = ghash; align = crypto_aead_alignmask(tfm); align &= ~(crypto_tfm_ctx_alignment() - 1); crypto_aead_set_reqsize(tfm, align + offsetof(struct crypto_gcm_req_priv_ctx, u) + max(sizeof(struct skcipher_request) + crypto_skcipher_reqsize(ctr), sizeof(struct ahash_request) + crypto_ahash_reqsize(ghash))); return 0; err_free_hash: crypto_free_ahash(ghash); return err; } static void crypto_gcm_exit_tfm(struct crypto_aead *tfm) { struct crypto_gcm_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_ahash(ctx->ghash); crypto_free_skcipher(ctx->ctr); } static void crypto_gcm_free(struct aead_instance *inst) { struct gcm_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_skcipher(&ctx->ctr); crypto_drop_ahash(&ctx->ghash); kfree(inst); } static int crypto_gcm_create_common(struct crypto_template *tmpl, struct rtattr **tb, const char *ctr_name, const char *ghash_name) { struct skcipher_alg_common *ctr; u32 mask; struct aead_instance *inst; struct gcm_instance_ctx *ctx; struct hash_alg_common *ghash; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; ctx = aead_instance_ctx(inst); err = crypto_grab_ahash(&ctx->ghash, aead_crypto_instance(inst), ghash_name, 0, mask); if (err) goto err_free_inst; ghash = crypto_spawn_ahash_alg(&ctx->ghash); err = -EINVAL; if (strcmp(ghash->base.cra_name, "ghash") != 0 || ghash->digestsize != 16) goto err_free_inst; err = crypto_grab_skcipher(&ctx->ctr, aead_crypto_instance(inst), ctr_name, 0, mask); if (err) goto err_free_inst; ctr = crypto_spawn_skcipher_alg_common(&ctx->ctr); /* The skcipher algorithm must be CTR mode, using 16-byte blocks. */ err = -EINVAL; if (strncmp(ctr->base.cra_name, "ctr(", 4) != 0 || ctr->ivsize != 16 || ctr->base.cra_blocksize != 1) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "gcm(%s", ctr->base.cra_name + 4) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "gcm_base(%s,%s)", ctr->base.cra_driver_name, ghash->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = (ghash->base.cra_priority + ctr->base.cra_priority) / 2; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = ctr->base.cra_alignmask; inst->alg.base.cra_ctxsize = sizeof(struct crypto_gcm_ctx); inst->alg.ivsize = GCM_AES_IV_SIZE; inst->alg.chunksize = ctr->chunksize; inst->alg.maxauthsize = 16; inst->alg.init = crypto_gcm_init_tfm; inst->alg.exit = crypto_gcm_exit_tfm; inst->alg.setkey = crypto_gcm_setkey; inst->alg.setauthsize = crypto_gcm_setauthsize; inst->alg.encrypt = crypto_gcm_encrypt; inst->alg.decrypt = crypto_gcm_decrypt; inst->free = crypto_gcm_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_gcm_free(inst); } return err; } static int crypto_gcm_create(struct crypto_template *tmpl, struct rtattr **tb) { const char *cipher_name; char ctr_name[CRYPTO_MAX_ALG_NAME]; cipher_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(cipher_name)) return PTR_ERR(cipher_name); if (snprintf(ctr_name, CRYPTO_MAX_ALG_NAME, "ctr(%s)", cipher_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; return crypto_gcm_create_common(tmpl, tb, ctr_name, "ghash"); } static int crypto_gcm_base_create(struct crypto_template *tmpl, struct rtattr **tb) { const char *ctr_name; const char *ghash_name; ctr_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(ctr_name)) return PTR_ERR(ctr_name); ghash_name = crypto_attr_alg_name(tb[2]); if (IS_ERR(ghash_name)) return PTR_ERR(ghash_name); return crypto_gcm_create_common(tmpl, tb, ctr_name, ghash_name); } static int crypto_rfc4106_setkey(struct crypto_aead *parent, const u8 *key, unsigned int keylen) { struct crypto_rfc4106_ctx *ctx = crypto_aead_ctx(parent); struct crypto_aead *child = ctx->child; if (keylen < 4) return -EINVAL; keylen -= 4; memcpy(ctx->nonce, key + keylen, 4); crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(child, crypto_aead_get_flags(parent) & CRYPTO_TFM_REQ_MASK); return crypto_aead_setkey(child, key, keylen); } static int crypto_rfc4106_setauthsize(struct crypto_aead *parent, unsigned int authsize) { struct crypto_rfc4106_ctx *ctx = crypto_aead_ctx(parent); int err; err = crypto_rfc4106_check_authsize(authsize); if (err) return err; return crypto_aead_setauthsize(ctx->child, authsize); } static struct aead_request *crypto_rfc4106_crypt(struct aead_request *req) { struct crypto_rfc4106_req_ctx *rctx = aead_request_ctx(req); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_rfc4106_ctx *ctx = crypto_aead_ctx(aead); struct aead_request *subreq = &rctx->subreq; struct crypto_aead *child = ctx->child; struct scatterlist *sg; u8 *iv = PTR_ALIGN((u8 *)(subreq + 1) + crypto_aead_reqsize(child), crypto_aead_alignmask(child) + 1); scatterwalk_map_and_copy(iv + GCM_AES_IV_SIZE, req->src, 0, req->assoclen - 8, 0); memcpy(iv, ctx->nonce, 4); memcpy(iv + 4, req->iv, 8); sg_init_table(rctx->src, 3); sg_set_buf(rctx->src, iv + GCM_AES_IV_SIZE, req->assoclen - 8); sg = scatterwalk_ffwd(rctx->src + 1, req->src, req->assoclen); if (sg != rctx->src + 1) sg_chain(rctx->src, 2, sg); if (req->src != req->dst) { sg_init_table(rctx->dst, 3); sg_set_buf(rctx->dst, iv + GCM_AES_IV_SIZE, req->assoclen - 8); sg = scatterwalk_ffwd(rctx->dst + 1, req->dst, req->assoclen); if (sg != rctx->dst + 1) sg_chain(rctx->dst, 2, sg); } aead_request_set_tfm(subreq, child); aead_request_set_callback(subreq, req->base.flags, req->base.complete, req->base.data); aead_request_set_crypt(subreq, rctx->src, req->src == req->dst ? rctx->src : rctx->dst, req->cryptlen, iv); aead_request_set_ad(subreq, req->assoclen - 8); return subreq; } static int crypto_rfc4106_encrypt(struct aead_request *req) { int err; err = crypto_ipsec_check_assoclen(req->assoclen); if (err) return err; req = crypto_rfc4106_crypt(req); return crypto_aead_encrypt(req); } static int crypto_rfc4106_decrypt(struct aead_request *req) { int err; err = crypto_ipsec_check_assoclen(req->assoclen); if (err) return err; req = crypto_rfc4106_crypt(req); return crypto_aead_decrypt(req); } static int crypto_rfc4106_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct crypto_aead_spawn *spawn = aead_instance_ctx(inst); struct crypto_rfc4106_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *aead; unsigned long align; aead = crypto_spawn_aead(spawn); if (IS_ERR(aead)) return PTR_ERR(aead); ctx->child = aead; align = crypto_aead_alignmask(aead); align &= ~(crypto_tfm_ctx_alignment() - 1); crypto_aead_set_reqsize( tfm, sizeof(struct crypto_rfc4106_req_ctx) + ALIGN(crypto_aead_reqsize(aead), crypto_tfm_ctx_alignment()) + align + 24); return 0; } static void crypto_rfc4106_exit_tfm(struct crypto_aead *tfm) { struct crypto_rfc4106_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); } static void crypto_rfc4106_free(struct aead_instance *inst) { crypto_drop_aead(aead_instance_ctx(inst)); kfree(inst); } static int crypto_rfc4106_create(struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct aead_instance *inst; struct crypto_aead_spawn *spawn; struct aead_alg *alg; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = aead_instance_ctx(inst); err = crypto_grab_aead(spawn, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(spawn); err = -EINVAL; /* Underlying IV size must be 12. */ if (crypto_aead_alg_ivsize(alg) != GCM_AES_IV_SIZE) goto err_free_inst; /* Not a stream cipher? */ if (alg->base.cra_blocksize != 1) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "rfc4106(%s)", alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME || snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "rfc4106(%s)", alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = alg->base.cra_priority; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = alg->base.cra_alignmask; inst->alg.base.cra_ctxsize = sizeof(struct crypto_rfc4106_ctx); inst->alg.ivsize = GCM_RFC4106_IV_SIZE; inst->alg.chunksize = crypto_aead_alg_chunksize(alg); inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(alg); inst->alg.init = crypto_rfc4106_init_tfm; inst->alg.exit = crypto_rfc4106_exit_tfm; inst->alg.setkey = crypto_rfc4106_setkey; inst->alg.setauthsize = crypto_rfc4106_setauthsize; inst->alg.encrypt = crypto_rfc4106_encrypt; inst->alg.decrypt = crypto_rfc4106_decrypt; inst->free = crypto_rfc4106_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_rfc4106_free(inst); } return err; } static int crypto_rfc4543_setkey(struct crypto_aead *parent, const u8 *key, unsigned int keylen) { struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(parent); struct crypto_aead *child = ctx->child; if (keylen < 4) return -EINVAL; keylen -= 4; memcpy(ctx->nonce, key + keylen, 4); crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(child, crypto_aead_get_flags(parent) & CRYPTO_TFM_REQ_MASK); return crypto_aead_setkey(child, key, keylen); } static int crypto_rfc4543_setauthsize(struct crypto_aead *parent, unsigned int authsize) { struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(parent); if (authsize != 16) return -EINVAL; return crypto_aead_setauthsize(ctx->child, authsize); } static int crypto_rfc4543_crypt(struct aead_request *req, bool enc) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead); struct crypto_rfc4543_req_ctx *rctx = aead_request_ctx(req); struct aead_request *subreq = &rctx->subreq; unsigned int authsize = crypto_aead_authsize(aead); u8 *iv = PTR_ALIGN((u8 *)(rctx + 1) + crypto_aead_reqsize(ctx->child), crypto_aead_alignmask(ctx->child) + 1); if (req->src != req->dst) { unsigned int nbytes = req->assoclen + req->cryptlen - (enc ? 0 : authsize); memcpy_sglist(req->dst, req->src, nbytes); } memcpy(iv, ctx->nonce, 4); memcpy(iv + 4, req->iv, 8); aead_request_set_tfm(subreq, ctx->child); aead_request_set_callback(subreq, req->base.flags, req->base.complete, req->base.data); aead_request_set_crypt(subreq, req->src, req->dst, enc ? 0 : authsize, iv); aead_request_set_ad(subreq, req->assoclen + req->cryptlen - subreq->cryptlen); return enc ? crypto_aead_encrypt(subreq) : crypto_aead_decrypt(subreq); } static int crypto_rfc4543_encrypt(struct aead_request *req) { return crypto_ipsec_check_assoclen(req->assoclen) ?: crypto_rfc4543_crypt(req, true); } static int crypto_rfc4543_decrypt(struct aead_request *req) { return crypto_ipsec_check_assoclen(req->assoclen) ?: crypto_rfc4543_crypt(req, false); } static int crypto_rfc4543_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct crypto_rfc4543_instance_ctx *ictx = aead_instance_ctx(inst); struct crypto_aead_spawn *spawn = &ictx->aead; struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *aead; unsigned long align; aead = crypto_spawn_aead(spawn); if (IS_ERR(aead)) return PTR_ERR(aead); ctx->child = aead; align = crypto_aead_alignmask(aead); align &= ~(crypto_tfm_ctx_alignment() - 1); crypto_aead_set_reqsize( tfm, sizeof(struct crypto_rfc4543_req_ctx) + ALIGN(crypto_aead_reqsize(aead), crypto_tfm_ctx_alignment()) + align + GCM_AES_IV_SIZE); return 0; } static void crypto_rfc4543_exit_tfm(struct crypto_aead *tfm) { struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); } static void crypto_rfc4543_free(struct aead_instance *inst) { struct crypto_rfc4543_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_aead(&ctx->aead); kfree(inst); } static int crypto_rfc4543_create(struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct aead_instance *inst; struct aead_alg *alg; struct crypto_rfc4543_instance_ctx *ctx; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; ctx = aead_instance_ctx(inst); err = crypto_grab_aead(&ctx->aead, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(&ctx->aead); err = -EINVAL; /* Underlying IV size must be 12. */ if (crypto_aead_alg_ivsize(alg) != GCM_AES_IV_SIZE) goto err_free_inst; /* Not a stream cipher? */ if (alg->base.cra_blocksize != 1) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "rfc4543(%s)", alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME || snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "rfc4543(%s)", alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = alg->base.cra_priority; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = alg->base.cra_alignmask; inst->alg.base.cra_ctxsize = sizeof(struct crypto_rfc4543_ctx); inst->alg.ivsize = GCM_RFC4543_IV_SIZE; inst->alg.chunksize = crypto_aead_alg_chunksize(alg); inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(alg); inst->alg.init = crypto_rfc4543_init_tfm; inst->alg.exit = crypto_rfc4543_exit_tfm; inst->alg.setkey = crypto_rfc4543_setkey; inst->alg.setauthsize = crypto_rfc4543_setauthsize; inst->alg.encrypt = crypto_rfc4543_encrypt; inst->alg.decrypt = crypto_rfc4543_decrypt; inst->free = crypto_rfc4543_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_rfc4543_free(inst); } return err; } static struct crypto_template crypto_gcm_tmpls[] = { { .name = "gcm_base", .create = crypto_gcm_base_create, .module = THIS_MODULE, }, { .name = "gcm", .create = crypto_gcm_create, .module = THIS_MODULE, }, { .name = "rfc4106", .create = crypto_rfc4106_create, .module = THIS_MODULE, }, { .name = "rfc4543", .create = crypto_rfc4543_create, .module = THIS_MODULE, }, }; static int __init crypto_gcm_module_init(void) { int err; gcm_zeroes = kzalloc(sizeof(*gcm_zeroes), GFP_KERNEL); if (!gcm_zeroes) return -ENOMEM; sg_init_one(&gcm_zeroes->sg, gcm_zeroes->buf, sizeof(gcm_zeroes->buf)); err = crypto_register_templates(crypto_gcm_tmpls, ARRAY_SIZE(crypto_gcm_tmpls)); if (err) kfree(gcm_zeroes); return err; } static void __exit crypto_gcm_module_exit(void) { kfree(gcm_zeroes); crypto_unregister_templates(crypto_gcm_tmpls, ARRAY_SIZE(crypto_gcm_tmpls)); } module_init(crypto_gcm_module_init); module_exit(crypto_gcm_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Galois/Counter Mode"); MODULE_AUTHOR("Mikko Herranen <mh1@iki.fi>"); MODULE_ALIAS_CRYPTO("gcm_base"); MODULE_ALIAS_CRYPTO("rfc4106"); MODULE_ALIAS_CRYPTO("rfc4543"); MODULE_ALIAS_CRYPTO("gcm");
19 12 1 6 13 5 1 26 131 9 18 15 15 132 2 169 39 21 59 59 59 120 12 112 9 152 154 150 3 140 2 137 165 165 150 12 52 94 137 1 1 2 137 390 1 19 134 138 138 95 386 1 389 386 388 214 18 9 128 58 9 174 25 142 1 44 87 36 163 22 150 150 16 138 138 137 138 39 23 24 21 1 22 22 20 9 21 1 1 12 10 20 9 5 6 6 6 2 4 2 4 6 4 2 6 6 6 1 1 5 6 6 6 6 113 113 62 17 97 6 82 23 265 262 262 2 264 265 246 246 241 166 6 1 1 49 2 2 1 15 109 18 20 308 7 2 4 1 538 538 538 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 // SPDX-License-Identifier: GPL-2.0-or-later /* * Internet Control Message Protocol (ICMPv6) * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on net/ipv4/icmp.c * * RFC 1885 */ /* * Changes: * * Andi Kleen : exception handling * Andi Kleen add rate limits. never reply to a icmp. * add more length checks and other fixes. * yoshfuji : ensure to sent parameter problem for * fragments. * YOSHIFUJI Hideaki @USAGI: added sysctl for icmp rate limit. * Randy Dunlap and * YOSHIFUJI Hideaki @USAGI: Per-interface statistics support * Kazunori MIYAZAWA @USAGI: change output process to use ip6_append_data */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/netfilter.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/icmpv6.h> #include <net/ip.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/ip6_checksum.h> #include <net/ping.h> #include <net/protocol.h> #include <net/raw.h> #include <net/rawv6.h> #include <net/seg6.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/inet_common.h> #include <net/dsfield.h> #include <net/l3mdev.h> #include <linux/uaccess.h> static DEFINE_PER_CPU(struct sock *, ipv6_icmp_sk); static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); struct net *net = dev_net_rcu(skb->dev); if (type == ICMPV6_PKT_TOOBIG) ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) ping_err(skb, offset, ntohl(info)); return 0; } static int icmpv6_rcv(struct sk_buff *skb); static const struct inet6_protocol icmpv6_protocol = { .handler = icmpv6_rcv, .err_handler = icmpv6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; /* Called with BH disabled */ static struct sock *icmpv6_xmit_lock(struct net *net) { struct sock *sk; sk = this_cpu_read(ipv6_icmp_sk); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path (f.e. SIT or * ip6ip6 tunnel) signals dst_link_failure() for an * outgoing ICMP6 packet. */ return NULL; } sock_net_set(sk, net); return sk; } static void icmpv6_xmit_unlock(struct sock *sk) { sock_net_set(sk, &init_net); spin_unlock(&sk->sk_lock.slock); } /* * Figure out, may we reply to this packet with icmp error. * * We do not reply, if: * - it was icmp error message. * - it is truncated, so that it is known, that protocol is ICMPV6 * (i.e. in the middle of some exthdr) * * --ANK (980726) */ static bool is_ineligible(const struct sk_buff *skb) { int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data; int len = skb->len - ptr; __u8 nexthdr = ipv6_hdr(skb)->nexthdr; __be16 frag_off; if (len < 0) return true; ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, &frag_off); if (ptr < 0) return false; if (nexthdr == IPPROTO_ICMPV6) { u8 _type, *tp; tp = skb_header_pointer(skb, ptr+offsetof(struct icmp6hdr, icmp6_type), sizeof(_type), &_type); /* Based on RFC 8200, Section 4.5 Fragment Header, return * false if this is a fragment packet with no icmp header info. */ if (!tp && frag_off != 0) return false; else if (!tp || !(*tp & ICMPV6_INFOMSG_MASK)) return true; } return false; } static bool icmpv6_mask_allow(struct net *net, int type) { if (type > ICMPV6_MSG_MAX) return true; /* Limit if icmp type is set in ratemask. */ if (!test_bit(type, net->ipv6.sysctl.icmpv6_ratemask)) return true; return false; } static bool icmpv6_global_allow(struct net *net, int type, bool *apply_ratelimit) { if (icmpv6_mask_allow(net, type)) return true; if (icmp_global_allow(net)) { *apply_ratelimit = true; return true; } __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } /* * Check the ICMP output rate limit */ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct flowi6 *fl6, bool apply_ratelimit) { struct net *net = sock_net(sk); struct net_device *dev; struct dst_entry *dst; bool res = false; if (!apply_ratelimit) return true; /* * Look up the output route. * XXX: perhaps the expire for routing entries cloned by * this lookup should be more aggressive (not longer than timeout). */ dst = ip6_route_output(net, sk, fl6); rcu_read_lock(); dev = dst_dev_rcu(dst); if (dst->error) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); } else if (dev && (dev->flags & IFF_LOOPBACK)) { res = true; } else { struct rt6_info *rt = dst_rt6_info(dst); int tmo = net->ipv6.sysctl.icmpv6_time; struct inet_peer *peer; /* Give more bandwidth to wider prefixes. */ if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); res = inet_peer_xrlim_allow(peer, tmo); } rcu_read_unlock(); if (!res) __ICMP6_INC_STATS(net, NULL, ICMP6_MIB_RATELIMITHOST); else icmp_global_consume(net); dst_release(dst); return res; } static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type, struct flowi6 *fl6) { struct net *net = sock_net(sk); struct dst_entry *dst; bool res = false; dst = ip6_route_output(net, sk, fl6); if (!dst->error) { struct rt6_info *rt = dst_rt6_info(dst); struct in6_addr prefsrc; rt6_get_prefsrc(rt, &prefsrc); res = !ipv6_addr_any(&prefsrc); } dst_release(dst); return res; } /* * an inline helper for the "simple" if statement below * checks if parameter problem report is caused by an * unrecognized IPv6 option that has the Option Type * highest-order two bits set to 10 */ static bool opt_unrec(struct sk_buff *skb, __u32 offset) { u8 _optval, *op; offset += skb_network_offset(skb); op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval); if (!op) return true; return (*op & 0xC0) == 0x80; } void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len) { struct sk_buff *skb; struct icmp6hdr *icmp6h; skb = skb_peek(&sk->sk_write_queue); if (!skb) return; icmp6h = icmp6_hdr(skb); memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); icmp6h->icmp6_cksum = 0; if (skb_queue_len(&sk->sk_write_queue) == 1) { skb->csum = csum_partial(icmp6h, sizeof(struct icmp6hdr), skb->csum); icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, len, fl6->flowi6_proto, skb->csum); } else { __wsum tmp_csum = 0; skb_queue_walk(&sk->sk_write_queue, skb) { tmp_csum = csum_add(tmp_csum, skb->csum); } tmp_csum = csum_partial(icmp6h, sizeof(struct icmp6hdr), tmp_csum); icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, len, fl6->flowi6_proto, tmp_csum); } ip6_push_pending_frames(sk); } struct icmpv6_msg { struct sk_buff *skb; int offset; uint8_t type; }; static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct icmpv6_msg *msg = (struct icmpv6_msg *) from; struct sk_buff *org_skb = msg->skb; __wsum csum; csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset, to, len); skb->csum = csum_block_add(skb->csum, csum, odd); if (!(msg->type & ICMPV6_INFOMSG_MASK)) nf_ct_attach(skb, org_skb); return 0; } #if IS_ENABLED(CONFIG_IPV6_MIP6) static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) { struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hao *hao; int off; if (opt->dsthao) { off = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO); if (likely(off >= 0)) { hao = (struct ipv6_destopt_hao *) (skb_network_header(skb) + off); swap(iph->saddr, hao->addr); } } } #else static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {} #endif static struct dst_entry *icmpv6_route_lookup(struct net *net, struct sk_buff *skb, struct sock *sk, struct flowi6 *fl6) { struct dst_entry *dst, *dst2; struct flowi6 fl2; int err; err = ip6_dst_lookup(net, sk, &dst, fl6); if (err) return ERR_PTR(err); /* * We won't send icmp if the destination is known * anycast unless we need to treat anycast as unicast. */ if (!READ_ONCE(net->ipv6.sysctl.icmpv6_error_anycast_as_unicast) && ipv6_anycast_destination(dst, &fl6->daddr)) { net_dbg_ratelimited("icmp6_send: acast source\n"); dst_release(dst); return ERR_PTR(-EINVAL); } /* No need to clone since we're just using its address. */ dst2 = dst; dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), sk, 0); if (!IS_ERR(dst)) { if (dst != dst2) return dst; } else { if (PTR_ERR(dst) == -EPERM) dst = NULL; else return dst; } err = xfrm_decode_session_reverse(net, skb, flowi6_to_flowi(&fl2), AF_INET6); if (err) goto relookup_failed; err = ip6_dst_lookup(net, sk, &dst2, &fl2); if (err) goto relookup_failed; dst2 = xfrm_lookup(net, dst2, flowi6_to_flowi(&fl2), sk, XFRM_LOOKUP_ICMP); if (!IS_ERR(dst2)) { dst_release(dst); dst = dst2; } else { err = PTR_ERR(dst2); if (err == -EPERM) { dst_release(dst); return dst2; } else goto relookup_failed; } relookup_failed: if (dst) return dst; return ERR_PTR(err); } static struct net_device *icmp6_dev(const struct sk_buff *skb) { struct net_device *dev = skb->dev; /* for local traffic to local address, skb dev is the loopback * device. Check if there is a dst attached to the skb and if so * get the real device index. Same is needed for replies to a link * local address on a device enslaved to an L3 master device */ if (unlikely(dev->ifindex == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) { const struct rt6_info *rt6 = skb_rt6_info(skb); /* The destination could be an external IP in Ext Hdr (SRv6, RPL, etc.), * and ip6_null_entry could be set to skb if no route is found. */ if (rt6 && rt6->rt6i_idev) dev = rt6->rt6i_idev->dev; } return dev; } static int icmp6_iif(const struct sk_buff *skb) { return icmp6_dev(skb)->ifindex; } struct icmp6_ext_iio_addr6_subobj { __be16 afi; __be16 reserved; struct in6_addr addr6; }; static unsigned int icmp6_ext_iio_len(void) { return sizeof(struct icmp_extobj_hdr) + /* ifIndex */ sizeof(__be32) + /* Interface Address Sub-Object */ sizeof(struct icmp6_ext_iio_addr6_subobj) + /* Interface Name Sub-Object. Length must be a multiple of 4 * bytes. */ ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) + /* MTU */ sizeof(__be32); } static unsigned int icmp6_ext_max_len(u8 ext_objs) { unsigned int ext_max_len; ext_max_len = sizeof(struct icmp_ext_hdr); if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) ext_max_len += icmp6_ext_iio_len(); return ext_max_len; } static struct in6_addr *icmp6_ext_iio_addr6_find(const struct net_device *dev) { struct inet6_dev *in6_dev; struct inet6_ifaddr *ifa; in6_dev = __in6_dev_get(dev); if (!in6_dev) return NULL; /* It is unclear from RFC 5837 which IP address should be chosen, but * it makes sense to choose a global unicast address. */ list_for_each_entry_rcu(ifa, &in6_dev->addr_list, if_list) { if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DADFAILED)) continue; if (ipv6_addr_type(&ifa->addr) != IPV6_ADDR_UNICAST || ipv6_addr_src_scope(&ifa->addr) != IPV6_ADDR_SCOPE_GLOBAL) continue; return &ifa->addr; } return NULL; } static void icmp6_ext_iio_iif_append(struct net *net, struct sk_buff *skb, int iif) { struct icmp_ext_iio_name_subobj *name_subobj; struct icmp_extobj_hdr *objh; struct net_device *dev; struct in6_addr *addr6; __be32 data; if (!iif) return; /* Add the fields in the order specified by RFC 5837. */ objh = skb_put(skb, sizeof(*objh)); objh->class_num = ICMP_EXT_OBJ_CLASS_IIO; objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF); data = htonl(iif); skb_put_data(skb, &data, sizeof(__be32)); objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX; rcu_read_lock(); dev = dev_get_by_index_rcu(net, iif); if (!dev) goto out; addr6 = icmp6_ext_iio_addr6_find(dev); if (addr6) { struct icmp6_ext_iio_addr6_subobj *addr6_subobj; addr6_subobj = skb_put_zero(skb, sizeof(*addr6_subobj)); addr6_subobj->afi = htons(ICMP_AFI_IP6); addr6_subobj->addr6 = *addr6; objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR; } name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4)); name_subobj->len = ALIGN(sizeof(*name_subobj), 4); netdev_copy_name(dev, name_subobj->name); objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME; data = htonl(READ_ONCE(dev->mtu)); skb_put_data(skb, &data, sizeof(__be32)); objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU; out: rcu_read_unlock(); objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh); } static void icmp6_ext_objs_append(struct net *net, struct sk_buff *skb, u8 ext_objs, int iif) { if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) icmp6_ext_iio_iif_append(net, skb, iif); } static struct sk_buff * icmp6_ext_append(struct net *net, struct sk_buff *skb_in, struct icmp6hdr *icmp6h, unsigned int room, int iif) { unsigned int payload_len, ext_max_len, ext_len; struct icmp_ext_hdr *ext_hdr; struct sk_buff *skb; u8 ext_objs; int nhoff; switch (icmp6h->icmp6_type) { case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: break; default: return NULL; } /* Do not overwrite existing extensions. This can happen when we * receive an ICMPv4 message with extensions from a tunnel and * translate it to an ICMPv6 message towards an IPv6 host in the * overlay network. */ if (icmp6h->icmp6_datagram_len) return NULL; ext_objs = READ_ONCE(net->ipv6.sysctl.icmpv6_errors_extension_mask); if (!ext_objs) return NULL; ext_max_len = icmp6_ext_max_len(ext_objs); if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room) return NULL; skb = skb_clone(skb_in, GFP_ATOMIC); if (!skb) return NULL; nhoff = skb_network_offset(skb); payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN); if (!pskb_network_may_pull(skb, payload_len)) goto free_skb; if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) || __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false)) goto free_skb; if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC)) goto free_skb; ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr)); ext_hdr->version = ICMP_EXT_VERSION_2; icmp6_ext_objs_append(net, skb, ext_objs, iif); /* Do not send an empty extension structure. */ ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr; if (ext_len == sizeof(*ext_hdr)) goto free_skb; ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len); /* The length of the original datagram in 64-bit words (RFC 4884). */ icmp6h->icmp6_datagram_len = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u64); return skb; free_skb: consume_skb(skb); return NULL; } /* * Send an ICMP message in response to a packet in error */ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm) { struct inet6_dev *idev = NULL; struct ipv6hdr *hdr = ipv6_hdr(skb); struct sock *sk; struct net *net; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; bool apply_ratelimit = false; struct sk_buff *ext_skb; struct dst_entry *dst; unsigned int room; struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; struct ipcm6_cookie ipc6; int iif = 0; int addr_type = 0; int len; u32 mark; if ((u8 *)hdr < skb->head || (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb)) return; if (!skb->dev) return; rcu_read_lock(); net = dev_net_rcu(skb->dev); mark = IP6_REPLY_MARK(net, skb->mark); /* * Make sure we respect the rules * i.e. RFC 1885 2.4(e) * Rule (e.1) is enforced by not using icmp6_send * in any code that processes icmp errors. */ addr_type = ipv6_addr_type(&hdr->daddr); if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0) || ipv6_chk_acast_addr_src(net, skb->dev, &hdr->daddr)) saddr = &hdr->daddr; /* * Dest addr check */ if (addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST) { if (type != ICMPV6_PKT_TOOBIG && !(type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_OPTION && (opt_unrec(skb, info)))) goto out; saddr = NULL; } addr_type = ipv6_addr_type(&hdr->saddr); /* * Source addr check */ if (__ipv6_addr_needs_scope_id(addr_type)) { iif = icmp6_iif(skb); } else { /* * The source device is used for looking up which routing table * to use for sending an ICMP error. */ iif = l3mdev_master_ifindex(skb->dev); } /* * Must not send error if the source does not uniquely * identify a single node (RFC2463 Section 2.4). * We check unspecified / multicast addresses here, * and anycast addresses will be checked later. */ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); goto out; } /* * Never answer to a ICMP packet. */ if (is_ineligible(skb)) { net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); goto out; } /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit */ if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type, &apply_ratelimit)) goto out_bh_enable; mip6_addr_swap(skb, parm); sk = icmpv6_xmit_lock(net); if (!sk) goto out_bh_enable; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = hdr->saddr; if (force_saddr) saddr = force_saddr; if (saddr) { fl6.saddr = *saddr; } else if (!icmpv6_rt_has_prefsrc(sk, type, &fl6)) { /* select a more meaningful saddr from input if */ struct net_device *in_netdev; in_netdev = dev_get_by_index(net, parm->iif); if (in_netdev) { ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr, inet6_sk(sk)->srcprefs, &fl6.saddr); dev_put(in_netdev); } } fl6.flowi6_mark = mark; fl6.flowi6_oif = iif; fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); np = inet6_sk(sk); if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit)) goto out_unlock; tmp_hdr.icmp6_type = type; tmp_hdr.icmp6_code = code; tmp_hdr.icmp6_cksum = 0; tmp_hdr.icmp6_pointer = htonl(info); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); ipcm6_init_sk(&ipc6, sk); ipc6.sockc.mark = mark; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) goto out_unlock; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); msg.skb = skb; msg.offset = skb_network_offset(skb); msg.type = type; room = IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr); ext_skb = icmp6_ext_append(net, skb, &tmp_hdr, room, parm->iif); if (ext_skb) msg.skb = ext_skb; len = msg.skb->len - msg.offset; len = min_t(unsigned int, len, room); if (len < 0) { net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); goto out_dst_release; } idev = __in6_dev_get(skb->dev); if (ip6_append_data(sk, icmpv6_getfrag, &msg, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, dst_rt6_info(dst), MSG_DONTWAIT)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, len + sizeof(struct icmp6hdr)); } out_dst_release: if (ext_skb) consume_skb(ext_skb); dst_release(dst); out_unlock: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); out: rcu_read_unlock(); } EXPORT_SYMBOL(icmp6_send); /* Slightly more convenient version of icmp6_send with drop reasons. */ void icmpv6_param_prob_reason(struct sk_buff *skb, u8 code, int pos, enum skb_drop_reason reason) { icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb)); kfree_skb_reason(skb, reason); } /* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH * if sufficient data bytes are available * @nhs is the size of the tunnel header(s) : * Either an IPv4 header for SIT encap * an IPv4 header + GRE header for GRE encap */ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, unsigned int data_len) { struct in6_addr temp_saddr; struct rt6_info *rt; struct sk_buff *skb2; u32 info = 0; if (!pskb_may_pull(skb, nhs + sizeof(struct ipv6hdr) + 8)) return 1; /* RFC 4884 (partial) support for ICMP extensions */ if (data_len < 128 || (data_len & 7) || skb->len < data_len) data_len = 0; skb2 = data_len ? skb_copy(skb, GFP_ATOMIC) : skb_clone(skb, GFP_ATOMIC); if (!skb2) return 1; skb_dst_drop(skb2); skb_pull(skb2, nhs); skb_reset_network_header(skb2); rt = rt6_lookup(dev_net_rcu(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, skb, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &temp_saddr); if (data_len) { /* RFC 4884 (partial) support : * insert 0 padding at the end, before the extensions */ __skb_push(skb2, nhs); skb_reset_network_header(skb2); memmove(skb2->data, skb2->data + nhs, data_len - nhs); memset(skb2->data + data_len - nhs, 0, nhs); /* RFC 4884 4.5 : Length is measured in 64-bit words, * and stored in reserved[0] */ info = (data_len/8) << 24; } if (type == ICMP_TIME_EXCEEDED) icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, info, &temp_saddr, IP6CB(skb2)); else icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, info, &temp_saddr, IP6CB(skb2)); if (rt) ip6_rt_put(rt); kfree_skb(skb2); return 0; } EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) { struct net *net = dev_net_rcu(skb->dev); struct sock *sk; struct inet6_dev *idev; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; struct icmp6hdr *icmph = icmp6_hdr(skb); bool apply_ratelimit = false; struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; struct dst_entry *dst; struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); SKB_DR(reason); bool acast; u8 type; if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) && net->ipv6.sysctl.icmpv6_echo_ignore_multicast) return reason; saddr = &ipv6_hdr(skb)->daddr; acast = ipv6_anycast_destination(skb_dst(skb), saddr); if (acast && net->ipv6.sysctl.icmpv6_echo_ignore_anycast) return reason; if (!ipv6_unicast_destination(skb) && !(net->ipv6.sysctl.anycast_src_echo_reply && acast)) saddr = NULL; if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST) type = ICMPV6_EXT_ECHO_REPLY; else type = ICMPV6_ECHO_REPLY; memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); tmp_hdr.icmp6_type = type; memset(&fl6, 0, sizeof(fl6)); if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES) fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb)); fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) fl6.saddr = *saddr; fl6.flowi6_oif = icmp6_iif(skb); fl6.fl6_icmp_type = type; fl6.flowi6_mark = mark; fl6.flowi6_uid = sock_net_uid(net, NULL); security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); local_bh_disable(); sk = icmpv6_xmit_lock(net); if (!sk) goto out_bh_enable; np = inet6_sk(sk); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); if (ip6_dst_lookup(net, sk, &dst, &fl6)) goto out; dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) goto out; /* Check the ratelimit */ if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY, &apply_ratelimit)) || !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6, apply_ratelimit)) goto out_dst_release; idev = __in6_dev_get(skb->dev); msg.skb = skb; msg.offset = 0; msg.type = type; ipcm6_init_sk(&ipc6, sk); ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); ipc6.sockc.mark = mark; if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST) if (!icmp_build_probe(skb, (struct icmphdr *)&tmp_hdr)) goto out_dst_release; if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, dst_rt6_info(dst), MSG_DONTWAIT)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); reason = SKB_CONSUMED; } out_dst_release: dst_release(dst); out: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); return reason; } enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) { struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net_rcu(skb->dev); const struct inet6_protocol *ipprot; enum skb_drop_reason reason; int inner_offset; __be16 frag_off; u8 nexthdr; reason = pskb_may_pull_reason(skb, sizeof(struct ipv6hdr)); if (reason != SKB_NOT_DROPPED_YET) goto out; seg6_icmp_srh(skb, opt); nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr; if (ipv6_ext_hdr(nexthdr)) { /* now skip over extension headers */ inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (inner_offset < 0) { SKB_DR_SET(reason, IPV6_BAD_EXTHDR); goto out; } } else { inner_offset = sizeof(struct ipv6hdr); } /* Checkin header including 8 bytes of inner protocol header. */ reason = pskb_may_pull_reason(skb, inner_offset + 8); if (reason != SKB_NOT_DROPPED_YET) goto out; /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. Without this we will not able f.e. to make source routed pmtu discovery. Corresponding argument (opt) to notifiers is already added. --ANK (980726) */ ipprot = rcu_dereference(inet6_protos[nexthdr]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, opt, type, code, inner_offset, info); raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info); return SKB_CONSUMED; out: __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return reason; } /* * Handle icmp messages */ static int icmpv6_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct net *net = dev_net_rcu(skb->dev); struct net_device *dev = icmp6_dev(skb); struct inet6_dev *idev = __in6_dev_get(dev); const struct in6_addr *saddr, *daddr; struct icmp6hdr *hdr; u8 type; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); int nh; if (!(sp && sp->xvec[sp->len - 1]->props.flags & XFRM_STATE_ICMP)) { reason = SKB_DROP_REASON_XFRM_POLICY; goto drop_no_count; } if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(struct ipv6hdr))) goto drop_no_count; nh = skb_network_offset(skb); skb_set_network_header(skb, sizeof(*hdr)); if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN, skb)) { reason = SKB_DROP_REASON_XFRM_POLICY; goto drop_no_count; } skb_set_network_header(skb, nh); } __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS); saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) { net_dbg_ratelimited("ICMPv6 checksum failed [%pI6c > %pI6c]\n", saddr, daddr); goto csum_error; } if (!pskb_pull(skb, sizeof(*hdr))) goto discard_it; hdr = icmp6_hdr(skb); type = hdr->icmp6_type; ICMP6MSGIN_INC_STATS(dev_net_rcu(dev), idev, type); switch (type) { case ICMPV6_ECHO_REQUEST: if (!net->ipv6.sysctl.icmpv6_echo_ignore_all) reason = icmpv6_echo_reply(skb); break; case ICMPV6_EXT_ECHO_REQUEST: if (!net->ipv6.sysctl.icmpv6_echo_ignore_all && READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe)) reason = icmpv6_echo_reply(skb); break; case ICMPV6_ECHO_REPLY: case ICMPV6_EXT_ECHO_REPLY: ping_rcv(skb); return 0; case ICMPV6_PKT_TOOBIG: /* BUGGG_FUTURE: if packet contains rthdr, we cannot update standard destination cache. Seems, only "advanced" destination cache will allow to solve this problem --ANK (980726) */ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto discard_it; hdr = icmp6_hdr(skb); /* to notify */ fallthrough; case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: reason = icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); break; case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: reason = ndisc_rcv(skb); break; case ICMPV6_MGM_QUERY: igmp6_event_query(skb); return 0; case ICMPV6_MGM_REPORT: igmp6_event_report(skb); return 0; case ICMPV6_MGM_REDUCTION: case ICMPV6_NI_QUERY: case ICMPV6_NI_REPLY: case ICMPV6_MLD2_REPORT: case ICMPV6_DHAAD_REQUEST: case ICMPV6_DHAAD_REPLY: case ICMPV6_MOBILE_PREFIX_SOL: case ICMPV6_MOBILE_PREFIX_ADV: break; default: /* informational */ if (type & ICMPV6_INFOMSG_MASK) break; net_dbg_ratelimited("icmpv6: msg of unknown type [%pI6c > %pI6c]\n", saddr, daddr); /* * error of unknown type. * must pass to upper level */ reason = icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); } /* until the v6 path can be better sorted assume failure and * preserve the status quo behaviour for the rest of the paths to here */ if (reason) kfree_skb_reason(skb, reason); else consume_skb(skb); return 0; csum_error: reason = SKB_DROP_REASON_ICMP_CSUM; __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_CSUMERRORS); discard_it: __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INERRORS); drop_no_count: kfree_skb_reason(skb, reason); return 0; } void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type, const struct in6_addr *saddr, const struct in6_addr *daddr, int oif) { memset(fl6, 0, sizeof(*fl6)); fl6->saddr = *saddr; fl6->daddr = *daddr; fl6->flowi6_proto = IPPROTO_ICMPV6; fl6->fl6_icmp_type = type; fl6->fl6_icmp_code = 0; fl6->flowi6_oif = oif; security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } int __init icmpv6_init(void) { struct sock *sk; int err, i; for_each_possible_cpu(i) { err = inet_ctl_sock_create(&sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, &init_net); if (err < 0) { pr_err("Failed to initialize the ICMP6 control socket (err %d)\n", err); return err; } per_cpu(ipv6_icmp_sk, i) = sk; /* Enough space for 2 64K ICMP packets, including * sk_buff struct overhead. */ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); } err = -EAGAIN; if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) goto fail; err = inet6_register_icmp_sender(icmp6_send); if (err) goto sender_reg_err; return 0; sender_reg_err: inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); fail: pr_err("Failed to register ICMP6 protocol\n"); return err; } void icmpv6_cleanup(void) { inet6_unregister_icmp_sender(icmp6_send); inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); } static const struct icmp6_err { int err; int fatal; } tab_unreach[] = { { /* NOROUTE */ .err = ENETUNREACH, .fatal = 0, }, { /* ADM_PROHIBITED */ .err = EACCES, .fatal = 1, }, { /* Was NOT_NEIGHBOUR, now reserved */ .err = EHOSTUNREACH, .fatal = 0, }, { /* ADDR_UNREACH */ .err = EHOSTUNREACH, .fatal = 0, }, { /* PORT_UNREACH */ .err = ECONNREFUSED, .fatal = 1, }, { /* POLICY_FAIL */ .err = EACCES, .fatal = 1, }, { /* REJECT_ROUTE */ .err = EACCES, .fatal = 1, }, }; int icmpv6_err_convert(u8 type, u8 code, int *err) { int fatal = 0; *err = EPROTO; switch (type) { case ICMPV6_DEST_UNREACH: fatal = 1; if (code < ARRAY_SIZE(tab_unreach)) { *err = tab_unreach[code].err; fatal = tab_unreach[code].fatal; } break; case ICMPV6_PKT_TOOBIG: *err = EMSGSIZE; break; case ICMPV6_PARAMPROB: *err = EPROTO; fatal = 1; break; case ICMPV6_TIME_EXCEED: *err = EHOSTUNREACH; break; } return fatal; } EXPORT_SYMBOL(icmpv6_err_convert); #ifdef CONFIG_SYSCTL static u32 icmpv6_errors_extension_mask_all = GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); static struct ctl_table ipv6_icmp_table_template[] = { { .procname = "ratelimit", .data = &init_net.ipv6.sysctl.icmpv6_time, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "echo_ignore_all", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_all, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "echo_ignore_multicast", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_multicast, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "echo_ignore_anycast", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_anycast, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ratemask", .data = &init_net.ipv6.sysctl.icmpv6_ratemask_ptr, .maxlen = ICMPV6_MSG_MAX + 1, .mode = 0644, .proc_handler = proc_do_large_bitmap, }, { .procname = "error_anycast_as_unicast", .data = &init_net.ipv6.sysctl.icmpv6_error_anycast_as_unicast, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "errors_extension_mask", .data = &init_net.ipv6.sysctl.icmpv6_errors_extension_mask, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &icmpv6_errors_extension_mask_all, }, }; struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) { struct ctl_table *table; table = kmemdup(ipv6_icmp_table_template, sizeof(ipv6_icmp_table_template), GFP_KERNEL); if (table) { table[0].data = &net->ipv6.sysctl.icmpv6_time; table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all; table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast; table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast; table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr; table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast; table[6].data = &net->ipv6.sysctl.icmpv6_errors_extension_mask; } return table; } size_t ipv6_icmp_sysctl_table_size(void) { return ARRAY_SIZE(ipv6_icmp_table_template); } #endif
295 296 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 /* * net/tipc/core.c: TIPC module code * * Copyright (c) 2003-2006, 2013, Ericsson AB * Copyright (c) 2005-2006, 2010-2013, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "name_table.h" #include "subscr.h" #include "bearer.h" #include "net.h" #include "socket.h" #include "bcast.h" #include "node.h" #include "crypto.h" #include <linux/module.h> /* configurable TIPC parameters */ unsigned int tipc_net_id __read_mostly; int sysctl_tipc_rmem[3] __read_mostly; /* min/default/max */ static int __net_init tipc_init_net(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); int err; tn->net_id = 4711; tn->node_addr = 0; tn->trial_addr = 0; tn->addr_trial_end = 0; tn->capabilities = TIPC_NODE_CAPABILITIES; INIT_WORK(&tn->work, tipc_net_finalize_work); memset(tn->node_id, 0, sizeof(tn->node_id)); memset(tn->node_id_string, 0, sizeof(tn->node_id_string)); tn->mon_threshold = TIPC_DEF_MON_THRESHOLD; get_random_bytes(&tn->random, sizeof(int)); INIT_LIST_HEAD(&tn->node_list); spin_lock_init(&tn->node_list_lock); #ifdef CONFIG_TIPC_CRYPTO err = tipc_crypto_start(&tn->crypto_tx, net, NULL); if (err) goto out_crypto; #endif err = tipc_sk_rht_init(net); if (err) goto out_sk_rht; err = tipc_nametbl_init(net); if (err) goto out_nametbl; err = tipc_bcast_init(net); if (err) goto out_bclink; err = tipc_attach_loopback(net); if (err) goto out_bclink; return 0; out_bclink: tipc_nametbl_stop(net); out_nametbl: tipc_sk_rht_destroy(net); out_sk_rht: #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_stop(&tn->crypto_tx); out_crypto: #endif return err; } static void __net_exit tipc_exit_net(struct net *net) { struct tipc_net *tn = tipc_net(net); tipc_detach_loopback(net); tipc_net_stop(net); /* Make sure the tipc_net_finalize_work() finished */ cancel_work_sync(&tn->work); tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_stop(&tipc_net(net)->crypto_tx); #endif while (atomic_read(&tn->wq_count)) cond_resched(); } static void __net_exit tipc_pernet_pre_exit(struct net *net) { tipc_node_pre_cleanup_net(net); } static struct pernet_operations tipc_pernet_pre_exit_ops = { .pre_exit = tipc_pernet_pre_exit, }; static struct pernet_operations tipc_net_ops = { .init = tipc_init_net, .exit = tipc_exit_net, .id = &tipc_net_id, .size = sizeof(struct tipc_net), }; static struct pernet_operations tipc_topsrv_net_ops = { .init = tipc_topsrv_init_net, .exit = tipc_topsrv_exit_net, }; static int __init tipc_init(void) { int err; pr_info("Activated (version " TIPC_MOD_VER ")\n"); sysctl_tipc_rmem[0] = RCVBUF_MIN; sysctl_tipc_rmem[1] = RCVBUF_DEF; sysctl_tipc_rmem[2] = RCVBUF_MAX; err = tipc_register_sysctl(); if (err) goto out_sysctl; err = register_pernet_device(&tipc_net_ops); if (err) goto out_pernet; err = tipc_socket_init(); if (err) goto out_socket; err = register_pernet_device(&tipc_topsrv_net_ops); if (err) goto out_pernet_topsrv; err = register_pernet_subsys(&tipc_pernet_pre_exit_ops); if (err) goto out_register_pernet_subsys; err = tipc_bearer_setup(); if (err) goto out_bearer; err = tipc_netlink_start(); if (err) goto out_netlink; err = tipc_netlink_compat_start(); if (err) goto out_netlink_compat; pr_info("Started in single node mode\n"); return 0; out_netlink_compat: tipc_netlink_stop(); out_netlink: tipc_bearer_cleanup(); out_bearer: unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); out_register_pernet_subsys: unregister_pernet_device(&tipc_topsrv_net_ops); out_pernet_topsrv: tipc_socket_stop(); out_socket: unregister_pernet_device(&tipc_net_ops); out_pernet: tipc_unregister_sysctl(); out_sysctl: pr_err("Unable to start in single node mode\n"); return err; } static void __exit tipc_exit(void) { tipc_netlink_compat_stop(); tipc_netlink_stop(); tipc_bearer_cleanup(); unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); unregister_pernet_device(&tipc_topsrv_net_ops); tipc_socket_stop(); unregister_pernet_device(&tipc_net_ops); tipc_unregister_sysctl(); pr_info("Deactivated\n"); } module_init(tipc_init); module_exit(tipc_exit); MODULE_DESCRIPTION("TIPC: Transparent Inter Process Communication"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(TIPC_MOD_VER);
2402 1428 8247 37 193 3707 3705 4478 4476 12007 12004 10797 6404 4467 2292 2929 2930 1510 4 6 6 3 1504 3 1505 1504 1510 1510 1148 2379 1130 1313 3 3 3 429 426 3 10419 3 10420 7023 11046 10417 11077 11074 31 11055 37 11044 1846 9977 10420 1635 9233 4 1692 42 1223 9 1186 2 2 39 1 32 1 2 5 2 1 4 427 4 429 429 4 6 764 15 12 37 3915 1844 9 1835 53 53 2148 190 4 126 26 6 39 92 111 28 47 143 128 31 8 43 171 77 91 97 9 19 12 38 21 35 2 6 8 27 4 2 5 7 1 2 7 5 3 7 2 7 5 7 5 9 2 3 2 39 56 5 20 4 2 3 27 272 3 96 22 40 45 45 2 3 11 50 2 23 2 46 108 14 2 26 8 64 13 13 53 53 53 4 1 3 105 105 108 105 3 53 52 1 53 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/bpf-cgroup.h> #include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> #include <linux/topology.h> #include <linux/ktime.h> #include <linux/sched.h> #include <linux/uidgid.h> #include <linux/filter.h> #include <linux/ctype.h> #include <linux/jiffies.h> #include <linux/pid_namespace.h> #include <linux/poison.h> #include <linux/proc_ns.h> #include <linux/sched/task.h> #include <linux/security.h> #include <linux/btf_ids.h> #include <linux/bpf_mem_alloc.h> #include <linux/kasan.h> #include <linux/bpf_verifier.h> #include <linux/uaccess.h> #include <linux/verification.h> #include <linux/task_work.h> #include <linux/irq_work.h> #include <linux/buildid.h> #include "../../lib/kstrtox.h" /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments * * Different map implementations will rely on rcu in map methods * lookup/update/delete, therefore eBPF programs must run under rcu lock * if program is allowed to access maps, so check rcu_read_lock_held() or * rcu_read_lock_trace_held() in all three functions. */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_update_elem(map, key, value, flags); } const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, .arg3_type = ARG_PTR_TO_MAP_VALUE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_delete_elem(map, key); } const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags) { return map->ops->map_push_elem(map, value, flags); } const struct bpf_func_proto bpf_map_push_elem_proto = { .func = bpf_map_push_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE, .arg3_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) { return map->ops->map_pop_elem(map, value); } const struct bpf_func_proto bpf_map_pop_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) { return map->ops->map_peek_elem(map, value); } const struct bpf_func_proto bpf_map_peek_elem_proto = { .func = bpf_map_peek_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = { .func = bpf_map_lookup_percpu_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, .arg3_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_get_prandom_u32_proto = { .func = bpf_user_rnd_u32, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_smp_processor_id) { return smp_processor_id(); } const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .func = bpf_get_smp_processor_id, .gpl_only = false, .ret_type = RET_INTEGER, .allow_fastcall = true, }; BPF_CALL_0(bpf_get_numa_node_id) { return numa_node_id(); } const struct bpf_func_proto bpf_get_numa_node_id_proto = { .func = bpf_get_numa_node_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_ns_proto = { .func = bpf_ktime_get_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_boot_ns) { /* NMI safe access to clock boottime */ return ktime_get_boot_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { .func = bpf_ktime_get_boot_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_coarse_ns) { return ktime_get_coarse_ns(); } const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { .func = bpf_ktime_get_coarse_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_tai_ns) { /* NMI safe access to clock tai */ return ktime_get_tai_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = { .func = bpf_ktime_get_tai_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; if (unlikely(!task)) return -EINVAL; return (u64) task->tgid << 32 | task->pid; } const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { .func = bpf_get_current_pid_tgid, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_current_uid_gid) { struct task_struct *task = current; kuid_t uid; kgid_t gid; if (unlikely(!task)) return -EINVAL; current_uid_gid(&uid, &gid); return (u64) from_kgid(&init_user_ns, gid) << 32 | from_kuid(&init_user_ns, uid); } const struct bpf_func_proto bpf_get_current_uid_gid_proto = { .func = bpf_get_current_uid_gid, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) { struct task_struct *task = current; if (unlikely(!task)) goto err_clear; /* Verifier guarantees that size > 0 */ strscpy_pad(buf, task->comm, size); return 0; err_clear: memset(buf, 0, size); return -EINVAL; } const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE, }; #if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) { arch_spinlock_t *l = (void *)lock; union { __u32 val; arch_spinlock_t lock; } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); preempt_disable(); arch_spin_lock(l); } static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) { arch_spinlock_t *l = (void *)lock; arch_spin_unlock(l); preempt_enable(); } #else static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) { atomic_t *l = (void *)lock; BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); do { atomic_cond_read_relaxed(l, !VAL); } while (atomic_xchg(l, 1)); } static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) { atomic_t *l = (void *)lock; atomic_set_release(l, 0); } #endif static DEFINE_PER_CPU(unsigned long, irqsave_flags); static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock) { unsigned long flags; local_irq_save(flags); __bpf_spin_lock(lock); __this_cpu_write(irqsave_flags, flags); } NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) { __bpf_spin_lock_irqsave(lock); return 0; } const struct bpf_func_proto bpf_spin_lock_proto = { .func = bpf_spin_lock, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, .arg1_btf_id = BPF_PTR_POISON, }; static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) { unsigned long flags; flags = __this_cpu_read(irqsave_flags); __bpf_spin_unlock(lock); local_irq_restore(flags); } NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) { __bpf_spin_unlock_irqrestore(lock); return 0; } const struct bpf_func_proto bpf_spin_unlock_proto = { .func = bpf_spin_unlock, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, .arg1_btf_id = BPF_PTR_POISON, }; void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src) { struct bpf_spin_lock *lock; if (lock_src) lock = src + map->record->spin_lock_off; else lock = dst + map->record->spin_lock_off; preempt_disable(); __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); __bpf_spin_unlock_irqrestore(lock); preempt_enable(); } BPF_CALL_0(bpf_jiffies64) { return get_jiffies_64(); } const struct bpf_func_proto bpf_jiffies64_proto = { .func = bpf_jiffies64, .gpl_only = false, .ret_type = RET_INTEGER, }; #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp; u64 cgrp_id; rcu_read_lock(); cgrp = task_dfl_cgroup(current); cgrp_id = cgroup_id(cgrp); rcu_read_unlock(); return cgrp_id; } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .func = bpf_get_current_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level) { struct cgroup *cgrp; struct cgroup *ancestor; u64 cgrp_id; rcu_read_lock(); cgrp = task_dfl_cgroup(current); ancestor = cgroup_ancestor(cgrp, ancestor_level); cgrp_id = ancestor ? cgroup_id(ancestor) : 0; rcu_read_unlock(); return cgrp_id; } const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { .func = bpf_get_current_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; #endif /* CONFIG_CGROUPS */ #define BPF_STRTOX_BASE_MASK 0x1F static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, unsigned long long *res, bool *is_negative) { unsigned int base = flags & BPF_STRTOX_BASE_MASK; const char *cur_buf = buf; size_t cur_len = buf_len; unsigned int consumed; size_t val_len; char str[64]; if (!buf || !buf_len || !res || !is_negative) return -EINVAL; if (base != 0 && base != 8 && base != 10 && base != 16) return -EINVAL; if (flags & ~BPF_STRTOX_BASE_MASK) return -EINVAL; while (cur_buf < buf + buf_len && isspace(*cur_buf)) ++cur_buf; *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); if (*is_negative) ++cur_buf; consumed = cur_buf - buf; cur_len -= consumed; if (!cur_len) return -EINVAL; cur_len = min(cur_len, sizeof(str) - 1); memcpy(str, cur_buf, cur_len); str[cur_len] = '\0'; cur_buf = str; cur_buf = _parse_integer_fixup_radix(cur_buf, &base); val_len = _parse_integer(cur_buf, base, res); if (val_len & KSTRTOX_OVERFLOW) return -ERANGE; if (val_len == 0) return -EINVAL; cur_buf += val_len; consumed += cur_buf - str; return consumed; } static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, long long *res) { unsigned long long _res; bool is_negative; int err; err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); if (err < 0) return err; if (is_negative) { if ((long long)-_res > 0) return -ERANGE; *res = -_res; } else { if ((long long)_res < 0) return -ERANGE; *res = _res; } return err; } BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, s64 *, res) { long long _res; int err; *res = 0; err = __bpf_strtoll(buf, buf_len, flags, &_res); if (err < 0) return err; *res = _res; return err; } const struct bpf_func_proto bpf_strtol_proto = { .func = bpf_strtol, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(s64), }; BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, u64 *, res) { unsigned long long _res; bool is_negative; int err; *res = 0; err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); if (err < 0) return err; if (is_negative) return -EINVAL; *res = _res; return err; } const struct bpf_func_proto bpf_strtoul_proto = { .func = bpf_strtoul, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) { return strncmp(s1, s2, s1_sz); } static const struct bpf_func_proto bpf_strncmp_proto = { .func = bpf_strncmp, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_CONST_STR, }; BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, struct bpf_pidns_info *, nsdata, u32, size) { struct task_struct *task = current; struct pid_namespace *pidns; int err = -EINVAL; if (unlikely(size != sizeof(struct bpf_pidns_info))) goto clear; if (unlikely((u64)(dev_t)dev != dev)) goto clear; if (unlikely(!task)) goto clear; pidns = task_active_pid_ns(task); if (unlikely(!pidns)) { err = -ENOENT; goto clear; } if (!ns_match(&pidns->ns, (dev_t)dev, ino)) goto clear; nsdata->pid = task_pid_nr_ns(task, pidns); nsdata->tgid = task_tgid_nr_ns(task, pidns); return 0; clear: memset((void *)nsdata, 0, (size_t) size); return err; } const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .func = bpf_get_ns_current_pid_tgid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { .func = bpf_get_raw_cpu_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, void *, data, u64, size) { if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; return bpf_event_output(map, flags, data, size, NULL, 0, NULL); } const struct bpf_func_proto bpf_event_output_data_proto = { .func = bpf_event_output_data, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, const void __user *, user_ptr) { int ret = copy_from_user(dst, user_ptr, size); if (unlikely(ret)) { memset(dst, 0, size); ret = -EFAULT; } return ret; } const struct bpf_func_proto bpf_copy_from_user_proto = { .func = bpf_copy_from_user, .gpl_only = false, .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, const void __user *, user_ptr, struct task_struct *, tsk, u64, flags) { int ret; /* flags is not used yet */ if (unlikely(flags)) return -EINVAL; if (unlikely(!size)) return 0; ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0); if (ret == size) return 0; memset(dst, 0, size); /* Return -EFAULT for partial read */ return ret < 0 ? ret : -EFAULT; } const struct bpf_func_proto bpf_copy_from_user_task_proto = { .func = bpf_copy_from_user_task, .gpl_only = true, .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_BTF_ID, .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg5_type = ARG_ANYTHING }; BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) { if (cpu >= nr_cpu_ids) return (unsigned long)NULL; return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu); } const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) { return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr); } const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .func = bpf_this_cpu_ptr, .gpl_only = false, .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, size_t bufsz) { void __user *user_ptr = (__force void __user *)unsafe_ptr; buf[0] = 0; switch (fmt_ptype) { case 's': #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if ((unsigned long)unsafe_ptr < TASK_SIZE) return strncpy_from_user_nofault(buf, user_ptr, bufsz); fallthrough; #endif case 'k': return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); case 'u': return strncpy_from_user_nofault(buf, user_ptr, bufsz); } return -EINVAL; } /* Support executing three nested bprintf helper calls on a given CPU */ #define MAX_BPRINTF_NEST_LEVEL 3 static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; preempt_disable(); nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); return -EBUSY; } *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]); return 0; } void bpf_put_buffers(void) { if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); } void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) { if (!data->bin_args && !data->buf) return; bpf_put_buffers(); } /* * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers * * Returns a negative value if fmt is an invalid format string or 0 otherwise. * * This can be used in two ways: * - Format string verification only: when data->get_bin_args is false * - Arguments preparation: in addition to the above verification, it writes in * data->bin_args a binary representation of arguments usable by bstr_printf * where pointers from BPF have been sanitized. * * In argument preparation mode, if 0 is returned, safe temporary buffers are * allocated and bpf_bprintf_cleanup should be called to free them after use. */ int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data) { bool get_buffers = (data->get_bin_args && num_args) || data->get_buf; char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end; struct bpf_bprintf_buffers *buffers = NULL; size_t sizeof_cur_arg, sizeof_cur_ip; int err, i, num_spec = 0; u64 cur_arg; char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX"; fmt_end = strnchr(fmt, fmt_size, 0); if (!fmt_end) return -EINVAL; fmt_size = fmt_end - fmt; if (get_buffers && bpf_try_get_buffers(&buffers)) return -EBUSY; if (data->get_bin_args) { if (num_args) tmp_buf = buffers->bin_args; tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS; data->bin_args = (u32 *)tmp_buf; } if (data->get_buf) data->buf = buffers->buf; for (i = 0; i < fmt_size; i++) { if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { err = -EINVAL; goto out; } if (fmt[i] != '%') continue; if (fmt[i + 1] == '%') { i++; continue; } if (num_spec >= num_args) { err = -EINVAL; goto out; } /* The string is zero-terminated so if fmt[i] != 0, we can * always access fmt[i + 1], in the worst case it will be a 0 */ i++; /* skip optional "[0 +-][num]" width formatting field */ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || fmt[i] == ' ') i++; if (fmt[i] >= '1' && fmt[i] <= '9') { i++; while (fmt[i] >= '0' && fmt[i] <= '9') i++; } if (fmt[i] == 'p') { sizeof_cur_arg = sizeof(long); if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || ispunct(fmt[i + 1])) { if (tmp_buf) cur_arg = raw_args[num_spec]; goto nocopy_fmt; } if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && fmt[i + 2] == 's') { fmt_ptype = fmt[i + 1]; i += 2; goto fmt_str; } if (fmt[i + 1] == 'K' || fmt[i + 1] == 'x' || fmt[i + 1] == 's' || fmt[i + 1] == 'S') { if (tmp_buf) cur_arg = raw_args[num_spec]; i++; goto nocopy_fmt; } if (fmt[i + 1] == 'B') { if (tmp_buf) { err = snprintf(tmp_buf, (tmp_buf_end - tmp_buf), "%pB", (void *)(long)raw_args[num_spec]); tmp_buf += (err + 1); } i++; num_spec++; continue; } /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') || (fmt[i + 2] != '4' && fmt[i + 2] != '6')) { err = -EINVAL; goto out; } i += 2; if (!tmp_buf) goto nocopy_fmt; sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16; if (tmp_buf_end - tmp_buf < sizeof_cur_ip) { err = -ENOSPC; goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; err = copy_from_kernel_nofault(cur_ip, unsafe_ptr, sizeof_cur_ip); if (err < 0) memset(cur_ip, 0, sizeof_cur_ip); /* hack: bstr_printf expects IP addresses to be * pre-formatted as strings, ironically, the easiest way * to do that is to call snprintf. */ ip_spec[2] = fmt[i - 1]; ip_spec[3] = fmt[i]; err = snprintf(tmp_buf, tmp_buf_end - tmp_buf, ip_spec, &cur_ip); tmp_buf += err + 1; num_spec++; continue; } else if (fmt[i] == 's') { fmt_ptype = fmt[i]; fmt_str: if (fmt[i + 1] != 0 && !isspace(fmt[i + 1]) && !ispunct(fmt[i + 1])) { err = -EINVAL; goto out; } if (!tmp_buf) goto nocopy_fmt; if (tmp_buf_end == tmp_buf) { err = -ENOSPC; goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; err = bpf_trace_copy_string(tmp_buf, unsafe_ptr, fmt_ptype, tmp_buf_end - tmp_buf); if (err < 0) { tmp_buf[0] = '\0'; err = 1; } tmp_buf += err; num_spec++; continue; } else if (fmt[i] == 'c') { if (!tmp_buf) goto nocopy_fmt; if (tmp_buf_end == tmp_buf) { err = -ENOSPC; goto out; } *tmp_buf = raw_args[num_spec]; tmp_buf++; num_spec++; continue; } sizeof_cur_arg = sizeof(int); if (fmt[i] == 'l') { sizeof_cur_arg = sizeof(long); i++; } if (fmt[i] == 'l') { sizeof_cur_arg = sizeof(long long); i++; } if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x' && fmt[i] != 'X') { err = -EINVAL; goto out; } if (tmp_buf) cur_arg = raw_args[num_spec]; nocopy_fmt: if (tmp_buf) { tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32)); if (tmp_buf_end - tmp_buf < sizeof_cur_arg) { err = -ENOSPC; goto out; } if (sizeof_cur_arg == 8) { *(u32 *)tmp_buf = *(u32 *)&cur_arg; *(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1); } else { *(u32 *)tmp_buf = (u32)(long)cur_arg; } tmp_buf += sizeof_cur_arg; } num_spec++; } err = 0; out: if (err) bpf_bprintf_cleanup(data); return err; } BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, const void *, args, u32, data_len) { struct bpf_bprintf_data data = { .get_bin_args = true, }; int err, num_args; if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 || (data_len && !args)) return -EINVAL; num_args = data_len / 8; /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we * can safely give an unbounded size. */ err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data); if (err < 0) return err; err = bstr_printf(str, str_size, fmt, data.bin_args); bpf_bprintf_cleanup(&data); return err + 1; } const struct bpf_func_proto bpf_snprintf_proto = { .func = bpf_snprintf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM_OR_NULL, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_CONST_STR, .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) { if (map->map_type == BPF_MAP_TYPE_ARRAY) { struct bpf_array *array = container_of(map, struct bpf_array, map); *arr_idx = ((char *)value - array->value) / array->elem_size; return arr_idx; } return (void *)value - round_up(map->key_size, 8); } struct bpf_async_cb { struct bpf_map *map; struct bpf_prog *prog; void __rcu *callback_fn; void *value; union { struct rcu_head rcu; struct work_struct delete_work; }; u64 flags; }; /* BPF map elements can contain 'struct bpf_timer'. * Such map owns all of its BPF timers. * 'struct bpf_timer' is allocated as part of map element allocation * and it's zero initialized. * That space is used to keep 'struct bpf_async_kern'. * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and * remembers 'struct bpf_map *' pointer it's part of. * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. * bpf_timer_start() arms the timer. * If user space reference to a map goes to zero at this point * ops->map_release_uref callback is responsible for cancelling the timers, * freeing their memory, and decrementing prog's refcnts. * bpf_timer_cancel() cancels the timer and decrements prog's refcnt. * Inner maps can contain bpf timers as well. ops->map_release_uref is * freeing the timers when inner map is replaced or deleted by user space. */ struct bpf_hrtimer { struct bpf_async_cb cb; struct hrtimer timer; atomic_t cancelling; }; struct bpf_work { struct bpf_async_cb cb; struct work_struct work; struct work_struct delete_work; }; /* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */ struct bpf_async_kern { union { struct bpf_async_cb *cb; struct bpf_hrtimer *timer; struct bpf_work *work; }; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer * regardless of LOCKDEP and spinlock debug flags. */ struct bpf_spin_lock lock; } __attribute__((aligned(8))); enum bpf_async_type { BPF_ASYNC_TYPE_TIMER = 0, BPF_ASYNC_TYPE_WQ, }; static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) { struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); struct bpf_map *map = t->cb.map; void *value = t->cb.value; bpf_callback_t callback_fn; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_timer); callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and * cannot be preempted by another bpf_timer_cb() on the same cpu. * Remember the timer this callback is servicing to prevent * deadlock if callback_fn() calls bpf_timer_cancel() or * bpf_map_delete_elem() on the same timer. */ this_cpu_write(hrtimer_running, t); key = map_key_from_value(map, value, &idx); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); /* The verifier checked that return value is zero. */ this_cpu_write(hrtimer_running, NULL); out: return HRTIMER_NORESTART; } static void bpf_wq_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, work); struct bpf_async_cb *cb = &w->cb; struct bpf_map *map = cb->map; bpf_callback_t callback_fn; void *value = cb->value; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_wq); callback_fn = READ_ONCE(cb->callback_fn); if (!callback_fn) return; key = map_key_from_value(map, value, &idx); rcu_read_lock_trace(); migrate_disable(); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); migrate_enable(); rcu_read_unlock_trace(); } static void bpf_async_cb_rcu_free(struct rcu_head *rcu) { struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); kfree_nolock(cb); } static void bpf_wq_delete_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, delete_work); cancel_work_sync(&w->work); call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free); } static void bpf_timer_delete_work(struct work_struct *work) { struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work); /* Cancel the timer and wait for callback to complete if it was running. * If hrtimer_cancel() can be safely called it's safe to call * call_rcu() right after for both preallocated and non-preallocated * maps. The async->cb = NULL was already done and no code path can see * address 't' anymore. Timer if armed for existing bpf_hrtimer before * bpf_timer_cancel_and_free will have been cancelled. */ hrtimer_cancel(&t->timer); call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); } static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { struct bpf_async_cb *cb; struct bpf_hrtimer *t; struct bpf_work *w; clockid_t clockid; size_t size; int ret = 0; if (in_nmi()) return -EOPNOTSUPP; switch (type) { case BPF_ASYNC_TYPE_TIMER: size = sizeof(struct bpf_hrtimer); break; case BPF_ASYNC_TYPE_WQ: size = sizeof(struct bpf_work); break; default: return -EINVAL; } __bpf_spin_lock_irqsave(&async->lock); t = async->timer; if (t) { ret = -EBUSY; goto out; } cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; } switch (type) { case BPF_ASYNC_TYPE_TIMER: clockid = flags & (MAX_CLOCKS - 1); t = (struct bpf_hrtimer *)cb; atomic_set(&t->cancelling, 0); INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT); cb->value = (void *)async - map->record->timer_off; break; case BPF_ASYNC_TYPE_WQ: w = (struct bpf_work *)cb; INIT_WORK(&w->work, bpf_wq_work); INIT_WORK(&w->delete_work, bpf_wq_delete_work); cb->value = (void *)async - map->record->wq_off; break; } cb->map = map; cb->prog = NULL; cb->flags = flags; rcu_assign_pointer(cb->callback_fn, NULL); WRITE_ONCE(async->cb, cb); /* Guarantee the order between async->cb and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL * timer or atomic64_read() below returns a zero usercnt. */ smp_mb(); if (!atomic64_read(&map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. */ WRITE_ONCE(async->cb, NULL); kfree_nolock(cb); ret = -EPERM; } out: __bpf_spin_unlock_irqrestore(&async->lock); return ret; } BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, u64, flags) { clock_t clockid = flags & (MAX_CLOCKS - 1); BUILD_BUG_ON(MAX_CLOCKS != 16); BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); if (flags >= MAX_CLOCKS || /* similar to timerfd except _ALARM variants are not supported */ (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME && clockid != CLOCK_BOOTTIME)) return -EINVAL; return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER); } static const struct bpf_func_proto bpf_timer_init_proto = { .func = bpf_timer_init, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, struct bpf_prog_aux *aux, unsigned int flags, enum bpf_async_type type) { struct bpf_prog *prev, *prog = aux->prog; struct bpf_async_cb *cb; int ret = 0; if (in_nmi()) return -EOPNOTSUPP; __bpf_spin_lock_irqsave(&async->lock); cb = async->cb; if (!cb) { ret = -EINVAL; goto out; } if (!atomic64_read(&cb->map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. Otherwise timer might still be * running even when bpf prog is detached and user space * is gone, since map_release_uref won't ever be called. */ ret = -EPERM; goto out; } prev = cb->prog; if (prev != prog) { /* Bump prog refcnt once. Every bpf_timer_set_callback() * can pick different callback_fn-s within the same prog. */ prog = bpf_prog_inc_not_zero(prog); if (IS_ERR(prog)) { ret = PTR_ERR(prog); goto out; } if (prev) /* Drop prev prog refcnt when swapping with new prog */ bpf_prog_put(prev); cb->prog = prog; } rcu_assign_pointer(cb->callback_fn, callback_fn); out: __bpf_spin_unlock_irqrestore(&async->lock); return ret; } BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, struct bpf_prog_aux *, aux) { return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER); } static const struct bpf_func_proto bpf_timer_set_callback_proto = { .func = bpf_timer_set_callback, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_PTR_TO_FUNC, }; BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; int ret = 0; enum hrtimer_mode mode; if (in_nmi()) return -EOPNOTSUPP; if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN)) return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; if (!t || !t->cb.prog) { ret = -EINVAL; goto out; } if (flags & BPF_F_TIMER_ABS) mode = HRTIMER_MODE_ABS_SOFT; else mode = HRTIMER_MODE_REL_SOFT; if (flags & BPF_F_TIMER_CPU_PIN) mode |= HRTIMER_MODE_PINNED; hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); out: __bpf_spin_unlock_irqrestore(&timer->lock); return ret; } static const struct bpf_func_proto bpf_timer_start_proto = { .func = bpf_timer_start, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static void drop_prog_refcnt(struct bpf_async_cb *async) { struct bpf_prog *prog = async->prog; if (prog) { bpf_prog_put(prog); async->prog = NULL; rcu_assign_pointer(async->callback_fn, NULL); } } BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { struct bpf_hrtimer *t, *cur_t; bool inc = false; int ret = 0; if (in_nmi()) return -EOPNOTSUPP; rcu_read_lock(); __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; if (!t) { ret = -EINVAL; goto out; } cur_t = this_cpu_read(hrtimer_running); if (cur_t == t) { /* If bpf callback_fn is trying to bpf_timer_cancel() * its own timer the hrtimer_cancel() will deadlock * since it waits for callback_fn to finish. */ ret = -EDEADLK; goto out; } /* Only account in-flight cancellations when invoked from a timer * callback, since we want to avoid waiting only if other _callbacks_ * are waiting on us, to avoid introducing lockups. Non-callback paths * are ok, since nobody would synchronously wait for their completion. */ if (!cur_t) goto drop; atomic_inc(&t->cancelling); /* Need full barrier after relaxed atomic_inc */ smp_mb__after_atomic(); inc = true; if (atomic_read(&cur_t->cancelling)) { /* We're cancelling timer t, while some other timer callback is * attempting to cancel us. In such a case, it might be possible * that timer t belongs to the other callback, or some other * callback waiting upon it (creating transitive dependencies * upon us), and we will enter a deadlock if we continue * cancelling and waiting for it synchronously, since it might * do the same. Bail! */ ret = -EDEADLK; goto out; } drop: drop_prog_refcnt(&t->cb); out: __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish * if it was running. */ ret = ret ?: hrtimer_cancel(&t->timer); if (inc) atomic_dec(&t->cancelling); rcu_read_unlock(); return ret; } static const struct bpf_func_proto bpf_timer_cancel_proto = { .func = bpf_timer_cancel, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, }; static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async) { struct bpf_async_cb *cb; /* Performance optimization: read async->cb without lock first. */ if (!READ_ONCE(async->cb)) return NULL; __bpf_spin_lock_irqsave(&async->lock); /* re-read it under lock */ cb = async->cb; if (!cb) goto out; drop_prog_refcnt(cb); /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ WRITE_ONCE(async->cb, NULL); out: __bpf_spin_unlock_irqrestore(&async->lock); return cb; } /* This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ void bpf_timer_cancel_and_free(void *val) { struct bpf_hrtimer *t; t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); if (!t) return; /* We check that bpf_map_delete/update_elem() was called from timer * callback_fn. In such case we don't call hrtimer_cancel() (since it * will deadlock) and don't call hrtimer_try_to_cancel() (since it will * just return -1). Though callback_fn is still running on this cpu it's * safe to do kfree(t) because bpf_timer_cb() read everything it needed * from 't'. The bpf subprog callback_fn won't be able to access 't', * since async->cb = NULL was already done. The timer will be * effectively cancelled because bpf_timer_cb() will return * HRTIMER_NORESTART. * * However, it is possible the timer callback_fn calling us armed the * timer _before_ calling us, such that failing to cancel it here will * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. * Therefore, we _need_ to cancel any outstanding timers before we do * call_rcu, even though no more timers can be armed. * * Moreover, we need to schedule work even if timer does not belong to * the calling callback_fn, as on two different CPUs, we can end up in a * situation where both sides run in parallel, try to cancel one * another, and we end up waiting on both sides in hrtimer_cancel * without making forward progress, since timer1 depends on time2 * callback to finish, and vice versa. * * CPU 1 (timer1_cb) CPU 2 (timer2_cb) * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1) * * To avoid these issues, punt to workqueue context when we are in a * timer callback. */ if (this_cpu_read(hrtimer_running)) { queue_work(system_dfl_wq, &t->cb.delete_work); return; } if (IS_ENABLED(CONFIG_PREEMPT_RT)) { /* If the timer is running on other CPU, also use a kworker to * wait for the completion of the timer instead of trying to * acquire a sleepable lock in hrtimer_cancel() to wait for its * completion. */ if (hrtimer_try_to_cancel(&t->timer) >= 0) call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); else queue_work(system_dfl_wq, &t->cb.delete_work); } else { bpf_timer_delete_work(&t->cb.delete_work); } } /* This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ void bpf_wq_cancel_and_free(void *val) { struct bpf_work *work; BTF_TYPE_EMIT(struct bpf_wq); work = (struct bpf_work *)__bpf_async_cancel_and_free(val); if (!work) return; /* Trigger cancel of the sleepable work, but *do not* wait for * it to finish if it was running as we might not be in a * sleepable context. * kfree will be called once the work has finished. */ schedule_work(&work->delete_work); } BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr) { unsigned long *kptr = dst; /* This helper may be inlined by verifier. */ return xchg(kptr, (unsigned long)ptr); } /* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg() * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to * denote type that verifier will determine. */ static const struct bpf_func_proto bpf_kptr_xchg_proto = { .func = bpf_kptr_xchg, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .ret_btf_id = BPF_PTR_POISON, .arg1_type = ARG_KPTR_XCHG_DEST, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, .arg2_btf_id = BPF_PTR_POISON, }; struct bpf_dynptr_file_impl { struct freader freader; /* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */ u64 offset; u64 size; }; /* Since the upper 8 bits of dynptr->size is reserved, the * maximum supported size is 2^24 - 1. */ #define DYNPTR_MAX_SIZE ((1UL << 24) - 1) #define DYNPTR_TYPE_SHIFT 28 #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) { ptr->size |= DYNPTR_RDONLY_BIT; } static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) { ptr->size |= type << DYNPTR_TYPE_SHIFT; } static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr) { return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; } u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { struct bpf_dynptr_file_impl *df = ptr->data; return df->size; } return ptr->size & DYNPTR_SIZE_MASK; } static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off) { if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { struct bpf_dynptr_file_impl *df = ptr->data; df->offset += off; return; } ptr->offset += off; } static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size) { u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { struct bpf_dynptr_file_impl *df = ptr->data; df->size = new_size; return; } ptr->size = (u32)new_size | metadata; } int bpf_dynptr_check_size(u64 size) { return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; } static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len) { const void *ptr; if (!buf) return -EINVAL; df->freader.buf = buf; df->freader.buf_sz = len; ptr = freader_fetch(&df->freader, offset + df->offset, len); if (!ptr) return df->freader.err; if (ptr != buf) /* Force copying into the buffer */ memcpy(buf, ptr, len); return 0; } void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size) { ptr->data = data; ptr->offset = offset; ptr->size = size; bpf_dynptr_set_type(ptr, type); } void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) { memset(ptr, 0, sizeof(*ptr)); } BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; BTF_TYPE_EMIT(struct bpf_dynptr); err = bpf_dynptr_check_size(size); if (err) goto error; /* flags is currently unsupported */ if (flags) { err = -EINVAL; goto error; } bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size); return 0; error: bpf_dynptr_set_null(ptr); return err; } static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .func = bpf_dynptr_from_mem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src, u64 offset, u64 flags) { enum bpf_dynptr_type type; int err; if (!src->data || flags) return -EINVAL; err = bpf_dynptr_check_off_len(src, offset, len); if (err) return err; type = bpf_dynptr_get_type(src); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: /* Source and destination may possibly overlap, hence use memmove to * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr * pointing to overlapping PTR_TO_MAP_VALUE regions. */ memmove(dst, src->data + src->offset + offset, len); return 0; case BPF_DYNPTR_TYPE_SKB: return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_SKB_META: memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); return 0; case BPF_DYNPTR_TYPE_FILE: return bpf_file_fetch_bytes(src->data, offset, dst, len); default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; } } BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src, u64, offset, u64, flags) { return __bpf_dynptr_read(dst, len, src, offset, flags); } static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, u64 len, u64 flags) { enum bpf_dynptr_type type; int err; if (!dst->data || __bpf_dynptr_is_rdonly(dst)) return -EINVAL; err = bpf_dynptr_check_off_len(dst, offset, len); if (err) return err; type = bpf_dynptr_get_type(dst); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: if (flags) return -EINVAL; /* Source and destination may possibly overlap, hence use memmove to * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr * pointing to overlapping PTR_TO_MAP_VALUE regions. */ memmove(dst->data + dst->offset + offset, src, len); return 0; case BPF_DYNPTR_TYPE_SKB: return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len, flags); case BPF_DYNPTR_TYPE_XDP: if (flags) return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); case BPF_DYNPTR_TYPE_SKB_META: return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src, len, flags); default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; } } BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src, u64, len, u64, flags) { return __bpf_dynptr_write(dst, offset, src, len, flags); } static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len) { enum bpf_dynptr_type type; int err; if (!ptr->data) return 0; err = bpf_dynptr_check_off_len(ptr, offset, len); if (err) return 0; if (__bpf_dynptr_is_rdonly(ptr)) return 0; type = bpf_dynptr_get_type(ptr); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: return (unsigned long)(ptr->data + ptr->offset + offset); case BPF_DYNPTR_TYPE_SKB: case BPF_DYNPTR_TYPE_XDP: case BPF_DYNPTR_TYPE_SKB_META: /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ return 0; default: WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type); return 0; } } static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto bpf_perf_event_read_proto __weak; const struct bpf_func_proto bpf_send_signal_proto __weak; const struct bpf_func_proto bpf_send_signal_thread_proto __weak; const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak; const struct bpf_func_proto bpf_get_task_stack_proto __weak; const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: return &bpf_map_lookup_elem_proto; case BPF_FUNC_map_update_elem: return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; case BPF_FUNC_map_push_elem: return &bpf_map_push_elem_proto; case BPF_FUNC_map_pop_elem: return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; case BPF_FUNC_map_lookup_percpu_elem: return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_raw_smp_processor_id_proto; case BPF_FUNC_get_numa_node_id: return &bpf_get_numa_node_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_ktime_get_tai_ns: return &bpf_ktime_get_tai_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: return &bpf_ringbuf_reserve_proto; case BPF_FUNC_ringbuf_submit: return &bpf_ringbuf_submit_proto; case BPF_FUNC_ringbuf_discard: return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; case BPF_FUNC_strtol: return &bpf_strtol_proto; case BPF_FUNC_strtoul: return &bpf_strtoul_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; default: break; } if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return NULL; switch (func_id) { case BPF_FUNC_spin_lock: return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; case BPF_FUNC_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; case BPF_FUNC_timer_init: return &bpf_timer_init_proto; case BPF_FUNC_timer_set_callback: return &bpf_timer_set_callback_proto; case BPF_FUNC_timer_start: return &bpf_timer_start_proto; case BPF_FUNC_timer_cancel: return &bpf_timer_cancel_proto; case BPF_FUNC_kptr_xchg: return &bpf_kptr_xchg_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; case BPF_FUNC_user_ringbuf_drain: return &bpf_user_ringbuf_drain_proto; case BPF_FUNC_ringbuf_reserve_dynptr: return &bpf_ringbuf_reserve_dynptr_proto; case BPF_FUNC_ringbuf_submit_dynptr: return &bpf_ringbuf_submit_dynptr_proto; case BPF_FUNC_ringbuf_discard_dynptr: return &bpf_ringbuf_discard_dynptr_proto; case BPF_FUNC_dynptr_from_mem: return &bpf_dynptr_from_mem_proto; case BPF_FUNC_dynptr_read: return &bpf_dynptr_read_proto; case BPF_FUNC_dynptr_write: return &bpf_dynptr_write_proto; case BPF_FUNC_dynptr_data: return &bpf_dynptr_data_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_cgrp_storage_get: return &bpf_cgrp_storage_get_proto; case BPF_FUNC_cgrp_storage_delete: return &bpf_cgrp_storage_delete_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_get_current_ancestor_cgroup_id: return &bpf_get_current_ancestor_cgroup_id_proto; case BPF_FUNC_current_task_under_cgroup: return &bpf_current_task_under_cgroup_proto; #endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif case BPF_FUNC_task_storage_get: if (bpf_prog_check_recur(prog)) return &bpf_task_storage_get_recur_proto; return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: if (bpf_prog_check_recur(prog)) return &bpf_task_storage_delete_recur_proto; return &bpf_task_storage_delete_proto; default: break; } if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; switch (func_id) { case BPF_FUNC_trace_printk: return bpf_get_trace_printk_proto(); case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; case BPF_FUNC_copy_from_user: return &bpf_copy_from_user_proto; case BPF_FUNC_copy_from_user_task: return &bpf_copy_from_user_task_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; case BPF_FUNC_task_pt_regs: return &bpf_task_pt_regs_proto; case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); case BPF_FUNC_perf_event_read_value: return bpf_get_perf_event_read_value_proto(); case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; case BPF_FUNC_send_signal: return &bpf_send_signal_proto; case BPF_FUNC_send_signal_thread: return &bpf_send_signal_thread_proto; case BPF_FUNC_get_task_stack: return prog->sleepable ? &bpf_get_task_stack_sleepable_proto : &bpf_get_task_stack_proto; case BPF_FUNC_get_branch_snapshot: return &bpf_get_branch_snapshot_proto; case BPF_FUNC_find_vma: return &bpf_find_vma_proto; default: return NULL; } } EXPORT_SYMBOL_GPL(bpf_base_func_proto); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { struct list_head *head = list_head, *orig_head = list_head; BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); /* Do the actual list draining outside the lock to not hold the lock for * too long, and also prevent deadlocks if tracing programs end up * executing on entry/exit of functions called inside the critical * section, and end up doing map ops that call bpf_list_head_free for * the same map value again. */ __bpf_spin_lock_irqsave(spin_lock); if (!head->next || list_empty(head)) goto unlock; head = head->next; unlock: INIT_LIST_HEAD(orig_head); __bpf_spin_unlock_irqrestore(spin_lock); while (head != orig_head) { void *obj = head; obj -= field->graph_root.node_offset; head = head->next; /* The contained type can also have resources, including a * bpf_list_head which needs to be freed. */ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); } } /* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are * 'rb_node *', so field name of rb_node within containing struct is not * needed. * * Since bpf_rb_tree's node type has a corresponding struct btf_field with * graph_root.node_offset, it's not necessary to know field name * or type of node struct */ #define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \ for (pos = rb_first_postorder(root); \ pos && ({ n = rb_next_postorder(pos); 1; }); \ pos = n) void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock) { struct rb_root_cached orig_root, *root = rb_root; struct rb_node *pos, *n; void *obj; BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root)); BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root)); __bpf_spin_lock_irqsave(spin_lock); orig_root = *root; *root = RB_ROOT_CACHED; __bpf_spin_unlock_irqrestore(spin_lock); bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { obj = pos; obj -= field->graph_root.node_offset; __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); } } __bpf_kfunc_start_defs(); __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) { struct btf_struct_meta *meta = meta__ign; u64 size = local_type_id__k; void *p; p = bpf_mem_alloc(&bpf_global_ma, size); if (!p) return NULL; if (meta) bpf_obj_init(meta->record, p); return p; } __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) { u64 size = local_type_id__k; /* The verifier has ensured that meta__ign must be NULL */ return bpf_mem_alloc(&bpf_global_percpu_ma, size); } /* Must be called under migrate_disable(), as required by bpf_mem_free */ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) { struct bpf_mem_alloc *ma; if (rec && rec->refcount_off >= 0 && !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) { /* Object is refcounted and refcount_dec didn't result in 0 * refcount. Return without freeing the object */ return; } if (rec) bpf_obj_free_fields(rec, p); if (percpu) ma = &bpf_global_percpu_ma; else ma = &bpf_global_ma; bpf_mem_free_rcu(ma, p); } __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) { struct btf_struct_meta *meta = meta__ign; void *p = p__alloc; __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false); } __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) { /* The verifier has ensured that meta__ign must be NULL */ bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc); } __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) { struct btf_struct_meta *meta = meta__ign; struct bpf_refcount *ref; /* Could just cast directly to refcount_t *, but need some code using * bpf_refcount type so that it is emitted in vmlinux BTF */ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off); if (!refcount_inc_not_zero((refcount_t *)ref)) return NULL; /* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null * in verifier.c */ return (void *)p__refcounted_kptr; } static int __bpf_list_add(struct bpf_list_node_kern *node, struct bpf_list_head *head, bool tail, struct btf_record *rec, u64 off) { struct list_head *n = &node->list_head, *h = (void *)head; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); /* node->owner != NULL implies !list_empty(n), no need to separately * check the latter */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } tail ? list_add_tail(n, h) : list_add(n, h); WRITE_ONCE(node->owner, head); return 0; } __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { struct bpf_list_node_kern *n = (void *)node; struct btf_struct_meta *meta = meta__ign; return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { struct bpf_list_node_kern *n = (void *)node; struct btf_struct_meta *meta = meta__ign; return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off); } static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) { struct list_head *n, *h = (void *)head; struct bpf_list_node_kern *node; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); if (list_empty(h)) return NULL; n = tail ? h->prev : h->next; node = container_of(n, struct bpf_list_node_kern, list_head); if (WARN_ON_ONCE(READ_ONCE(node->owner) != head)) return NULL; list_del_init(n); WRITE_ONCE(node->owner, NULL); return (struct bpf_list_node *)n; } __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) { return __bpf_list_del(head, false); } __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) { return __bpf_list_del(head, true); } __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) { struct list_head *h = (struct list_head *)head; if (list_empty(h) || unlikely(!h->next)) return NULL; return (struct bpf_list_node *)h->next; } __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head) { struct list_head *h = (struct list_head *)head; if (list_empty(h) || unlikely(!h->next)) return NULL; return (struct bpf_list_node *)h->prev; } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; struct rb_root_cached *r = (struct rb_root_cached *)root; struct rb_node *n = &node_internal->rb_node; /* node_internal->owner != root implies either RB_EMPTY_NODE(n) or * n is owned by some other tree. No need to check RB_EMPTY_NODE(n) */ if (READ_ONCE(node_internal->owner) != root) return NULL; rb_erase_cached(n, r); RB_CLEAR_NODE(n); WRITE_ONCE(node_internal->owner, NULL); return (struct bpf_rb_node *)n; } /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF * program */ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node_kern *node, void *less, struct btf_record *rec, u64 off) { struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node; struct rb_node *parent = NULL, *n = &node->rb_node; bpf_callback_t cb = (bpf_callback_t)less; bool leftmost = true; /* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately * check the latter */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } while (*link) { parent = *link; if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(n, parent, link); rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost); WRITE_ONCE(node->owner, root); return 0; } __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), void *meta__ign, u64 off) { struct btf_struct_meta *meta = meta__ign; struct bpf_rb_node_kern *n = (void *)node; return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off); } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) { struct rb_root_cached *r = (struct rb_root_cached *)root; return (struct bpf_rb_node *)rb_first_cached(r); } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root) { struct rb_root_cached *r = (struct rb_root_cached *)root; return (struct bpf_rb_node *)r->rb_root.rb_node; } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node) { struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; if (READ_ONCE(node_internal->owner) != root) return NULL; return (struct bpf_rb_node *)node_internal->rb_node.rb_left; } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node) { struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; if (READ_ONCE(node_internal->owner) != root) return NULL; return (struct bpf_rb_node *)node_internal->rb_node.rb_right; } /** * bpf_task_acquire - Acquire a reference to a task. A task acquired by this * kfunc which is not stored in a map as a kptr, must be released by calling * bpf_task_release(). * @p: The task on which a reference is being acquired. */ __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p) { if (refcount_inc_not_zero(&p->rcu_users)) return p; return NULL; } /** * bpf_task_release - Release the reference acquired on a task. * @p: The task on which a reference is being released. */ __bpf_kfunc void bpf_task_release(struct task_struct *p) { put_task_struct_rcu_user(p); } __bpf_kfunc void bpf_task_release_dtor(void *p) { put_task_struct_rcu_user(p); } CFI_NOSEAL(bpf_task_release_dtor); #ifdef CONFIG_CGROUPS /** * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by * this kfunc which is not stored in a map as a kptr, must be released by * calling bpf_cgroup_release(). * @cgrp: The cgroup on which a reference is being acquired. */ __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) { return cgroup_tryget(cgrp) ? cgrp : NULL; } /** * bpf_cgroup_release - Release the reference acquired on a cgroup. * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to * not be freed until the current grace period has ended, even if its refcount * drops to 0. * @cgrp: The cgroup on which a reference is being released. */ __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp) { cgroup_put(cgrp); } __bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp) { cgroup_put(cgrp); } CFI_NOSEAL(bpf_cgroup_release_dtor); /** * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor * array. A cgroup returned by this kfunc which is not subsequently stored in a * map, must be released by calling bpf_cgroup_release(). * @cgrp: The cgroup for which we're performing a lookup. * @level: The level of ancestor to look up. */ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) { struct cgroup *ancestor; if (level > cgrp->level || level < 0) return NULL; /* cgrp's refcnt could be 0 here, but ancestors can still be accessed */ ancestor = cgrp->ancestors[level]; if (!cgroup_tryget(ancestor)) return NULL; return ancestor; } /** * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this * kfunc which is not subsequently stored in a map, must be released by calling * bpf_cgroup_release(). * @cgid: cgroup id. */ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) { struct cgroup *cgrp; cgrp = __cgroup_get_from_id(cgid); if (IS_ERR(cgrp)) return NULL; return cgrp; } /** * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test * task's membership of cgroup ancestry. * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. */ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, struct cgroup *ancestor) { long ret; rcu_read_lock(); ret = task_under_cgroup_hierarchy(task, ancestor); rcu_read_unlock(); return ret; } BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; if (unlikely(idx >= array->map.max_entries)) return -E2BIG; cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; return task_under_cgroup_hierarchy(current, cgrp); } const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .func = bpf_current_task_under_cgroup, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; /** * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its * hierarchy ID. * @task: The target task * @hierarchy_id: The ID of a cgroup1 hierarchy * * On success, the cgroup is returen. On failure, NULL is returned. */ __bpf_kfunc struct cgroup * bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id) { struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id); if (IS_ERR(cgrp)) return NULL; return cgrp; } #endif /* CONFIG_CGROUPS */ /** * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up * in the root pid namespace idr. If a task is returned, it must either be * stored in a map, or released with bpf_task_release(). * @pid: The pid of the task being looked up. */ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) { struct task_struct *p; rcu_read_lock(); p = find_task_by_pid_ns(pid, &init_pid_ns); if (p) p = bpf_task_acquire(p); rcu_read_unlock(); return p; } /** * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up * in the pid namespace of the current task. If a task is returned, it must * either be stored in a map, or released with bpf_task_release(). * @vpid: The vpid of the task being looked up. */ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) { struct task_struct *p; rcu_read_lock(); p = find_task_by_vpid(vpid); if (p) p = bpf_task_acquire(p); rcu_read_unlock(); return p; } /** * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr * @buffer__opt: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * * If buffer__opt is NULL, the call will fail if buffer_opt was needed. * * If the intention is to write to the data slice, please use * bpf_dynptr_slice_rdwr. * * The user must check that the returned pointer is not null before using it. * * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice * does not change the underlying packet data pointers, so a call to * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in * the bpf program. * * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only * data slice (can be either direct pointer to the data or a pointer to the user * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type type; u64 len = buffer__szk; int err; if (!ptr->data) return NULL; err = bpf_dynptr_check_off_len(ptr, offset, len); if (err) return NULL; type = bpf_dynptr_get_type(ptr); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: return ptr->data + ptr->offset + offset; case BPF_DYNPTR_TYPE_SKB: if (buffer__opt) return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt); else return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len); case BPF_DYNPTR_TYPE_XDP: { void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len); if (!IS_ERR_OR_NULL(xdp_ptr)) return xdp_ptr; if (!buffer__opt) return NULL; bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); return buffer__opt; } case BPF_DYNPTR_TYPE_SKB_META: return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); case BPF_DYNPTR_TYPE_FILE: err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk); return err ? NULL : buffer__opt; default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; } } /** * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr * @buffer__opt: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * * If buffer__opt is NULL, the call will fail if buffer_opt was needed. * * The returned pointer is writable and may point to either directly the dynptr * data at the requested offset or to the buffer if unable to obtain a direct * data pointer to (example: the requested slice is to the paged area of an skb * packet). In the case where the returned pointer is to the buffer, the user * is responsible for persisting writes through calling bpf_dynptr_write(). This * usually looks something like this pattern: * * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer)); * if (!eth) * return TC_ACT_SHOT; * * // mutate eth header // * * if (eth == buffer) * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0); * * Please note that, as in the example above, the user must check that the * returned pointer is not null before using it. * * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr * does not change the underlying packet data pointers, so a call to * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in * the bpf program. * * Return: NULL if the call failed (eg invalid dynptr), pointer to a * data slice (can be either direct pointer to the data or a pointer to the user * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data || __bpf_dynptr_is_rdonly(ptr)) return NULL; /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. * * For skb-type dynptrs, it is safe to write into the returned pointer * if the bpf program allows skb data writes. There are two possibilities * that may occur when calling bpf_dynptr_slice_rdwr: * * 1) The requested slice is in the head of the skb. In this case, the * returned pointer is directly to skb data, and if the skb is cloned, the * verifier will have uncloned it (see bpf_unclone_prologue()) already. * The pointer can be directly written into. * * 2) Some portion of the requested slice is in the paged buffer area. * In this case, the requested data will be copied out into the buffer * and the returned pointer will be a pointer to the buffer. The skb * will not be pulled. To persist the write, the user will need to call * bpf_dynptr_write(), which will pull the skb and commit the write. * * Similarly for xdp programs, if the requested slice is not across xdp * fragments, then a direct pointer will be returned, otherwise the data * will be copied out into the buffer and the user will need to call * bpf_dynptr_write() to commit changes. */ return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk); } __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 size; if (!ptr->data || start > end) return -EINVAL; size = __bpf_dynptr_size(ptr); if (start > size || end > size) return -ERANGE; bpf_dynptr_advance_offset(ptr, start); bpf_dynptr_set_size(ptr, end - start); return 0; } __bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; return !ptr->data; } __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return false; return __bpf_dynptr_is_rdonly(ptr); } __bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return -EINVAL; return __bpf_dynptr_size(ptr); } __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, struct bpf_dynptr *clone__uninit) { struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit; struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) { bpf_dynptr_set_null(clone); return -EINVAL; } *clone = *ptr; return 0; } /** * bpf_dynptr_copy() - Copy data from one dynptr to another. * @dst_ptr: Destination dynptr - where data should be copied to * @dst_off: Offset into the destination dynptr * @src_ptr: Source dynptr - where data should be copied from * @src_off: Offset into the source dynptr * @size: Length of the data to copy from source to destination * * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. */ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, struct bpf_dynptr *src_ptr, u64 src_off, u64 size) { struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; u64 off; src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); if (src_slice && dst_slice) { memmove(dst_slice, src_slice, size); return 0; } if (src_slice) return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0); if (dst_slice) return __bpf_dynptr_read(dst_slice, size, src, src_off, 0); if (bpf_dynptr_check_off_len(dst, dst_off, size) || bpf_dynptr_check_off_len(src, src_off, size)) return -E2BIG; off = 0; while (off < size) { u64 chunk_sz = min_t(u64, sizeof(buf), size - off); int err; err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); if (err) return err; err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0); if (err) return err; off += chunk_sz; } return 0; } /** * bpf_dynptr_memset() - Fill dynptr memory with a constant byte. * @p: Destination dynptr - where data will be filled * @offset: Offset into the dynptr to start filling from * @size: Number of bytes to fill * @val: Constant byte to fill the memory with * * Fills the @size bytes of the memory area pointed to by @p * at @offset with the constant byte @val. * Returns 0 on success; negative error, otherwise. */ __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 chunk_sz, write_off; char buf[256]; void* slice; int err; slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size); if (likely(slice)) { memset(slice, val, size); return 0; } if (__bpf_dynptr_is_rdonly(ptr)) return -EINVAL; err = bpf_dynptr_check_off_len(ptr, offset, size); if (err) return err; /* Non-linear data under the dynptr, write from a local buffer */ chunk_sz = min_t(u64, sizeof(buf), size); memset(buf, val, chunk_sz); for (write_off = 0; write_off < size; write_off += chunk_sz) { chunk_sz = min_t(u64, sizeof(buf), size - write_off); err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0); if (err) return err; } return 0; } __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; } __bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k) { return (void *)obj__ign; } __bpf_kfunc void bpf_rcu_read_lock(void) { rcu_read_lock(); } __bpf_kfunc void bpf_rcu_read_unlock(void) { rcu_read_unlock(); } struct bpf_throw_ctx { struct bpf_prog_aux *aux; u64 sp; u64 bp; int cnt; }; static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) { struct bpf_throw_ctx *ctx = cookie; struct bpf_prog *prog; /* * The RCU read lock is held to safely traverse the latch tree, but we * don't need its protection when accessing the prog, since it has an * active stack frame on the current stack trace, and won't disappear. */ rcu_read_lock(); prog = bpf_prog_ksym_find(ip); rcu_read_unlock(); if (!prog) return !ctx->cnt; ctx->cnt++; if (bpf_is_subprog(prog)) return true; ctx->aux = prog->aux; ctx->sp = sp; ctx->bp = bp; return false; } __bpf_kfunc void bpf_throw(u64 cookie) { struct bpf_throw_ctx ctx = {}; arch_bpf_stack_walk(bpf_stack_walker, &ctx); WARN_ON_ONCE(!ctx.aux); if (ctx.aux) WARN_ON_ONCE(!ctx.aux->exception_boundary); WARN_ON_ONCE(!ctx.bp); WARN_ON_ONCE(!ctx.cnt); /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning * deeper stack depths than ctx.sp as we do not return from bpf_throw, * which skips compiler generated instrumentation to do the same. */ kasan_unpoison_task_stack_below((void *)(long)ctx.sp); ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0); WARN(1, "A call to BPF exception callback should never return\n"); } __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) { struct bpf_async_kern *async = (struct bpf_async_kern *)wq; struct bpf_map *map = p__map; BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq)); BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq)); if (flags) return -EINVAL; return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); } __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) { struct bpf_async_kern *async = (struct bpf_async_kern *)wq; struct bpf_work *w; if (in_nmi()) return -EOPNOTSUPP; if (flags) return -EINVAL; w = READ_ONCE(async->work); if (!w || !READ_ONCE(w->cb.prog)) return -EINVAL; schedule_work(&w->work); return 0; } __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, int (callback_fn)(void *map, int *key, void *value), unsigned int flags, void *aux__prog) { struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog; struct bpf_async_kern *async = (struct bpf_async_kern *)wq; if (flags) return -EINVAL; return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); } __bpf_kfunc void bpf_preempt_disable(void) { preempt_disable(); } __bpf_kfunc void bpf_preempt_enable(void) { preempt_enable(); } struct bpf_iter_bits { __u64 __opaque[2]; } __aligned(8); #define BITS_ITER_NR_WORDS_MAX 511 struct bpf_iter_bits_kern { union { __u64 *bits; __u64 bits_copy; }; int nr_bits; int bit; } __aligned(8); /* On 64-bit hosts, unsigned long and u64 have the same size, so passing * a u64 pointer and an unsigned long pointer to find_next_bit() will * return the same result, as both point to the same 8-byte area. * * For 32-bit little-endian hosts, using a u64 pointer or unsigned long * pointer also makes no difference. This is because the first iterated * unsigned long is composed of bits 0-31 of the u64 and the second unsigned * long is composed of bits 32-63 of the u64. * * However, for 32-bit big-endian hosts, this is not the case. The first * iterated unsigned long will be bits 32-63 of the u64, so swap these two * ulong values within the u64. */ static void swap_ulong_in_u64(u64 *bits, unsigned int nr) { #if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN) unsigned int i; for (i = 0; i < nr; i++) bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32); #endif } /** * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area * @it: The new bpf_iter_bits to be created * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over * @nr_words: The size of the specified memory area, measured in 8-byte units. * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be * further reduced by the BPF memory allocator implementation. * * This function initializes a new bpf_iter_bits structure for iterating over * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It * copies the data of the memory area to the newly created bpf_iter_bits @it for * subsequent iteration operations. * * On success, 0 is returned. On failure, ERR is returned. */ __bpf_kfunc int bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words) { struct bpf_iter_bits_kern *kit = (void *)it; u32 nr_bytes = nr_words * sizeof(u64); u32 nr_bits = BYTES_TO_BITS(nr_bytes); int err; BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits)); BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) != __alignof__(struct bpf_iter_bits)); kit->nr_bits = 0; kit->bits_copy = 0; kit->bit = -1; if (!unsafe_ptr__ign || !nr_words) return -EINVAL; if (nr_words > BITS_ITER_NR_WORDS_MAX) return -E2BIG; /* Optimization for u64 mask */ if (nr_bits == 64) { err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign); if (err) return -EFAULT; swap_ulong_in_u64(&kit->bits_copy, nr_words); kit->nr_bits = nr_bits; return 0; } if (bpf_mem_alloc_check_size(false, nr_bytes)) return -E2BIG; /* Fallback to memalloc */ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); if (!kit->bits) return -ENOMEM; err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign); if (err) { bpf_mem_free(&bpf_global_ma, kit->bits); return err; } swap_ulong_in_u64(kit->bits, nr_words); kit->nr_bits = nr_bits; return 0; } /** * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits * @it: The bpf_iter_bits to be checked * * This function returns a pointer to a number representing the value of the * next bit in the bits. * * If there are no further bits available, it returns NULL. */ __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; int bit = kit->bit, nr_bits = kit->nr_bits; const void *bits; if (!nr_bits || bit >= nr_bits) return NULL; bits = nr_bits == 64 ? &kit->bits_copy : kit->bits; bit = find_next_bit(bits, nr_bits, bit + 1); if (bit >= nr_bits) { kit->bit = bit; return NULL; } kit->bit = bit; return &kit->bit; } /** * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits * @it: The bpf_iter_bits to be destroyed * * Destroy the resource associated with the bpf_iter_bits. */ __bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; if (kit->nr_bits <= 64) return; bpf_mem_free(&bpf_global_ma, kit->bits); } /** * bpf_copy_from_user_str() - Copy a string from an unsafe user address * @dst: Destination address, in kernel space. This buffer must be * at least @dst__sz bytes long. * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. * @unsafe_ptr__ign: Source address, in user space. * @flags: The only supported flag is BPF_F_PAD_ZEROS * * Copies a NUL-terminated string from userspace to BPF space. If user string is * too long this will still ensure zero termination in the dst buffer unless * buffer size is 0. * * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and * memset all of @dst on failure. */ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags) { int ret; if (unlikely(flags & ~BPF_F_PAD_ZEROS)) return -EINVAL; if (unlikely(!dst__sz)) return 0; ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1); if (ret < 0) { if (flags & BPF_F_PAD_ZEROS) memset((char *)dst, 0, dst__sz); return ret; } if (flags & BPF_F_PAD_ZEROS) memset((char *)dst + ret, 0, dst__sz - ret); else ((char *)dst)[ret] = '\0'; return ret + 1; } /** * bpf_copy_from_user_task_str() - Copy a string from an task's address space * @dst: Destination address, in kernel space. This buffer must be * at least @dst__sz bytes long. * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. * @unsafe_ptr__ign: Source address in the task's address space. * @tsk: The task whose address space will be used * @flags: The only supported flag is BPF_F_PAD_ZEROS * * Copies a NUL terminated string from a task's address space to @dst__sz * buffer. If user string is too long this will still ensure zero termination * in the @dst__sz buffer unless buffer size is 0. * * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success * and memset all of @dst__sz on failure. * * Return: The number of copied bytes on success including the NUL terminator. * A negative error code on failure. */ __bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, struct task_struct *tsk, u64 flags) { int ret; if (unlikely(flags & ~BPF_F_PAD_ZEROS)) return -EINVAL; if (unlikely(dst__sz == 0)) return 0; ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0); if (ret < 0) { if (flags & BPF_F_PAD_ZEROS) memset(dst, 0, dst__sz); return ret; } if (flags & BPF_F_PAD_ZEROS) memset(dst + ret, 0, dst__sz - ret); return ret + 1; } /* Keep unsinged long in prototype so that kfunc is usable when emitted to * vmlinux.h in BPF programs directly, but note that while in BPF prog, the * unsigned long always points to 8-byte region on stack, the kernel may only * read and write the 4-bytes on 32-bit. */ __bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag) { local_irq_save(*flags__irq_flag); } __bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag) { local_irq_restore(*flags__irq_flag); } __bpf_kfunc void __bpf_trap(void) { } /* * Kfuncs for string operations. * * Since strings are not necessarily %NUL-terminated, we cannot directly call * in-kernel implementations. Instead, we open-code the implementations using * __get_kernel_nofault instead of plain dereference to make them safe. */ static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) { char c1, c2; int i; if (!copy_from_kernel_nofault_allowed(s1, 1) || !copy_from_kernel_nofault_allowed(s2, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&c1, s1, char, err_out); __get_kernel_nofault(&c2, s2, char, err_out); if (ignore_case) { c1 = tolower(c1); c2 = tolower(c2); } if (c1 != c2) return c1 < c2 ? -1 : 1; if (c1 == '\0') return 0; s1++; s2++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strcmp - Compare two strings * @s1__ign: One string * @s2__ign: Another string * * Return: * * %0 - Strings are equal * * %-1 - @s1__ign is smaller * * %1 - @s2__ign is smaller * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of strings is too large * * %-ERANGE - One of strings is outside of kernel address space */ __bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) { return __bpf_strcasecmp(s1__ign, s2__ign, false); } /** * bpf_strcasecmp - Compare two strings, ignoring the case of the characters * @s1__ign: One string * @s2__ign: Another string * * Return: * * %0 - Strings are equal * * %-1 - @s1__ign is smaller * * %1 - @s2__ign is smaller * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of strings is too large * * %-ERANGE - One of strings is outside of kernel address space */ __bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign) { return __bpf_strcasecmp(s1__ign, s2__ign, true); } /** * bpf_strnchr - Find a character in a length limited string * @s__ign: The string to be searched * @count: The number of characters to be searched * @c: The character to search for * * Note that the %NUL-terminator is considered part of the string, and can * be searched for. * * Return: * * >=0 - Index of the first occurrence of @c within @s__ign * * %-ENOENT - @c not found in the first @count characters of @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c) { char sc; int i; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&sc, s__ign, char, err_out); if (sc == c) return i; if (sc == '\0') return -ENOENT; s__ign++; } return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT; err_out: return -EFAULT; } /** * bpf_strchr - Find the first occurrence of a character in a string * @s__ign: The string to be searched * @c: The character to search for * * Note that the %NUL-terminator is considered part of the string, and can * be searched for. * * Return: * * >=0 - The index of the first occurrence of @c within @s__ign * * %-ENOENT - @c not found in @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strchr(const char *s__ign, char c) { return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c); } /** * bpf_strchrnul - Find and return a character in a string, or end of string * @s__ign: The string to be searched * @c: The character to search for * * Return: * * >=0 - Index of the first occurrence of @c within @s__ign or index of * the null byte at the end of @s__ign when @c is not found * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strchrnul(const char *s__ign, char c) { char sc; int i; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&sc, s__ign, char, err_out); if (sc == '\0' || sc == c) return i; s__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strrchr - Find the last occurrence of a character in a string * @s__ign: The string to be searched * @c: The character to search for * * Return: * * >=0 - Index of the last occurrence of @c within @s__ign * * %-ENOENT - @c not found in @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strrchr(const char *s__ign, int c) { char sc; int i, last = -ENOENT; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&sc, s__ign, char, err_out); if (sc == c) last = i; if (sc == '\0') return last; s__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strnlen - Calculate the length of a length-limited string * @s__ign: The string * @count: The maximum number of characters to count * * Return: * * >=0 - The length of @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count) { char c; int i; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&c, s__ign, char, err_out); if (c == '\0') return i; s__ign++; } return i == XATTR_SIZE_MAX ? -E2BIG : i; err_out: return -EFAULT; } /** * bpf_strlen - Calculate the length of a string * @s__ign: The string * * Return: * * >=0 - The length of @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strlen(const char *s__ign) { return bpf_strnlen(s__ign, XATTR_SIZE_MAX); } /** * bpf_strspn - Calculate the length of the initial substring of @s__ign which * only contains letters in @accept__ign * @s__ign: The string to be searched * @accept__ign: The string to search for * * Return: * * >=0 - The length of the initial substring of @s__ign which only * contains letters from @accept__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign) { char cs, ca; int i, j; if (!copy_from_kernel_nofault_allowed(s__ign, 1) || !copy_from_kernel_nofault_allowed(accept__ign, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&cs, s__ign, char, err_out); if (cs == '\0') return i; for (j = 0; j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&ca, accept__ign + j, char, err_out); if (cs == ca || ca == '\0') break; } if (j == XATTR_SIZE_MAX) return -E2BIG; if (ca == '\0') return i; s__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strcspn - Calculate the length of the initial substring of @s__ign which * does not contain letters in @reject__ign * @s__ign: The string to be searched * @reject__ign: The string to search for * * Return: * * >=0 - The length of the initial substring of @s__ign which does not * contain letters from @reject__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign) { char cs, cr; int i, j; if (!copy_from_kernel_nofault_allowed(s__ign, 1) || !copy_from_kernel_nofault_allowed(reject__ign, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&cs, s__ign, char, err_out); if (cs == '\0') return i; for (j = 0; j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&cr, reject__ign + j, char, err_out); if (cs == cr || cr == '\0') break; } if (j == XATTR_SIZE_MAX) return -E2BIG; if (cr != '\0') return i; s__ign++; } return -E2BIG; err_out: return -EFAULT; } static int __bpf_strnstr(const char *s1, const char *s2, size_t len, bool ignore_case) { char c1, c2; int i, j; if (!copy_from_kernel_nofault_allowed(s1, 1) || !copy_from_kernel_nofault_allowed(s2, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&c2, s2 + j, char, err_out); if (c2 == '\0') return i; /* * We allow reading an extra byte from s2 (note the * `i + j <= len` above) to cover the case when s2 is * a suffix of the first len chars of s1. */ if (i + j == len) break; __get_kernel_nofault(&c1, s1 + j, char, err_out); if (ignore_case) { c1 = tolower(c1); c2 = tolower(c2); } if (c1 == '\0') return -ENOENT; if (c1 != c2) break; } if (j == XATTR_SIZE_MAX) return -E2BIG; if (i + j == len) return -ENOENT; s1++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strstr - Find the first substring in a string * @s1__ign: The string to be searched * @s2__ign: The string to search for * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within @s1__ign * * %-ENOENT - @s2__ign is not a substring of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) { return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false); } /** * bpf_strcasestr - Find the first substring in a string, ignoring the case of * the characters * @s1__ign: The string to be searched * @s2__ign: The string to search for * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within @s1__ign * * %-ENOENT - @s2__ign is not a substring of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign) { return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true); } /** * bpf_strnstr - Find the first substring in a length-limited string * @s1__ign: The string to be searched * @s2__ign: The string to search for * @len: the maximum number of characters to search * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within the first @len characters of @s1__ign * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len) { return __bpf_strnstr(s1__ign, s2__ign, len, false); } /** * bpf_strncasestr - Find the first substring in a length-limited string, * ignoring the case of the characters * @s1__ign: The string to be searched * @s2__ign: The string to search for * @len: the maximum number of characters to search * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within the first @len characters of @s1__ign * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign, size_t len) { return __bpf_strnstr(s1__ign, s2__ign, len, true); } #ifdef CONFIG_KEYS /** * bpf_lookup_user_key - lookup a key by its serial * @serial: key handle serial number * @flags: lookup-specific flags * * Search a key with a given *serial* and the provided *flags*. * If found, increment the reference count of the key by one, and * return it in the bpf_key structure. * * The bpf_key structure must be passed to bpf_key_put() when done * with it, so that the key reference count is decremented and the * bpf_key structure is freed. * * Permission checks are deferred to the time the key is used by * one of the available key-specific kfuncs. * * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested * special keyring (e.g. session keyring), if it doesn't yet exist. * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting * for the key construction, and to retrieve uninstantiated keys (keys * without data attached to them). * * Return: a bpf_key pointer with a valid key pointer if the key is found, a * NULL pointer otherwise. */ __bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) { key_ref_t key_ref; struct bpf_key *bkey; if (flags & ~KEY_LOOKUP_ALL) return NULL; /* * Permission check is deferred until the key is used, as the * intent of the caller is unknown here. */ key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); if (IS_ERR(key_ref)) return NULL; bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); if (!bkey) { key_put(key_ref_to_ptr(key_ref)); return NULL; } bkey->key = key_ref_to_ptr(key_ref); bkey->has_ref = true; return bkey; } /** * bpf_lookup_system_key - lookup a key by a system-defined ID * @id: key ID * * Obtain a bpf_key structure with a key pointer set to the passed key ID. * The key pointer is marked as invalid, to prevent bpf_key_put() from * attempting to decrement the key reference count on that pointer. The key * pointer set in such way is currently understood only by * verify_pkcs7_signature(). * * Set *id* to one of the values defined in include/linux/verification.h: * 0 for the primary keyring (immutable keyring of system keys); * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring * (where keys can be added only if they are vouched for by existing keys * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform * keyring (primarily used by the integrity subsystem to verify a kexec'ed * kerned image and, possibly, the initramfs signature). * * Return: a bpf_key pointer with an invalid key pointer set from the * pre-determined ID on success, a NULL pointer otherwise */ __bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) { struct bpf_key *bkey; if (system_keyring_id_check(id) < 0) return NULL; bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); if (!bkey) return NULL; bkey->key = (struct key *)(unsigned long)id; bkey->has_ref = false; return bkey; } /** * bpf_key_put - decrement key reference count if key is valid and free bpf_key * @bkey: bpf_key structure * * Decrement the reference count of the key inside *bkey*, if the pointer * is valid, and free *bkey*. */ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey) { if (bkey->has_ref) key_put(bkey->key); kfree(bkey); } /** * bpf_verify_pkcs7_signature - verify a PKCS#7 signature * @data_p: data to verify * @sig_p: signature of the data * @trusted_keyring: keyring with keys trusted for signature verification * * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* * with keys in a keyring referenced by *trusted_keyring*. * * Return: 0 on success, a negative value on error. */ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { #ifdef CONFIG_SYSTEM_DATA_VERIFICATION struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; const void *data, *sig; u32 data_len, sig_len; int ret; if (trusted_keyring->has_ref) { /* * Do the permission check deferred in bpf_lookup_user_key(). * See bpf_lookup_user_key() for more details. * * A call to key_task_permission() here would be redundant, as * it is already done by keyring_search() called by * find_asymmetric_key(). */ ret = key_validate(trusted_keyring->key); if (ret < 0) return ret; } data_len = __bpf_dynptr_size(data_ptr); data = __bpf_dynptr_data(data_ptr, data_len); sig_len = __bpf_dynptr_size(sig_ptr); sig = __bpf_dynptr_data(sig_ptr, sig_len); return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, VERIFYING_BPF_SIGNATURE, NULL, NULL); #else return -EOPNOTSUPP; #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ } #endif /* CONFIG_KEYS */ typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); enum bpf_task_work_state { /* bpf_task_work is ready to be used */ BPF_TW_STANDBY = 0, /* irq work scheduling in progress */ BPF_TW_PENDING, /* task work scheduling in progress */ BPF_TW_SCHEDULING, /* task work is scheduled successfully */ BPF_TW_SCHEDULED, /* callback is running */ BPF_TW_RUNNING, /* associated BPF map value is deleted */ BPF_TW_FREED, }; struct bpf_task_work_ctx { enum bpf_task_work_state state; refcount_t refcnt; struct callback_head work; struct irq_work irq_work; /* bpf_prog that schedules task work */ struct bpf_prog *prog; /* task for which callback is scheduled */ struct task_struct *task; /* the map and map value associated with this context */ struct bpf_map *map; void *map_val; enum task_work_notify_mode mode; bpf_task_work_callback_t callback_fn; struct rcu_head rcu; } __aligned(8); /* Actual type for struct bpf_task_work */ struct bpf_task_work_kern { struct bpf_task_work_ctx *ctx; }; static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx) { if (ctx->prog) { bpf_prog_put(ctx->prog); ctx->prog = NULL; } if (ctx->task) { bpf_task_release(ctx->task); ctx->task = NULL; } } static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx) { return refcount_inc_not_zero(&ctx->refcnt); } static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx) { if (!refcount_dec_and_test(&ctx->refcnt)) return; bpf_task_work_ctx_reset(ctx); /* bpf_mem_free expects migration to be disabled */ migrate_disable(); bpf_mem_free(&bpf_global_ma, ctx); migrate_enable(); } static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx) { /* * Scheduled task_work callback holds ctx ref, so if we successfully * cancelled, we put that ref on callback's behalf. If we couldn't * cancel, callback will inevitably run or has already completed * running, and it would have taken care of its ctx ref itself. */ if (task_work_cancel(ctx->task, &ctx->work)) bpf_task_work_ctx_put(ctx); } static void bpf_task_work_callback(struct callback_head *cb) { struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work); enum bpf_task_work_state state; u32 idx; void *key; /* Read lock is needed to protect ctx and map key/value access */ guard(rcu_tasks_trace)(); /* * This callback may start running before bpf_task_work_irq() switched to * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING. */ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING); if (state == BPF_TW_SCHEDULED) state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING); if (state == BPF_TW_FREED) { bpf_task_work_ctx_put(ctx); return; } key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx); migrate_disable(); ctx->callback_fn(ctx->map, key, ctx->map_val); migrate_enable(); bpf_task_work_ctx_reset(ctx); (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY); bpf_task_work_ctx_put(ctx); } static void bpf_task_work_irq(struct irq_work *irq_work) { struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); enum bpf_task_work_state state; int err; guard(rcu_tasks_trace)(); if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) { bpf_task_work_ctx_put(ctx); return; } err = task_work_add(ctx->task, &ctx->work, ctx->mode); if (err) { bpf_task_work_ctx_reset(ctx); /* * try to switch back to STANDBY for another task_work reuse, but we might have * gone to FREED already, which is fine as we already cleaned up after ourselves */ (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY); bpf_task_work_ctx_put(ctx); return; } /* * It's technically possible for just scheduled task_work callback to * complete running by now, going SCHEDULING -> RUNNING and then * dropping its ctx refcount. Instead of capturing extra ref just to * protected below ctx->state access, we rely on RCU protection to * perform below SCHEDULING -> SCHEDULED attempt. */ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED); if (state == BPF_TW_FREED) bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */ } static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw, struct bpf_map *map) { struct bpf_task_work_kern *twk = (void *)tw; struct bpf_task_work_ctx *ctx, *old_ctx; ctx = READ_ONCE(twk->ctx); if (ctx) return ctx; ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx)); if (!ctx) return ERR_PTR(-ENOMEM); memset(ctx, 0, sizeof(*ctx)); refcount_set(&ctx->refcnt, 1); /* map's own ref */ ctx->state = BPF_TW_STANDBY; old_ctx = cmpxchg(&twk->ctx, NULL, ctx); if (old_ctx) { /* * tw->ctx is set by concurrent BPF program, release allocated * memory and try to reuse already set context. */ bpf_mem_free(&bpf_global_ma, ctx); return old_ctx; } return ctx; /* Success */ } static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw, struct bpf_map *map) { struct bpf_task_work_ctx *ctx; ctx = bpf_task_work_fetch_ctx(tw, map); if (IS_ERR(ctx)) return ctx; /* try to get ref for task_work callback to hold */ if (!bpf_task_work_ctx_tryget(ctx)) return ERR_PTR(-EBUSY); if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) { /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */ bpf_task_work_ctx_put(ctx); return ERR_PTR(-EBUSY); } /* * If no process or bpffs is holding a reference to the map, no new callbacks should be * scheduled. This does not address any race or correctness issue, but rather is a policy * choice: dropping user references should stop everything. */ if (!atomic64_read(&map->usercnt)) { /* drop ref we just got for task_work callback itself */ bpf_task_work_ctx_put(ctx); /* transfer map's ref into cancel_and_free() */ bpf_task_work_cancel_and_free(tw); return ERR_PTR(-EBUSY); } return ctx; } static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw, struct bpf_map *map, bpf_task_work_callback_t callback_fn, struct bpf_prog_aux *aux, enum task_work_notify_mode mode) { struct bpf_prog *prog; struct bpf_task_work_ctx *ctx; int err; BTF_TYPE_EMIT(struct bpf_task_work); prog = bpf_prog_inc_not_zero(aux->prog); if (IS_ERR(prog)) return -EBADF; task = bpf_task_acquire(task); if (!task) { err = -EBADF; goto release_prog; } ctx = bpf_task_work_acquire_ctx(tw, map); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto release_all; } ctx->task = task; ctx->callback_fn = callback_fn; ctx->prog = prog; ctx->mode = mode; ctx->map = map; ctx->map_val = (void *)tw - map->record->task_work_off; init_task_work(&ctx->work, bpf_task_work_callback); init_irq_work(&ctx->irq_work, bpf_task_work_irq); irq_work_queue(&ctx->irq_work); return 0; release_all: bpf_task_release(task); release_prog: bpf_prog_put(prog); return err; } /** * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call * @aux__prog: user should pass NULL * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ __bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task, struct bpf_task_work *tw, void *map__map, bpf_task_work_callback_t callback, void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); } /** * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call * @aux__prog: user should pass NULL * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ __bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, struct bpf_task_work *tw, void *map__map, bpf_task_work_callback_t callback, void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); } static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep, struct bpf_dynptr_kern *ptr) { struct bpf_dynptr_file_impl *state; /* flags is currently unsupported */ if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl)); if (!state) { bpf_dynptr_set_null(ptr); return -ENOMEM; } state->offset = 0; state->size = U64_MAX; /* Don't restrict size, as file may change anyways */ freader_init_from_file(&state->freader, NULL, 0, file, may_sleep); bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0); bpf_dynptr_set_rdonly(ptr); return 0; } __bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) { return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit); } int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) { return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit); } __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr; struct bpf_dynptr_file_impl *df = ptr->data; if (!df) return 0; freader_cleanup(&df->freader); bpf_mem_free(&bpf_global_ma, df); bpf_dynptr_set_null(ptr); return 0; } __bpf_kfunc_end_defs(); static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) { struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */ bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */ } void bpf_task_work_cancel_and_free(void *val) { struct bpf_task_work_kern *twk = val; struct bpf_task_work_ctx *ctx; enum bpf_task_work_state state; ctx = xchg(&twk->ctx, NULL); if (!ctx) return; state = xchg(&ctx->state, BPF_TW_FREED); if (state == BPF_TW_SCHEDULED) { /* run in irq_work to avoid locks in NMI */ init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled); irq_work_queue(&ctx->irq_work); return; } bpf_task_work_ctx_put(ctx); /* put bpf map's ref */ } BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_add_impl) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) #endif #ifdef CONFIG_KEYS BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) #ifdef CONFIG_SYSTEM_DATA_VERIFICATION BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) #endif #endif BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { .owner = THIS_MODULE, .set = &generic_btf_ids, }; BTF_ID_LIST(generic_dtor_ids) BTF_ID(struct, task_struct) BTF_ID(func, bpf_task_release_dtor) #ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) BTF_ID(func, bpf_cgroup_release_dtor) #endif BTF_KFUNCS_START(common_btf_ids) BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL) BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL) BTF_ID_FLAGS(func, bpf_rcu_read_lock) BTF_ID_FLAGS(func, bpf_rcu_read_unlock) BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) #endif BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_dynptr_copy) BTF_ID_FLAGS(func, bpf_dynptr_memset) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif BTF_ID_FLAGS(func, bpf_wq_init) BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) BTF_ID_FLAGS(func, bpf_wq_start) BTF_ID_FLAGS(func, bpf_preempt_disable) BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_get_kmem_cache) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_local_irq_save) BTF_ID_FLAGS(func, bpf_local_irq_restore) #ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) #endif #ifdef CONFIG_DMA_SHARED_BUFFER BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) #endif BTF_ID_FLAGS(func, __bpf_trap) BTF_ID_FLAGS(func, bpf_strcmp); BTF_ID_FLAGS(func, bpf_strcasecmp); BTF_ID_FLAGS(func, bpf_strchr); BTF_ID_FLAGS(func, bpf_strchrnul); BTF_ID_FLAGS(func, bpf_strnchr); BTF_ID_FLAGS(func, bpf_strrchr); BTF_ID_FLAGS(func, bpf_strlen); BTF_ID_FLAGS(func, bpf_strnlen); BTF_ID_FLAGS(func, bpf_strspn); BTF_ID_FLAGS(func, bpf_strcspn); BTF_ID_FLAGS(func, bpf_strstr); BTF_ID_FLAGS(func, bpf_strcasestr); BTF_ID_FLAGS(func, bpf_strnstr); BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { .owner = THIS_MODULE, .set = &common_btf_ids, }; static int __init kfunc_init(void) { int ret; const struct btf_id_dtor_kfunc generic_dtors[] = { { .btf_id = generic_dtor_ids[0], .kfunc_btf_id = generic_dtor_ids[1] }, #ifdef CONFIG_CGROUPS { .btf_id = generic_dtor_ids[2], .kfunc_btf_id = generic_dtor_ids[3] }, #endif }; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), THIS_MODULE); return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); } late_initcall(kfunc_init); /* Get a pointer to dynptr data up to len bytes for read only access. If * the dynptr doesn't have continuous data up to len bytes, return NULL. */ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len) { const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr; return bpf_dynptr_slice(p, 0, NULL, len); } /* Get a pointer to dynptr data up to len bytes for read write access. If * the dynptr doesn't have continuous data up to len bytes, or the dynptr * is read only, return NULL. */ void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len) { if (__bpf_dynptr_is_rdonly(ptr)) return NULL; return (void *)__bpf_dynptr_data(ptr, len); } void bpf_map_free_internal_structs(struct bpf_map *map, void *val) { if (btf_record_has_field(map->record, BPF_TIMER)) bpf_obj_free_timer(map->record, val); if (btf_record_has_field(map->record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(map->record, val); if (btf_record_has_field(map->record, BPF_TASK_WORK)) bpf_obj_free_task_work(map->record, val); }
8 1 2 11 11 11 1 1 8 8 8 8 8 8 8 8 8 8 8 8 8 23 1 2 21 53 51 2 53 3 26 26 26 26 26 23 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,net type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Range as input support for IPv4 added */ /* 3 nomatch flag support added */ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ /* 7 skbinfo support added */ #define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,net"); /* Type specific function prefix */ #define HTYPE hash_ipportnet /* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 * However this way we have to store internally cidr - 1, * dancing back and forth. */ #define IP_SET_HASH_WITH_NETS_PACKED #define IP_SET_HASH_WITH_PROTO #define IP_SET_HASH_WITH_NETS /* IPv4 variant */ /* Member elements */ struct hash_ipportnet4_elem { __be32 ip; __be32 ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; }; /* Common functions */ static bool hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, const struct hash_ipportnet4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->ip2 == ip2->ip2 && ip1->cidr == ip2->cidr && ip1->port == ip2->port && ip1->proto == ip2->proto; } static int hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) { elem->ip2 &= ip_set_netmask(cidr); elem->cidr = cidr - 1; } static bool hash_ipportnet4_data_list(struct sk_buff *skb, const struct hash_ipportnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, const struct hash_ipportnet4_elem *d) { next->ip = d->ip; next->port = d->port; next->ip2 = d->ip2; } #define MTYPE hash_ipportnet4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); e.ip2 &= ip_set_netmask(e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); if (ret) return ret; if (tb[IPSET_ATTR_CIDR2]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) { e.ip = htonl(ip); e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1)); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); if (ret) return ret; if (ip2_from > ip2_to) swap(ip2_from, ip2_to); if (ip2_from + UINT_MAX == ip2_to) return -IPSET_ERR_HASH_RANGE; } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); } if (retried) { ip = ntohl(h->next.ip); p = ntohs(h->next.port); ip2 = ntohl(h->next.ip2); } else { p = port; ip2 = ip2_from; } for (; ip <= ip_to; ip++) { e.ip = htonl(ip); for (; p <= port_to; p++) { e.port = htons(p); do { i++; e.ip2 = htonl(ip2); ip2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr); e.cidr = cidr - 1; if (i > IPSET_MAX_RANGE) { hash_ipportnet4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } while (ip2++ < ip2_to); ip2 = ip2_from; } p = port; } return ret; } /* IPv6 variant */ struct hash_ipportnet6_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; }; /* Common functions */ static bool hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, const struct hash_ipportnet6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->cidr == ip2->cidr && ip1->port == ip2->port && ip1->proto == ip2->proto; } static int hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip2, cidr); elem->cidr = cidr - 1; } static bool hash_ipportnet6_data_list(struct sk_buff *skb, const struct hash_ipportnet6_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next, const struct hash_ipportnet6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipportnet6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipportnet6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); ip6_netmask(&e.ip2, e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipportnet6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; if (tb[IPSET_ATTR_CIDR2]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } ip6_netmask(&e.ip2, e.cidr + 1); e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_ipportnet_type __read_mostly = { .name = "hash:ip,port,net", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_THREE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipportnet_init(void) { return ip_set_type_register(&hash_ipportnet_type); } static void __exit hash_ipportnet_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipportnet_type); } module_init(hash_ipportnet_init); module_exit(hash_ipportnet_fini);
4827 5 3382 3378 19 3 15 1 5 1 2 5 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 // SPDX-License-Identifier: GPL-2.0-or-later /* * x86 instruction attribute tables * * Written by Masami Hiramatsu <mhiramat@redhat.com> */ #include <asm/insn.h> /* __ignore_sync_check__ */ /* Attribute tables are generated from opcode map */ #include "inat-tables.c" /* Attribute search APIs */ insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode) { return inat_primary_table[opcode]; } int inat_get_last_prefix_id(insn_byte_t last_pfx) { insn_attr_t lpfx_attr; lpfx_attr = inat_get_opcode_attribute(last_pfx); return inat_last_prefix_id(lpfx_attr); } insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id, insn_attr_t esc_attr) { const insn_attr_t *table; int n; n = inat_escape_id(esc_attr); table = inat_escape_tables[n][0]; if (!table) return 0; if (inat_has_variant(table[opcode]) && lpfx_id) { table = inat_escape_tables[n][lpfx_id]; if (!table) return 0; } return table[opcode]; } insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id, insn_attr_t grp_attr) { const insn_attr_t *table; int n; n = inat_group_id(grp_attr); table = inat_group_tables[n][0]; if (!table) return inat_group_common_attribute(grp_attr); if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) { table = inat_group_tables[n][lpfx_id]; if (!table) return inat_group_common_attribute(grp_attr); } return table[X86_MODRM_REG(modrm)] | inat_group_common_attribute(grp_attr); } insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_p) { const insn_attr_t *table; if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) return 0; /* At first, this checks the master table */ table = inat_avx_tables[vex_m][0]; if (!table) return 0; if (!inat_is_group(table[opcode]) && vex_p) { /* If this is not a group, get attribute directly */ table = inat_avx_tables[vex_m][vex_p]; if (!table) return 0; } return table[opcode]; } insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select) { const insn_attr_t *table; if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX) return 0; map_select -= X86_XOP_M_MIN; /* At first, this checks the master table */ table = inat_xop_tables[map_select]; if (!table) return 0; return table[opcode]; }
37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * This file holds USB constants and structures that are needed for * USB device APIs. These are used by the USB device model, which is * defined in chapter 9 of the USB 2.0 specification and in the * Wireless USB 1.0 spec (now defunct). Linux has several APIs in C that * need these: * * - the master/host side Linux-USB kernel driver API; * - the "usbfs" user space API; and * - the Linux "gadget" slave/device/peripheral side driver API. * * USB 2.0 adds an additional "On The Go" (OTG) mode, which lets systems * act either as a USB master/host or as a USB slave/device. That means * the master and slave side APIs benefit from working well together. * * Note all descriptors are declared '__attribute__((packed))' so that: * * [a] they never get padded, either internally (USB spec writers * probably handled that) or externally; * * [b] so that accessing bigger-than-a-bytes fields will never * generate bus errors on any platform, even when the location of * its descriptor inside a bundle isn't "naturally aligned", and * * [c] for consistency, removing all doubt even when it appears to * someone that the two other points are non-issues for that * particular descriptor type. */ #ifndef _UAPI__LINUX_USB_CH9_H #define _UAPI__LINUX_USB_CH9_H #include <linux/types.h> /* __u8 etc */ #include <asm/byteorder.h> /* le16_to_cpu */ /*-------------------------------------------------------------------------*/ /* CONTROL REQUEST SUPPORT */ /* * USB directions * * This bit flag is used in endpoint descriptors' bEndpointAddress field. * It's also one of three fields in control requests bRequestType. */ #define USB_DIR_OUT 0 /* to device */ #define USB_DIR_IN 0x80 /* to host */ /* * USB types, the second of three bRequestType fields */ #define USB_TYPE_MASK (0x03 << 5) #define USB_TYPE_STANDARD (0x00 << 5) #define USB_TYPE_CLASS (0x01 << 5) #define USB_TYPE_VENDOR (0x02 << 5) #define USB_TYPE_RESERVED (0x03 << 5) /* * USB recipients, the third of three bRequestType fields */ #define USB_RECIP_MASK 0x1f #define USB_RECIP_DEVICE 0x00 #define USB_RECIP_INTERFACE 0x01 #define USB_RECIP_ENDPOINT 0x02 #define USB_RECIP_OTHER 0x03 /* From Wireless USB 1.0 */ #define USB_RECIP_PORT 0x04 #define USB_RECIP_RPIPE 0x05 /* * Standard requests, for the bRequest field of a SETUP packet. * * These are qualified by the bRequestType field, so that for example * TYPE_CLASS or TYPE_VENDOR specific feature flags could be retrieved * by a GET_STATUS request. */ #define USB_REQ_GET_STATUS 0x00 #define USB_REQ_CLEAR_FEATURE 0x01 #define USB_REQ_SET_FEATURE 0x03 #define USB_REQ_SET_ADDRESS 0x05 #define USB_REQ_GET_DESCRIPTOR 0x06 #define USB_REQ_SET_DESCRIPTOR 0x07 #define USB_REQ_GET_CONFIGURATION 0x08 #define USB_REQ_SET_CONFIGURATION 0x09 #define USB_REQ_GET_INTERFACE 0x0A #define USB_REQ_SET_INTERFACE 0x0B #define USB_REQ_SYNCH_FRAME 0x0C #define USB_REQ_SET_SEL 0x30 #define USB_REQ_SET_ISOCH_DELAY 0x31 #define USB_REQ_SET_ENCRYPTION 0x0D /* Wireless USB */ #define USB_REQ_GET_ENCRYPTION 0x0E #define USB_REQ_RPIPE_ABORT 0x0E #define USB_REQ_SET_HANDSHAKE 0x0F #define USB_REQ_RPIPE_RESET 0x0F #define USB_REQ_GET_HANDSHAKE 0x10 #define USB_REQ_SET_CONNECTION 0x11 #define USB_REQ_SET_SECURITY_DATA 0x12 #define USB_REQ_GET_SECURITY_DATA 0x13 #define USB_REQ_SET_WUSB_DATA 0x14 #define USB_REQ_LOOPBACK_DATA_WRITE 0x15 #define USB_REQ_LOOPBACK_DATA_READ 0x16 #define USB_REQ_SET_INTERFACE_DS 0x17 /* specific requests for USB Power Delivery */ #define USB_REQ_GET_PARTNER_PDO 20 #define USB_REQ_GET_BATTERY_STATUS 21 #define USB_REQ_SET_PDO 22 #define USB_REQ_GET_VDM 23 #define USB_REQ_SEND_VDM 24 /* The Link Power Management (LPM) ECN defines USB_REQ_TEST_AND_SET command, * used by hubs to put ports into a new L1 suspend state, except that it * forgot to define its number ... */ /* * USB feature flags are written using USB_REQ_{CLEAR,SET}_FEATURE, and * are read as a bit array returned by USB_REQ_GET_STATUS. (So there * are at most sixteen features of each type.) Hubs may also support a * new USB_REQ_TEST_AND_SET_FEATURE to put ports into L1 suspend. */ #define USB_DEVICE_SELF_POWERED 0 /* (read only) */ #define USB_DEVICE_REMOTE_WAKEUP 1 /* dev may initiate wakeup */ #define USB_DEVICE_TEST_MODE 2 /* (wired high speed only) */ #define USB_DEVICE_BATTERY 2 /* (wireless) */ #define USB_DEVICE_B_HNP_ENABLE 3 /* (otg) dev may initiate HNP */ #define USB_DEVICE_WUSB_DEVICE 3 /* (wireless)*/ #define USB_DEVICE_A_HNP_SUPPORT 4 /* (otg) RH port supports HNP */ #define USB_DEVICE_A_ALT_HNP_SUPPORT 5 /* (otg) other RH port does */ #define USB_DEVICE_DEBUG_MODE 6 /* (special devices only) */ /* * Test Mode Selectors * See USB 2.0 spec Table 9-7 */ #define USB_TEST_J 1 #define USB_TEST_K 2 #define USB_TEST_SE0_NAK 3 #define USB_TEST_PACKET 4 #define USB_TEST_FORCE_ENABLE 5 /* Status Type */ #define USB_STATUS_TYPE_STANDARD 0 #define USB_STATUS_TYPE_PTM 1 /* * New Feature Selectors as added by USB 3.0 * See USB 3.0 spec Table 9-7 */ #define USB_DEVICE_U1_ENABLE 48 /* dev may initiate U1 transition */ #define USB_DEVICE_U2_ENABLE 49 /* dev may initiate U2 transition */ #define USB_DEVICE_LTM_ENABLE 50 /* dev may send LTM */ #define USB_INTRF_FUNC_SUSPEND 0 /* function suspend */ #define USB_INTR_FUNC_SUSPEND_OPT_MASK 0xFF00 /* * Suspend Options, Table 9-8 USB 3.0 spec */ #define USB_INTRF_FUNC_SUSPEND_LP (1 << (8 + 0)) #define USB_INTRF_FUNC_SUSPEND_RW (1 << (8 + 1)) /* * Interface status, Figure 9-5 USB 3.0 spec */ #define USB_INTRF_STAT_FUNC_RW_CAP 1 #define USB_INTRF_STAT_FUNC_RW 2 #define USB_ENDPOINT_HALT 0 /* IN/OUT will STALL */ /* Bit array elements as returned by the USB_REQ_GET_STATUS request. */ #define USB_DEV_STAT_U1_ENABLED 2 /* transition into U1 state */ #define USB_DEV_STAT_U2_ENABLED 3 /* transition into U2 state */ #define USB_DEV_STAT_LTM_ENABLED 4 /* Latency tolerance messages */ /* * Feature selectors from Table 9-8 USB Power Delivery spec */ #define USB_DEVICE_BATTERY_WAKE_MASK 40 #define USB_DEVICE_OS_IS_PD_AWARE 41 #define USB_DEVICE_POLICY_MODE 42 #define USB_PORT_PR_SWAP 43 #define USB_PORT_GOTO_MIN 44 #define USB_PORT_RETURN_POWER 45 #define USB_PORT_ACCEPT_PD_REQUEST 46 #define USB_PORT_REJECT_PD_REQUEST 47 #define USB_PORT_PORT_PD_RESET 48 #define USB_PORT_C_PORT_PD_CHANGE 49 #define USB_PORT_CABLE_PD_RESET 50 #define USB_DEVICE_CHARGING_POLICY 54 /** * struct usb_ctrlrequest - SETUP data for a USB device control request * @bRequestType: matches the USB bmRequestType field * @bRequest: matches the USB bRequest field * @wValue: matches the USB wValue field (le16 byte order) * @wIndex: matches the USB wIndex field (le16 byte order) * @wLength: matches the USB wLength field (le16 byte order) * * This structure is used to send control requests to a USB device. It matches * the different fields of the USB 2.0 Spec section 9.3, table 9-2. See the * USB spec for a fuller description of the different fields, and what they are * used for. * * Note that the driver for any interface can issue control requests. * For most devices, interfaces don't coordinate with each other, so * such requests may be made at any time. */ struct usb_ctrlrequest { __u8 bRequestType; __u8 bRequest; __le16 wValue; __le16 wIndex; __le16 wLength; } __attribute__ ((packed)); /*-------------------------------------------------------------------------*/ /* * STANDARD DESCRIPTORS ... as returned by GET_DESCRIPTOR, or * (rarely) accepted by SET_DESCRIPTOR. * * Note that all multi-byte values here are encoded in little endian * byte order "on the wire". Within the kernel and when exposed * through the Linux-USB APIs, they are not converted to cpu byte * order; it is the responsibility of the client code to do this. * The single exception is when device and configuration descriptors (but * not other descriptors) are read from character devices * (i.e. /dev/bus/usb/BBB/DDD); * in this case the fields are converted to host endianness by the kernel. */ /* * Descriptor types ... USB 2.0 spec table 9.5 */ #define USB_DT_DEVICE 0x01 #define USB_DT_CONFIG 0x02 #define USB_DT_STRING 0x03 #define USB_DT_INTERFACE 0x04 #define USB_DT_ENDPOINT 0x05 #define USB_DT_DEVICE_QUALIFIER 0x06 #define USB_DT_OTHER_SPEED_CONFIG 0x07 #define USB_DT_INTERFACE_POWER 0x08 /* these are from a minor usb 2.0 revision (ECN) */ #define USB_DT_OTG 0x09 #define USB_DT_DEBUG 0x0a #define USB_DT_INTERFACE_ASSOCIATION 0x0b /* these are from the Wireless USB spec */ #define USB_DT_SECURITY 0x0c #define USB_DT_KEY 0x0d #define USB_DT_ENCRYPTION_TYPE 0x0e #define USB_DT_BOS 0x0f #define USB_DT_DEVICE_CAPABILITY 0x10 #define USB_DT_WIRELESS_ENDPOINT_COMP 0x11 /* From the eUSB2 spec */ #define USB_DT_EUSB2_ISOC_ENDPOINT_COMP 0x12 /* From Wireless USB spec */ #define USB_DT_WIRE_ADAPTER 0x21 /* From USB Device Firmware Upgrade Specification, Revision 1.1 */ #define USB_DT_DFU_FUNCTIONAL 0x21 /* these are from the Wireless USB spec */ #define USB_DT_RPIPE 0x22 #define USB_DT_CS_RADIO_CONTROL 0x23 /* From the T10 UAS specification */ #define USB_DT_PIPE_USAGE 0x24 /* From the USB 3.0 spec */ #define USB_DT_SS_ENDPOINT_COMP 0x30 /* From the USB 3.1 spec */ #define USB_DT_SSP_ISOC_ENDPOINT_COMP 0x31 /* Conventional codes for class-specific descriptors. The convention is * defined in the USB "Common Class" Spec (3.11). Individual class specs * are authoritative for their usage, not the "common class" writeup. */ #define USB_DT_CS_DEVICE (USB_TYPE_CLASS | USB_DT_DEVICE) #define USB_DT_CS_CONFIG (USB_TYPE_CLASS | USB_DT_CONFIG) #define USB_DT_CS_STRING (USB_TYPE_CLASS | USB_DT_STRING) #define USB_DT_CS_INTERFACE (USB_TYPE_CLASS | USB_DT_INTERFACE) #define USB_DT_CS_ENDPOINT (USB_TYPE_CLASS | USB_DT_ENDPOINT) /* All standard descriptors have these 2 fields at the beginning */ struct usb_descriptor_header { __u8 bLength; __u8 bDescriptorType; } __attribute__ ((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_DEVICE: Device descriptor */ struct usb_device_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 bcdUSB; __u8 bDeviceClass; __u8 bDeviceSubClass; __u8 bDeviceProtocol; __u8 bMaxPacketSize0; __le16 idVendor; __le16 idProduct; __le16 bcdDevice; __u8 iManufacturer; __u8 iProduct; __u8 iSerialNumber; __u8 bNumConfigurations; } __attribute__ ((packed)); #define USB_DT_DEVICE_SIZE 18 /* * Device and/or Interface Class codes * as found in bDeviceClass or bInterfaceClass * and defined by www.usb.org documents */ #define USB_CLASS_PER_INTERFACE 0 /* for DeviceClass */ #define USB_CLASS_AUDIO 1 #define USB_CLASS_COMM 2 #define USB_CLASS_HID 3 #define USB_CLASS_PHYSICAL 5 #define USB_CLASS_STILL_IMAGE 6 #define USB_CLASS_PRINTER 7 #define USB_CLASS_MASS_STORAGE 8 #define USB_CLASS_HUB 9 #define USB_CLASS_CDC_DATA 0x0a #define USB_CLASS_CSCID 0x0b /* chip+ smart card */ #define USB_CLASS_CONTENT_SEC 0x0d /* content security */ #define USB_CLASS_VIDEO 0x0e #define USB_CLASS_WIRELESS_CONTROLLER 0xe0 #define USB_CLASS_PERSONAL_HEALTHCARE 0x0f #define USB_CLASS_AUDIO_VIDEO 0x10 #define USB_CLASS_BILLBOARD 0x11 #define USB_CLASS_USB_TYPE_C_BRIDGE 0x12 #define USB_CLASS_MCTP 0x14 #define USB_CLASS_MISC 0xef #define USB_CLASS_APP_SPEC 0xfe #define USB_SUBCLASS_DFU 0x01 #define USB_CLASS_VENDOR_SPEC 0xff #define USB_SUBCLASS_VENDOR_SPEC 0xff /*-------------------------------------------------------------------------*/ /* USB_DT_CONFIG: Configuration descriptor information. * * USB_DT_OTHER_SPEED_CONFIG is the same descriptor, except that the * descriptor type is different. Highspeed-capable devices can look * different depending on what speed they're currently running. Only * devices with a USB_DT_DEVICE_QUALIFIER have any OTHER_SPEED_CONFIG * descriptors. */ struct usb_config_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 wTotalLength; __u8 bNumInterfaces; __u8 bConfigurationValue; __u8 iConfiguration; __u8 bmAttributes; __u8 bMaxPower; } __attribute__ ((packed)); #define USB_DT_CONFIG_SIZE 9 /* from config descriptor bmAttributes */ #define USB_CONFIG_ATT_ONE (1 << 7) /* must be set */ #define USB_CONFIG_ATT_SELFPOWER (1 << 6) /* self powered */ #define USB_CONFIG_ATT_WAKEUP (1 << 5) /* can wakeup */ #define USB_CONFIG_ATT_BATTERY (1 << 4) /* battery powered */ /*-------------------------------------------------------------------------*/ /* USB String descriptors can contain at most 126 characters. */ #define USB_MAX_STRING_LEN 126 /* USB_DT_STRING: String descriptor */ struct usb_string_descriptor { __u8 bLength; __u8 bDescriptorType; union { __le16 legacy_padding; __DECLARE_FLEX_ARRAY(__le16, wData); /* UTF-16LE encoded */ }; } __attribute__ ((packed)); /* note that "string" zero is special, it holds language codes that * the device supports, not Unicode characters. */ /*-------------------------------------------------------------------------*/ /* USB_DT_INTERFACE: Interface descriptor */ struct usb_interface_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bInterfaceNumber; __u8 bAlternateSetting; __u8 bNumEndpoints; __u8 bInterfaceClass; __u8 bInterfaceSubClass; __u8 bInterfaceProtocol; __u8 iInterface; } __attribute__ ((packed)); #define USB_DT_INTERFACE_SIZE 9 /*-------------------------------------------------------------------------*/ /* USB_DT_ENDPOINT: Endpoint descriptor */ struct usb_endpoint_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bEndpointAddress; __u8 bmAttributes; __le16 wMaxPacketSize; __u8 bInterval; /* NOTE: these two are _only_ in audio endpoints. */ /* use USB_DT_ENDPOINT*_SIZE in bLength, not sizeof. */ __u8 bRefresh; __u8 bSynchAddress; } __attribute__ ((packed)); #define USB_DT_ENDPOINT_SIZE 7 #define USB_DT_ENDPOINT_AUDIO_SIZE 9 /* Audio extension */ /* * Endpoints */ #define USB_ENDPOINT_NUMBER_MASK 0x0f /* in bEndpointAddress */ #define USB_ENDPOINT_DIR_MASK 0x80 #define USB_ENDPOINT_XFERTYPE_MASK 0x03 /* in bmAttributes */ #define USB_ENDPOINT_XFER_CONTROL 0 #define USB_ENDPOINT_XFER_ISOC 1 #define USB_ENDPOINT_XFER_BULK 2 #define USB_ENDPOINT_XFER_INT 3 #define USB_ENDPOINT_MAX_ADJUSTABLE 0x80 #define USB_ENDPOINT_MAXP_MASK 0x07ff #define USB_EP_MAXP_MULT_SHIFT 11 #define USB_EP_MAXP_MULT_MASK (3 << USB_EP_MAXP_MULT_SHIFT) #define USB_EP_MAXP_MULT(m) \ (((m) & USB_EP_MAXP_MULT_MASK) >> USB_EP_MAXP_MULT_SHIFT) /* The USB 3.0 spec redefines bits 5:4 of bmAttributes as interrupt ep type. */ #define USB_ENDPOINT_INTRTYPE 0x30 #define USB_ENDPOINT_INTR_PERIODIC (0 << 4) #define USB_ENDPOINT_INTR_NOTIFICATION (1 << 4) #define USB_ENDPOINT_SYNCTYPE 0x0c #define USB_ENDPOINT_SYNC_NONE (0 << 2) #define USB_ENDPOINT_SYNC_ASYNC (1 << 2) #define USB_ENDPOINT_SYNC_ADAPTIVE (2 << 2) #define USB_ENDPOINT_SYNC_SYNC (3 << 2) #define USB_ENDPOINT_USAGE_MASK 0x30 #define USB_ENDPOINT_USAGE_DATA 0x00 #define USB_ENDPOINT_USAGE_FEEDBACK 0x10 #define USB_ENDPOINT_USAGE_IMPLICIT_FB 0x20 /* Implicit feedback Data endpoint */ /*-------------------------------------------------------------------------*/ /** * usb_endpoint_num - get the endpoint's number * @epd: endpoint to be checked * * Returns @epd's number: 0 to 15. */ static inline int usb_endpoint_num(const struct usb_endpoint_descriptor *epd) { return epd->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK; } /** * usb_endpoint_type - get the endpoint's transfer type * @epd: endpoint to be checked * * Returns one of USB_ENDPOINT_XFER_{CONTROL, ISOC, BULK, INT} according * to @epd's transfer type. */ static inline int usb_endpoint_type(const struct usb_endpoint_descriptor *epd) { return epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK; } /** * usb_endpoint_dir_in - check if the endpoint has IN direction * @epd: endpoint to be checked * * Returns true if the endpoint is of type IN, otherwise it returns false. */ static inline int usb_endpoint_dir_in(const struct usb_endpoint_descriptor *epd) { return ((epd->bEndpointAddress & USB_ENDPOINT_DIR_MASK) == USB_DIR_IN); } /** * usb_endpoint_dir_out - check if the endpoint has OUT direction * @epd: endpoint to be checked * * Returns true if the endpoint is of type OUT, otherwise it returns false. */ static inline int usb_endpoint_dir_out( const struct usb_endpoint_descriptor *epd) { return ((epd->bEndpointAddress & USB_ENDPOINT_DIR_MASK) == USB_DIR_OUT); } /** * usb_endpoint_xfer_bulk - check if the endpoint has bulk transfer type * @epd: endpoint to be checked * * Returns true if the endpoint is of type bulk, otherwise it returns false. */ static inline int usb_endpoint_xfer_bulk( const struct usb_endpoint_descriptor *epd) { return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_BULK); } /** * usb_endpoint_xfer_control - check if the endpoint has control transfer type * @epd: endpoint to be checked * * Returns true if the endpoint is of type control, otherwise it returns false. */ static inline int usb_endpoint_xfer_control( const struct usb_endpoint_descriptor *epd) { return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_CONTROL); } /** * usb_endpoint_xfer_int - check if the endpoint has interrupt transfer type * @epd: endpoint to be checked * * Returns true if the endpoint is of type interrupt, otherwise it returns * false. */ static inline int usb_endpoint_xfer_int( const struct usb_endpoint_descriptor *epd) { return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_INT); } /** * usb_endpoint_xfer_isoc - check if the endpoint has isochronous transfer type * @epd: endpoint to be checked * * Returns true if the endpoint is of type isochronous, otherwise it returns * false. */ static inline int usb_endpoint_xfer_isoc( const struct usb_endpoint_descriptor *epd) { return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_ISOC); } /** * usb_endpoint_is_bulk_in - check if the endpoint is bulk IN * @epd: endpoint to be checked * * Returns true if the endpoint has bulk transfer type and IN direction, * otherwise it returns false. */ static inline int usb_endpoint_is_bulk_in( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_bulk(epd) && usb_endpoint_dir_in(epd); } /** * usb_endpoint_is_bulk_out - check if the endpoint is bulk OUT * @epd: endpoint to be checked * * Returns true if the endpoint has bulk transfer type and OUT direction, * otherwise it returns false. */ static inline int usb_endpoint_is_bulk_out( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_bulk(epd) && usb_endpoint_dir_out(epd); } /** * usb_endpoint_is_int_in - check if the endpoint is interrupt IN * @epd: endpoint to be checked * * Returns true if the endpoint has interrupt transfer type and IN direction, * otherwise it returns false. */ static inline int usb_endpoint_is_int_in( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_int(epd) && usb_endpoint_dir_in(epd); } /** * usb_endpoint_is_int_out - check if the endpoint is interrupt OUT * @epd: endpoint to be checked * * Returns true if the endpoint has interrupt transfer type and OUT direction, * otherwise it returns false. */ static inline int usb_endpoint_is_int_out( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_int(epd) && usb_endpoint_dir_out(epd); } /** * usb_endpoint_is_isoc_in - check if the endpoint is isochronous IN * @epd: endpoint to be checked * * Returns true if the endpoint has isochronous transfer type and IN direction, * otherwise it returns false. */ static inline int usb_endpoint_is_isoc_in( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_isoc(epd) && usb_endpoint_dir_in(epd); } /** * usb_endpoint_is_isoc_out - check if the endpoint is isochronous OUT * @epd: endpoint to be checked * * Returns true if the endpoint has isochronous transfer type and OUT direction, * otherwise it returns false. */ static inline int usb_endpoint_is_isoc_out( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_isoc(epd) && usb_endpoint_dir_out(epd); } /** * usb_endpoint_maxp - get endpoint's max packet size * @epd: endpoint to be checked * * Returns @epd's max packet bits [10:0] */ static inline int usb_endpoint_maxp(const struct usb_endpoint_descriptor *epd) { return __le16_to_cpu(epd->wMaxPacketSize) & USB_ENDPOINT_MAXP_MASK; } /** * usb_endpoint_maxp_mult - get endpoint's transactional opportunities * @epd: endpoint to be checked * * Return @epd's wMaxPacketSize[12:11] + 1 */ static inline int usb_endpoint_maxp_mult(const struct usb_endpoint_descriptor *epd) { int maxp = __le16_to_cpu(epd->wMaxPacketSize); return USB_EP_MAXP_MULT(maxp) + 1; } static inline int usb_endpoint_interrupt_type( const struct usb_endpoint_descriptor *epd) { return epd->bmAttributes & USB_ENDPOINT_INTRTYPE; } /*-------------------------------------------------------------------------*/ /* USB_DT_EUSB2_ISOC_ENDPOINT_COMP: eUSB2 Isoch Endpoint Companion descriptor */ struct usb_eusb2_isoc_ep_comp_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 wMaxPacketSize; __le32 dwBytesPerInterval; } __attribute__ ((packed)); #define USB_DT_EUSB2_ISOC_EP_COMP_SIZE 8 /*-------------------------------------------------------------------------*/ /* USB_DT_SSP_ISOC_ENDPOINT_COMP: SuperSpeedPlus Isochronous Endpoint Companion * descriptor */ struct usb_ssp_isoc_ep_comp_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 wReseved; __le32 dwBytesPerInterval; } __attribute__ ((packed)); #define USB_DT_SSP_ISOC_EP_COMP_SIZE 8 /*-------------------------------------------------------------------------*/ /* USB_DT_SS_ENDPOINT_COMP: SuperSpeed Endpoint Companion descriptor */ struct usb_ss_ep_comp_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bMaxBurst; __u8 bmAttributes; __le16 wBytesPerInterval; } __attribute__ ((packed)); #define USB_DT_SS_EP_COMP_SIZE 6 /* Bits 4:0 of bmAttributes if this is a bulk endpoint */ static inline int usb_ss_max_streams(const struct usb_ss_ep_comp_descriptor *comp) { int max_streams; if (!comp) return 0; max_streams = comp->bmAttributes & 0x1f; if (!max_streams) return 0; max_streams = 1 << max_streams; return max_streams; } /* Bits 1:0 of bmAttributes if this is an isoc endpoint */ #define USB_SS_MULT(p) (1 + ((p) & 0x3)) /* Bit 7 of bmAttributes if a SSP isoc endpoint companion descriptor exists */ #define USB_SS_SSP_ISOC_COMP(p) ((p) & (1 << 7)) /*-------------------------------------------------------------------------*/ /* USB_DT_DEVICE_QUALIFIER: Device Qualifier descriptor */ struct usb_qualifier_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 bcdUSB; __u8 bDeviceClass; __u8 bDeviceSubClass; __u8 bDeviceProtocol; __u8 bMaxPacketSize0; __u8 bNumConfigurations; __u8 bRESERVED; } __attribute__ ((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_OTG (from OTG 1.0a supplement) */ struct usb_otg_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bmAttributes; /* support for HNP, SRP, etc */ } __attribute__ ((packed)); /* USB_DT_OTG (from OTG 2.0 supplement) */ struct usb_otg20_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bmAttributes; /* support for HNP, SRP and ADP, etc */ __le16 bcdOTG; /* OTG and EH supplement release number * in binary-coded decimal(i.e. 2.0 is 0200H) */ } __attribute__ ((packed)); /* from usb_otg_descriptor.bmAttributes */ #define USB_OTG_SRP (1 << 0) #define USB_OTG_HNP (1 << 1) /* swap host/device roles */ #define USB_OTG_ADP (1 << 2) /* support ADP */ /* OTG 3.0 */ #define USB_OTG_RSP (1 << 3) /* support RSP */ #define OTG_STS_SELECTOR 0xF000 /* OTG status selector */ /*-------------------------------------------------------------------------*/ /* USB_DT_DEBUG: for special highspeed devices, replacing serial console */ struct usb_debug_descriptor { __u8 bLength; __u8 bDescriptorType; /* bulk endpoints with 8 byte maxpacket */ __u8 bDebugInEndpoint; __u8 bDebugOutEndpoint; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_INTERFACE_ASSOCIATION: groups interfaces */ struct usb_interface_assoc_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bFirstInterface; __u8 bInterfaceCount; __u8 bFunctionClass; __u8 bFunctionSubClass; __u8 bFunctionProtocol; __u8 iFunction; } __attribute__ ((packed)); #define USB_DT_INTERFACE_ASSOCIATION_SIZE 8 /*-------------------------------------------------------------------------*/ /* USB_DT_SECURITY: group of wireless security descriptors, including * encryption types available for setting up a CC/association. */ struct usb_security_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 wTotalLength; __u8 bNumEncryptionTypes; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_KEY: used with {GET,SET}_SECURITY_DATA; only public keys * may be retrieved. */ struct usb_key_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 tTKID[3]; __u8 bReserved; __u8 bKeyData[]; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_ENCRYPTION_TYPE: bundled in DT_SECURITY groups */ struct usb_encryption_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bEncryptionType; #define USB_ENC_TYPE_UNSECURE 0 #define USB_ENC_TYPE_WIRED 1 /* non-wireless mode */ #define USB_ENC_TYPE_CCM_1 2 /* aes128/cbc session */ #define USB_ENC_TYPE_RSA_1 3 /* rsa3072/sha1 auth */ __u8 bEncryptionValue; /* use in SET_ENCRYPTION */ __u8 bAuthKeyIndex; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_BOS: group of device-level capabilities */ struct usb_bos_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 wTotalLength; __u8 bNumDeviceCaps; } __attribute__((packed)); #define USB_DT_BOS_SIZE 5 /*-------------------------------------------------------------------------*/ /* USB_DT_DEVICE_CAPABILITY: grouped with BOS */ struct usb_dev_cap_header { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; } __attribute__((packed)); #define USB_CAP_TYPE_WIRELESS_USB 1 struct usb_wireless_cap_descriptor { /* Ultra Wide Band */ __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bmAttributes; #define USB_WIRELESS_P2P_DRD (1 << 1) #define USB_WIRELESS_BEACON_MASK (3 << 2) #define USB_WIRELESS_BEACON_SELF (1 << 2) #define USB_WIRELESS_BEACON_DIRECTED (2 << 2) #define USB_WIRELESS_BEACON_NONE (3 << 2) __le16 wPHYRates; /* bit rates, Mbps */ #define USB_WIRELESS_PHY_53 (1 << 0) /* always set */ #define USB_WIRELESS_PHY_80 (1 << 1) #define USB_WIRELESS_PHY_107 (1 << 2) /* always set */ #define USB_WIRELESS_PHY_160 (1 << 3) #define USB_WIRELESS_PHY_200 (1 << 4) /* always set */ #define USB_WIRELESS_PHY_320 (1 << 5) #define USB_WIRELESS_PHY_400 (1 << 6) #define USB_WIRELESS_PHY_480 (1 << 7) __u8 bmTFITXPowerInfo; /* TFI power levels */ __u8 bmFFITXPowerInfo; /* FFI power levels */ __le16 bmBandGroup; __u8 bReserved; } __attribute__((packed)); #define USB_DT_USB_WIRELESS_CAP_SIZE 11 /* USB 2.0 Extension descriptor */ #define USB_CAP_TYPE_EXT 2 struct usb_ext_cap_descriptor { /* Link Power Management */ __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __le32 bmAttributes; #define USB_LPM_SUPPORT (1 << 1) /* supports LPM */ #define USB_BESL_SUPPORT (1 << 2) /* supports BESL */ #define USB_BESL_BASELINE_VALID (1 << 3) /* Baseline BESL valid*/ #define USB_BESL_DEEP_VALID (1 << 4) /* Deep BESL valid */ #define USB_SET_BESL_BASELINE(p) (((p) & 0xf) << 8) #define USB_SET_BESL_DEEP(p) (((p) & 0xf) << 12) #define USB_GET_BESL_BASELINE(p) (((p) & (0xf << 8)) >> 8) #define USB_GET_BESL_DEEP(p) (((p) & (0xf << 12)) >> 12) } __attribute__((packed)); #define USB_DT_USB_EXT_CAP_SIZE 7 /* * SuperSpeed USB Capability descriptor: Defines the set of SuperSpeed USB * specific device level capabilities */ #define USB_SS_CAP_TYPE 3 struct usb_ss_cap_descriptor { /* Link Power Management */ __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bmAttributes; #define USB_LTM_SUPPORT (1 << 1) /* supports LTM */ __le16 wSpeedSupported; #define USB_LOW_SPEED_OPERATION (1) /* Low speed operation */ #define USB_FULL_SPEED_OPERATION (1 << 1) /* Full speed operation */ #define USB_HIGH_SPEED_OPERATION (1 << 2) /* High speed operation */ #define USB_5GBPS_OPERATION (1 << 3) /* Operation at 5Gbps */ __u8 bFunctionalitySupport; __u8 bU1devExitLat; __le16 bU2DevExitLat; } __attribute__((packed)); #define USB_DT_USB_SS_CAP_SIZE 10 /* * Container ID Capability descriptor: Defines the instance unique ID used to * identify the instance across all operating modes */ #define CONTAINER_ID_TYPE 4 struct usb_ss_container_id_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bReserved; __u8 ContainerID[16]; /* 128-bit number */ } __attribute__((packed)); #define USB_DT_USB_SS_CONTN_ID_SIZE 20 /* * Platform Device Capability descriptor: Defines platform specific device * capabilities */ #define USB_PLAT_DEV_CAP_TYPE 5 struct usb_plat_dev_cap_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bReserved; __u8 UUID[16]; __u8 CapabilityData[]; } __attribute__((packed)); #define USB_DT_USB_PLAT_DEV_CAP_SIZE(capability_data_size) (20 + capability_data_size) /* * SuperSpeed Plus USB Capability descriptor: Defines the set of * SuperSpeed Plus USB specific device level capabilities */ #define USB_SSP_CAP_TYPE 0xa struct usb_ssp_cap_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bReserved; __le32 bmAttributes; #define USB_SSP_SUBLINK_SPEED_ATTRIBS (0x1f << 0) /* sublink speed entries */ #define USB_SSP_SUBLINK_SPEED_IDS (0xf << 5) /* speed ID entries */ __le16 wFunctionalitySupport; #define USB_SSP_MIN_SUBLINK_SPEED_ATTRIBUTE_ID (0xf) #define USB_SSP_MIN_RX_LANE_COUNT (0xf << 8) #define USB_SSP_MIN_TX_LANE_COUNT (0xf << 12) __le16 wReserved; union { __le32 legacy_padding; /* list of sublink speed attrib entries */ __DECLARE_FLEX_ARRAY(__le32, bmSublinkSpeedAttr); }; #define USB_SSP_SUBLINK_SPEED_SSID (0xf) /* sublink speed ID */ #define USB_SSP_SUBLINK_SPEED_LSE (0x3 << 4) /* Lanespeed exponent */ #define USB_SSP_SUBLINK_SPEED_LSE_BPS 0 #define USB_SSP_SUBLINK_SPEED_LSE_KBPS 1 #define USB_SSP_SUBLINK_SPEED_LSE_MBPS 2 #define USB_SSP_SUBLINK_SPEED_LSE_GBPS 3 #define USB_SSP_SUBLINK_SPEED_ST (0x3 << 6) /* Sublink type */ #define USB_SSP_SUBLINK_SPEED_ST_SYM_RX 0 #define USB_SSP_SUBLINK_SPEED_ST_ASYM_RX 1 #define USB_SSP_SUBLINK_SPEED_ST_SYM_TX 2 #define USB_SSP_SUBLINK_SPEED_ST_ASYM_TX 3 #define USB_SSP_SUBLINK_SPEED_RSVD (0x3f << 8) /* Reserved */ #define USB_SSP_SUBLINK_SPEED_LP (0x3 << 14) /* Link protocol */ #define USB_SSP_SUBLINK_SPEED_LP_SS 0 #define USB_SSP_SUBLINK_SPEED_LP_SSP 1 #define USB_SSP_SUBLINK_SPEED_LSM (0xff << 16) /* Lanespeed mantissa */ } __attribute__((packed)); /* * USB Power Delivery Capability Descriptor: * Defines capabilities for PD */ /* Defines the various PD Capabilities of this device */ #define USB_PD_POWER_DELIVERY_CAPABILITY 0x06 /* Provides information on each battery supported by the device */ #define USB_PD_BATTERY_INFO_CAPABILITY 0x07 /* The Consumer characteristics of a Port on the device */ #define USB_PD_PD_CONSUMER_PORT_CAPABILITY 0x08 /* The provider characteristics of a Port on the device */ #define USB_PD_PD_PROVIDER_PORT_CAPABILITY 0x09 struct usb_pd_cap_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; /* set to USB_PD_POWER_DELIVERY_CAPABILITY */ __u8 bReserved; __le32 bmAttributes; #define USB_PD_CAP_BATTERY_CHARGING (1 << 1) /* supports Battery Charging specification */ #define USB_PD_CAP_USB_PD (1 << 2) /* supports USB Power Delivery specification */ #define USB_PD_CAP_PROVIDER (1 << 3) /* can provide power */ #define USB_PD_CAP_CONSUMER (1 << 4) /* can consume power */ #define USB_PD_CAP_CHARGING_POLICY (1 << 5) /* supports CHARGING_POLICY feature */ #define USB_PD_CAP_TYPE_C_CURRENT (1 << 6) /* supports power capabilities defined in the USB Type-C Specification */ #define USB_PD_CAP_PWR_AC (1 << 8) #define USB_PD_CAP_PWR_BAT (1 << 9) #define USB_PD_CAP_PWR_USE_V_BUS (1 << 14) __le16 bmProviderPorts; /* Bit zero refers to the UFP of the device */ __le16 bmConsumerPorts; __le16 bcdBCVersion; __le16 bcdPDVersion; __le16 bcdUSBTypeCVersion; } __attribute__((packed)); struct usb_pd_cap_battery_info_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; /* Index of string descriptor shall contain the user friendly name for this battery */ __u8 iBattery; /* Index of string descriptor shall contain the Serial Number String for this battery */ __u8 iSerial; __u8 iManufacturer; __u8 bBatteryId; /* uniquely identifies this battery in status Messages */ __u8 bReserved; /* * Shall contain the Battery Charge value above which this * battery is considered to be fully charged but not necessarily * “topped off.” */ __le32 dwChargedThreshold; /* in mWh */ /* * Shall contain the minimum charge level of this battery such * that above this threshold, a device can be assured of being * able to power up successfully (see Battery Charging 1.2). */ __le32 dwWeakThreshold; /* in mWh */ __le32 dwBatteryDesignCapacity; /* in mWh */ __le32 dwBatteryLastFullchargeCapacity; /* in mWh */ } __attribute__((packed)); struct usb_pd_cap_consumer_port_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bReserved; __u8 bmCapabilities; /* port will oerate under: */ #define USB_PD_CAP_CONSUMER_BC (1 << 0) /* BC */ #define USB_PD_CAP_CONSUMER_PD (1 << 1) /* PD */ #define USB_PD_CAP_CONSUMER_TYPE_C (1 << 2) /* USB Type-C Current */ __le16 wMinVoltage; /* in 50mV units */ __le16 wMaxVoltage; /* in 50mV units */ __u16 wReserved; __le32 dwMaxOperatingPower; /* in 10 mW - operating at steady state */ __le32 dwMaxPeakPower; /* in 10mW units - operating at peak power */ __le32 dwMaxPeakPowerTime; /* in 100ms units - duration of peak */ #define USB_PD_CAP_CONSUMER_UNKNOWN_PEAK_POWER_TIME 0xffff } __attribute__((packed)); struct usb_pd_cap_provider_port_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bReserved1; __u8 bmCapabilities; /* port will oerate under: */ #define USB_PD_CAP_PROVIDER_BC (1 << 0) /* BC */ #define USB_PD_CAP_PROVIDER_PD (1 << 1) /* PD */ #define USB_PD_CAP_PROVIDER_TYPE_C (1 << 2) /* USB Type-C Current */ __u8 bNumOfPDObjects; __u8 bReserved2; __le32 wPowerDataObject[]; } __attribute__((packed)); /* * Precision time measurement capability descriptor: advertised by devices and * hubs that support PTM */ #define USB_PTM_CAP_TYPE 0xb struct usb_ptm_cap_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; } __attribute__((packed)); #define USB_DT_USB_PTM_ID_SIZE 3 /* * The size of the descriptor for the Sublink Speed Attribute Count * (SSAC) specified in bmAttributes[4:0]. SSAC is zero-based */ #define USB_DT_USB_SSP_CAP_SIZE(ssac) (12 + (ssac + 1) * 4) /*-------------------------------------------------------------------------*/ /* USB_DT_WIRELESS_ENDPOINT_COMP: companion descriptor associated with * each endpoint descriptor for a wireless device */ struct usb_wireless_ep_comp_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bMaxBurst; __u8 bMaxSequence; __le16 wMaxStreamDelay; __le16 wOverTheAirPacketSize; __u8 bOverTheAirInterval; __u8 bmCompAttributes; #define USB_ENDPOINT_SWITCH_MASK 0x03 /* in bmCompAttributes */ #define USB_ENDPOINT_SWITCH_NO 0 #define USB_ENDPOINT_SWITCH_SWITCH 1 #define USB_ENDPOINT_SWITCH_SCALE 2 } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_REQ_SET_HANDSHAKE is a four-way handshake used between a wireless * host and a device for connection set up, mutual authentication, and * exchanging short lived session keys. The handshake depends on a CC. */ struct usb_handshake { __u8 bMessageNumber; __u8 bStatus; __u8 tTKID[3]; __u8 bReserved; __u8 CDID[16]; __u8 nonce[16]; __u8 MIC[8]; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_REQ_SET_CONNECTION modifies or revokes a connection context (CC). * A CC may also be set up using non-wireless secure channels (including * wired USB!), and some devices may support CCs with multiple hosts. */ struct usb_connection_context { __u8 CHID[16]; /* persistent host id */ __u8 CDID[16]; /* device id (unique w/in host context) */ __u8 CK[16]; /* connection key */ } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB 2.0 defines three speeds, here's how Linux identifies them */ enum usb_device_speed { USB_SPEED_UNKNOWN = 0, /* enumerating */ USB_SPEED_LOW, USB_SPEED_FULL, /* usb 1.1 */ USB_SPEED_HIGH, /* usb 2.0 */ USB_SPEED_WIRELESS, /* wireless (usb 2.5) */ USB_SPEED_SUPER, /* usb 3.0 */ USB_SPEED_SUPER_PLUS, /* usb 3.1 */ }; enum usb_device_state { /* NOTATTACHED isn't in the USB spec, and this state acts * the same as ATTACHED ... but it's clearer this way. */ USB_STATE_NOTATTACHED = 0, /* chapter 9 and authentication (wireless) device states */ USB_STATE_ATTACHED, USB_STATE_POWERED, /* wired */ USB_STATE_RECONNECTING, /* auth */ USB_STATE_UNAUTHENTICATED, /* auth */ USB_STATE_DEFAULT, /* limited function */ USB_STATE_ADDRESS, USB_STATE_CONFIGURED, /* most functions */ USB_STATE_SUSPENDED /* NOTE: there are actually four different SUSPENDED * states, returning to POWERED, DEFAULT, ADDRESS, or * CONFIGURED respectively when SOF tokens flow again. * At this level there's no difference between L1 and L2 * suspend states. (L2 being original USB 1.1 suspend.) */ }; enum usb3_link_state { USB3_LPM_U0 = 0, USB3_LPM_U1, USB3_LPM_U2, USB3_LPM_U3 }; /* * A U1 timeout of 0x0 means the parent hub will reject any transitions to U1. * 0xff means the parent hub will accept transitions to U1, but will not * initiate a transition. * * A U1 timeout of 0x1 to 0x7F also causes the hub to initiate a transition to * U1 after that many microseconds. Timeouts of 0x80 to 0xFE are reserved * values. * * A U2 timeout of 0x0 means the parent hub will reject any transitions to U2. * 0xff means the parent hub will accept transitions to U2, but will not * initiate a transition. * * A U2 timeout of 0x1 to 0xFE also causes the hub to initiate a transition to * U2 after N*256 microseconds. Therefore a U2 timeout value of 0x1 means a U2 * idle timer of 256 microseconds, 0x2 means 512 microseconds, 0xFE means * 65.024ms. */ #define USB3_LPM_DISABLED 0x0 #define USB3_LPM_U1_MAX_TIMEOUT 0x7F #define USB3_LPM_U2_MAX_TIMEOUT 0xFE #define USB3_LPM_DEVICE_INITIATED 0xFF struct usb_set_sel_req { __u8 u1_sel; __u8 u1_pel; __le16 u2_sel; __le16 u2_pel; } __attribute__ ((packed)); /* * The Set System Exit Latency control transfer provides one byte each for * U1 SEL and U1 PEL, so the max exit latency is 0xFF. U2 SEL and U2 PEL each * are two bytes long. */ #define USB3_LPM_MAX_U1_SEL_PEL 0xFF #define USB3_LPM_MAX_U2_SEL_PEL 0xFFFF /*-------------------------------------------------------------------------*/ /* * As per USB compliance update, a device that is actively drawing * more than 100mA from USB must report itself as bus-powered in * the GetStatus(DEVICE) call. * https://compliance.usb.org/index.asp?UpdateFile=Electrical&Format=Standard#34 */ #define USB_SELF_POWER_VBUS_MAX_DRAW 100 #endif /* _UAPI__LINUX_USB_CH9_H */
66 64 48 64 64 66 66 66 66 66 66 66 64 48 48 48 64 48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 // SPDX-License-Identifier: GPL-2.0 OR MIT /* * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation */ #include <linux/types.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <asm/cpufeature.h> #include <asm/processor.h> static __always_inline u64 eq_mask(u64 a, u64 b) { u64 x = a ^ b; u64 minus_x = ~x + (u64)1U; u64 x_or_minus_x = x | minus_x; u64 xnx = x_or_minus_x >> (u32)63U; return xnx - (u64)1U; } static __always_inline u64 gte_mask(u64 a, u64 b) { u64 x = a; u64 y = b; u64 x_xor_y = x ^ y; u64 x_sub_y = x - y; u64 x_sub_y_xor_y = x_sub_y ^ y; u64 q = x_xor_y | x_sub_y_xor_y; u64 x_xor_q = x ^ q; u64 x_xor_q_ = x_xor_q >> (u32)63U; return x_xor_q_ - (u64)1U; } /* Computes the addition of four-element f1 with value in f2 * and returns the carry (if any) */ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2) { u64 carry_r; asm volatile( /* Clear registers to propagate the carry bit */ " xor %%r8d, %%r8d;" " xor %%r9d, %%r9d;" " xor %%r10d, %%r10d;" " xor %%r11d, %%r11d;" " xor %k1, %k1;" /* Begin addition chain */ " addq 0(%3), %0;" " movq %0, 0(%2);" " adcxq 8(%3), %%r8;" " movq %%r8, 8(%2);" " adcxq 16(%3), %%r9;" " movq %%r9, 16(%2);" " adcxq 24(%3), %%r10;" " movq %%r10, 24(%2);" /* Return the carry bit in a register */ " adcx %%r11, %1;" : "+&r"(f2), "=&r"(carry_r) : "r"(out), "r"(f1) : "%r8", "%r9", "%r10", "%r11", "memory", "cc"); return carry_r; } /* Computes the field addition of two field elements */ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2) { asm volatile( /* Compute the raw addition of f1 + f2 */ " movq 0(%0), %%r8;" " addq 0(%2), %%r8;" " movq 8(%0), %%r9;" " adcxq 8(%2), %%r9;" " movq 16(%0), %%r10;" " adcxq 16(%2), %%r10;" " movq 24(%0), %%r11;" " adcxq 24(%2), %%r11;" /* Wrap the result back into the field */ /* Step 1: Compute carry*38 */ " mov $0, %%rax;" " mov $38, %0;" " cmovc %0, %%rax;" /* Step 2: Add carry*38 to the original sum */ " xor %%ecx, %%ecx;" " add %%rax, %%r8;" " adcx %%rcx, %%r9;" " movq %%r9, 8(%1);" " adcx %%rcx, %%r10;" " movq %%r10, 16(%1);" " adcx %%rcx, %%r11;" " movq %%r11, 24(%1);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %0, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%1);" : "+&r"(f2) : "r"(out), "r"(f1) : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"); } /* Computes the field subtraction of two field elements */ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2) { asm volatile( /* Compute the raw subtraction of f1-f2 */ " movq 0(%1), %%r8;" " subq 0(%2), %%r8;" " movq 8(%1), %%r9;" " sbbq 8(%2), %%r9;" " movq 16(%1), %%r10;" " sbbq 16(%2), %%r10;" " movq 24(%1), %%r11;" " sbbq 24(%2), %%r11;" /* Wrap the result back into the field */ /* Step 1: Compute carry*38 */ " mov $0, %%rax;" " mov $38, %%rcx;" " cmovc %%rcx, %%rax;" /* Step 2: Subtract carry*38 from the original difference */ " sub %%rax, %%r8;" " sbb $0, %%r9;" " sbb $0, %%r10;" " sbb $0, %%r11;" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rcx, %%rax;" " sub %%rax, %%r8;" /* Store the result */ " movq %%r8, 0(%0);" " movq %%r9, 8(%0);" " movq %%r10, 16(%0);" " movq %%r11, 24(%0);" : : "r"(out), "r"(f1), "r"(f2) : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"); } /* Computes a field multiplication: out <- f1 * f2 * Uses the 8-element buffer tmp for intermediate results */ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) { asm volatile( /* Compute the raw multiplication: tmp <- src1 * src2 */ /* Compute src1[0] * src2 */ " movq 0(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" /* Compute src1[1] * src2 */ " movq 8(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%2), %%r8;" " movq %%r8, 8(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[2] * src2 */ " movq 16(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%2), %%r8;" " movq %%r8, 16(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[3] * src2 */ " movq 24(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%2), %%r8;" " movq %%r8, 24(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%2);" " mov $0, %%r8;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%2);" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%2);" /* Line up pointers */ " mov %2, %0;" " mov %3, %2;" /* Wrap the result back into the field */ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%0), %%r8, %%r13;" " xor %k1, %k1;" " adoxq 0(%0), %%r8;" " mulxq 40(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" " adoxq 8(%0), %%r9;" " mulxq 48(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" " adoxq 16(%0), %%r10;" " mulxq 56(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 24(%0), %%r11;" " adcx %1, %%rax;" " adox %1, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %1, %%r9;" " movq %%r9, 8(%2);" " adcx %1, %%r10;" " movq %%r10, 16(%2);" " adcx %1, %%r11;" " movq %%r11, 24(%2);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%2);" : "+&r"(f1), "+&r"(f2), "+&r"(tmp) : "r"(out) : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "memory", "cc"); } /* Computes two field multiplications: * out[0] <- f1[0] * f2[0] * out[1] <- f1[1] * f2[1] * Uses the 16-element buffer tmp for intermediate results: */ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) { asm volatile( /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */ /* Compute src1[0] * src2 */ " movq 0(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" /* Compute src1[1] * src2 */ " movq 8(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%2), %%r8;" " movq %%r8, 8(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[2] * src2 */ " movq 16(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%2), %%r8;" " movq %%r8, 16(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[3] * src2 */ " movq 24(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%2), %%r8;" " movq %%r8, 24(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%2);" " mov $0, %%r8;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%2);" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%2);" /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */ /* Compute src1[0] * src2 */ " movq 32(%0), %%rdx;" " mulxq 32(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%2);" " mulxq 40(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%2);" " mulxq 48(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " mulxq 56(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" /* Compute src1[1] * src2 */ " movq 40(%0), %%rdx;" " mulxq 32(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%2), %%r8;" " movq %%r8, 72(%2);" " mulxq 40(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%2);" " mulxq 48(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 56(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[2] * src2 */ " movq 48(%0), %%rdx;" " mulxq 32(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%2), %%r8;" " movq %%r8, 80(%2);" " mulxq 40(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%2);" " mulxq 48(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 56(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[3] * src2 */ " movq 56(%0), %%rdx;" " mulxq 32(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%2), %%r8;" " movq %%r8, 88(%2);" " mulxq 40(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%2);" " mulxq 48(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%2);" " mov $0, %%r8;" " mulxq 56(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%2);" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%2);" /* Line up pointers */ " mov %2, %0;" " mov %3, %2;" /* Wrap the results back into the field */ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%0), %%r8, %%r13;" " xor %k1, %k1;" " adoxq 0(%0), %%r8;" " mulxq 40(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" " adoxq 8(%0), %%r9;" " mulxq 48(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" " adoxq 16(%0), %%r10;" " mulxq 56(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 24(%0), %%r11;" " adcx %1, %%rax;" " adox %1, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %1, %%r9;" " movq %%r9, 8(%2);" " adcx %1, %%r10;" " movq %%r10, 16(%2);" " adcx %1, %%r11;" " movq %%r11, 24(%2);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%2);" /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 96(%0), %%r8, %%r13;" " xor %k1, %k1;" " adoxq 64(%0), %%r8;" " mulxq 104(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" " adoxq 72(%0), %%r9;" " mulxq 112(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" " adoxq 80(%0), %%r10;" " mulxq 120(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 88(%0), %%r11;" " adcx %1, %%rax;" " adox %1, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %1, %%r9;" " movq %%r9, 40(%2);" " adcx %1, %%r10;" " movq %%r10, 48(%2);" " adcx %1, %%r11;" " movq %%r11, 56(%2);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 32(%2);" : "+&r"(f1), "+&r"(f2), "+&r"(tmp) : "r"(out) : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "memory", "cc"); } /* Computes the field multiplication of four-element f1 with value in f2 * Requires f2 to be smaller than 2^17 */ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2) { register u64 f2_r asm("rdx") = f2; asm volatile( /* Compute the raw multiplication of f1*f2 */ " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */ " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */ " add %%rcx, %%r9;" " mov $0, %%rcx;" " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */ " adcx %%rbx, %%r10;" " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */ " adcx %%r13, %%r11;" " adcx %%rcx, %%rax;" /* Wrap the result back into the field */ /* Step 1: Compute carry*38 */ " mov $38, %%rdx;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %%rcx, %%r9;" " movq %%r9, 8(%1);" " adcx %%rcx, %%r10;" " movq %%r10, 16(%1);" " adcx %%rcx, %%r11;" " movq %%r11, 24(%1);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%1);" : "+&r"(f2_r) : "r"(out), "r"(f1) : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13", "memory", "cc"); } /* Computes p1 <- bit ? p2 : p1 in constant time */ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2) { asm volatile( /* Transfer bit into CF flag */ " add $18446744073709551615, %0;" /* cswap p1[0], p2[0] */ " movq 0(%1), %%r8;" " movq 0(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 0(%1);" " movq %%r9, 0(%2);" /* cswap p1[1], p2[1] */ " movq 8(%1), %%r8;" " movq 8(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 8(%1);" " movq %%r9, 8(%2);" /* cswap p1[2], p2[2] */ " movq 16(%1), %%r8;" " movq 16(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 16(%1);" " movq %%r9, 16(%2);" /* cswap p1[3], p2[3] */ " movq 24(%1), %%r8;" " movq 24(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 24(%1);" " movq %%r9, 24(%2);" /* cswap p1[4], p2[4] */ " movq 32(%1), %%r8;" " movq 32(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 32(%1);" " movq %%r9, 32(%2);" /* cswap p1[5], p2[5] */ " movq 40(%1), %%r8;" " movq 40(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 40(%1);" " movq %%r9, 40(%2);" /* cswap p1[6], p2[6] */ " movq 48(%1), %%r8;" " movq 48(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 48(%1);" " movq %%r9, 48(%2);" /* cswap p1[7], p2[7] */ " movq 56(%1), %%r8;" " movq 56(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 56(%1);" " movq %%r9, 56(%2);" : "+&r"(bit) : "r"(p1), "r"(p2) : "%r8", "%r9", "%r10", "memory", "cc"); } /* Computes the square of a field element: out <- f * f * Uses the 8-element buffer tmp for intermediate results */ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) { asm volatile( /* Compute the raw multiplication: tmp <- f * f */ /* Step 1: Compute all partial products */ " movq 0(%0), %%rdx;" /* f[0] */ " mulxq 8(%0), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */ " mulxq 16(%0), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ " mulxq 24(%0), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ " movq 24(%0), %%rdx;" /* f[3] */ " mulxq 8(%0), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */ " mulxq 16(%0), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */ " movq 8(%0), %%rdx;" " adcx %%r15, %%r13;" /* f1 */ " mulxq 16(%0), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ /* Step 2: Compute two parallel carry chains */ " xor %%r15d, %%r15d;" " adox %%rax, %%r10;" " adcx %%r8, %%r8;" " adox %%rcx, %%r11;" " adcx %%r9, %%r9;" " adox %%r15, %%rbx;" " adcx %%r10, %%r10;" " adox %%r15, %%r13;" " adcx %%r11, %%r11;" " adox %%r15, %%r14;" " adcx %%rbx, %%rbx;" " adcx %%r13, %%r13;" " adcx %%r14, %%r14;" /* Step 3: Compute intermediate squares */ " movq 0(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ " movq %%rax, 0(%1);" " add %%rcx, %%r8;" " movq %%r8, 8(%1);" " movq 8(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ " adcx %%rax, %%r9;" " movq %%r9, 16(%1);" " adcx %%rcx, %%r10;" " movq %%r10, 24(%1);" " movq 16(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ " adcx %%rax, %%r11;" " movq %%r11, 32(%1);" " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%1);" " movq 24(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ " adcx %%rax, %%r13;" " movq %%r13, 48(%1);" " adcx %%rcx, %%r14;" " movq %%r14, 56(%1);" /* Line up pointers */ " mov %1, %0;" " mov %2, %1;" /* Wrap the result back into the field */ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%0), %%r8, %%r13;" " xor %%ecx, %%ecx;" " adoxq 0(%0), %%r8;" " mulxq 40(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" " adoxq 8(%0), %%r9;" " mulxq 48(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" " adoxq 16(%0), %%r10;" " mulxq 56(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 24(%0), %%r11;" " adcx %%rcx, %%rax;" " adox %%rcx, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %%rcx, %%r9;" " movq %%r9, 8(%1);" " adcx %%rcx, %%r10;" " movq %%r10, 16(%1);" " adcx %%rcx, %%r11;" " movq %%r11, 24(%1);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%1);" : "+&r"(f), "+&r"(tmp) : "r"(out) : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "%r15", "memory", "cc"); } /* Computes two field squarings: * out[0] <- f[0] * f[0] * out[1] <- f[1] * f[1] * Uses the 16-element buffer tmp for intermediate results */ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) { asm volatile( /* Step 1: Compute all partial products */ " movq 0(%0), %%rdx;" /* f[0] */ " mulxq 8(%0), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */ " mulxq 16(%0), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ " mulxq 24(%0), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ " movq 24(%0), %%rdx;" /* f[3] */ " mulxq 8(%0), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */ " mulxq 16(%0), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */ " movq 8(%0), %%rdx;" " adcx %%r15, %%r13;" /* f1 */ " mulxq 16(%0), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ /* Step 2: Compute two parallel carry chains */ " xor %%r15d, %%r15d;" " adox %%rax, %%r10;" " adcx %%r8, %%r8;" " adox %%rcx, %%r11;" " adcx %%r9, %%r9;" " adox %%r15, %%rbx;" " adcx %%r10, %%r10;" " adox %%r15, %%r13;" " adcx %%r11, %%r11;" " adox %%r15, %%r14;" " adcx %%rbx, %%rbx;" " adcx %%r13, %%r13;" " adcx %%r14, %%r14;" /* Step 3: Compute intermediate squares */ " movq 0(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ " movq %%rax, 0(%1);" " add %%rcx, %%r8;" " movq %%r8, 8(%1);" " movq 8(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ " adcx %%rax, %%r9;" " movq %%r9, 16(%1);" " adcx %%rcx, %%r10;" " movq %%r10, 24(%1);" " movq 16(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ " adcx %%rax, %%r11;" " movq %%r11, 32(%1);" " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%1);" " movq 24(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ " adcx %%rax, %%r13;" " movq %%r13, 48(%1);" " adcx %%rcx, %%r14;" " movq %%r14, 56(%1);" /* Step 1: Compute all partial products */ " movq 32(%0), %%rdx;" /* f[0] */ " mulxq 40(%0), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */ " mulxq 48(%0), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ " mulxq 56(%0), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ " movq 56(%0), %%rdx;" /* f[3] */ " mulxq 40(%0), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */ " mulxq 48(%0), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */ " movq 40(%0), %%rdx;" " adcx %%r15, %%r13;" /* f1 */ " mulxq 48(%0), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ /* Step 2: Compute two parallel carry chains */ " xor %%r15d, %%r15d;" " adox %%rax, %%r10;" " adcx %%r8, %%r8;" " adox %%rcx, %%r11;" " adcx %%r9, %%r9;" " adox %%r15, %%rbx;" " adcx %%r10, %%r10;" " adox %%r15, %%r13;" " adcx %%r11, %%r11;" " adox %%r15, %%r14;" " adcx %%rbx, %%rbx;" " adcx %%r13, %%r13;" " adcx %%r14, %%r14;" /* Step 3: Compute intermediate squares */ " movq 32(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ " movq %%rax, 64(%1);" " add %%rcx, %%r8;" " movq %%r8, 72(%1);" " movq 40(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ " adcx %%rax, %%r9;" " movq %%r9, 80(%1);" " adcx %%rcx, %%r10;" " movq %%r10, 88(%1);" " movq 48(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ " adcx %%rax, %%r11;" " movq %%r11, 96(%1);" " adcx %%rcx, %%rbx;" " movq %%rbx, 104(%1);" " movq 56(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ " adcx %%rax, %%r13;" " movq %%r13, 112(%1);" " adcx %%rcx, %%r14;" " movq %%r14, 120(%1);" /* Line up pointers */ " mov %1, %0;" " mov %2, %1;" /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%0), %%r8, %%r13;" " xor %%ecx, %%ecx;" " adoxq 0(%0), %%r8;" " mulxq 40(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" " adoxq 8(%0), %%r9;" " mulxq 48(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" " adoxq 16(%0), %%r10;" " mulxq 56(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 24(%0), %%r11;" " adcx %%rcx, %%rax;" " adox %%rcx, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %%rcx, %%r9;" " movq %%r9, 8(%1);" " adcx %%rcx, %%r10;" " movq %%r10, 16(%1);" " adcx %%rcx, %%r11;" " movq %%r11, 24(%1);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%1);" /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 96(%0), %%r8, %%r13;" " xor %%ecx, %%ecx;" " adoxq 64(%0), %%r8;" " mulxq 104(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" " adoxq 72(%0), %%r9;" " mulxq 112(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" " adoxq 80(%0), %%r10;" " mulxq 120(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 88(%0), %%r11;" " adcx %%rcx, %%rax;" " adox %%rcx, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %%rcx, %%r9;" " movq %%r9, 40(%1);" " adcx %%rcx, %%r10;" " movq %%r10, 48(%1);" " adcx %%rcx, %%r11;" " movq %%r11, 56(%1);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 32(%1);" : "+&r"(f), "+&r"(tmp) : "r"(out) : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "%r15", "memory", "cc"); } static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2) { u64 *nq = p01_tmp1; u64 *nq_p1 = p01_tmp1 + (u32)8U; u64 *tmp1 = p01_tmp1 + (u32)16U; u64 *x1 = q; u64 *x2 = nq; u64 *z2 = nq + (u32)4U; u64 *z3 = nq_p1 + (u32)4U; u64 *a = tmp1; u64 *b = tmp1 + (u32)4U; u64 *ab = tmp1; u64 *dc = tmp1 + (u32)8U; u64 *x3; u64 *z31; u64 *d0; u64 *c0; u64 *a1; u64 *b1; u64 *d; u64 *c; u64 *ab1; u64 *dc1; fadd(a, x2, z2); fsub(b, x2, z2); x3 = nq_p1; z31 = nq_p1 + (u32)4U; d0 = dc; c0 = dc + (u32)4U; fadd(c0, x3, z31); fsub(d0, x3, z31); fmul2(dc, dc, ab, tmp2); fadd(x3, d0, c0); fsub(z31, d0, c0); a1 = tmp1; b1 = tmp1 + (u32)4U; d = tmp1 + (u32)8U; c = tmp1 + (u32)12U; ab1 = tmp1; dc1 = tmp1 + (u32)8U; fsqr2(dc1, ab1, tmp2); fsqr2(nq_p1, nq_p1, tmp2); a1[0U] = c[0U]; a1[1U] = c[1U]; a1[2U] = c[2U]; a1[3U] = c[3U]; fsub(c, d, c); fmul_scalar(b1, c, (u64)121665U); fadd(b1, b1, d); fmul2(nq, dc1, ab1, tmp2); fmul(z3, z3, x1, tmp2); } static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2) { u64 *x2 = nq; u64 *z2 = nq + (u32)4U; u64 *a = tmp1; u64 *b = tmp1 + (u32)4U; u64 *d = tmp1 + (u32)8U; u64 *c = tmp1 + (u32)12U; u64 *ab = tmp1; u64 *dc = tmp1 + (u32)8U; fadd(a, x2, z2); fsub(b, x2, z2); fsqr2(dc, ab, tmp2); a[0U] = c[0U]; a[1U] = c[1U]; a[2U] = c[2U]; a[3U] = c[3U]; fsub(c, d, c); fmul_scalar(b, c, (u64)121665U); fadd(b, b, d); fmul2(nq, dc, ab, tmp2); } static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1) { u64 tmp2[16U] = { 0U }; u64 p01_tmp1_swap[33U] = { 0U }; u64 *p0 = p01_tmp1_swap; u64 *p01 = p01_tmp1_swap; u64 *p03 = p01; u64 *p11 = p01 + (u32)8U; u64 *x0; u64 *z0; u64 *p01_tmp1; u64 *p01_tmp11; u64 *nq10; u64 *nq_p11; u64 *swap1; u64 sw0; u64 *nq1; u64 *tmp1; memcpy(p11, init1, (u32)8U * sizeof(init1[0U])); x0 = p03; z0 = p03 + (u32)4U; x0[0U] = (u64)1U; x0[1U] = (u64)0U; x0[2U] = (u64)0U; x0[3U] = (u64)0U; z0[0U] = (u64)0U; z0[1U] = (u64)0U; z0[2U] = (u64)0U; z0[3U] = (u64)0U; p01_tmp1 = p01_tmp1_swap; p01_tmp11 = p01_tmp1_swap; nq10 = p01_tmp1_swap; nq_p11 = p01_tmp1_swap + (u32)8U; swap1 = p01_tmp1_swap + (u32)32U; cswap2((u64)1U, nq10, nq_p11); point_add_and_double(init1, p01_tmp11, tmp2); swap1[0U] = (u64)1U; { u32 i; for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) { u64 *p01_tmp12 = p01_tmp1_swap; u64 *swap2 = p01_tmp1_swap + (u32)32U; u64 *nq2 = p01_tmp12; u64 *nq_p12 = p01_tmp12 + (u32)8U; u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U); u64 sw = swap2[0U] ^ bit; cswap2(sw, nq2, nq_p12); point_add_and_double(init1, p01_tmp12, tmp2); swap2[0U] = bit; } } sw0 = swap1[0U]; cswap2(sw0, nq10, nq_p11); nq1 = p01_tmp1; tmp1 = p01_tmp1 + (u32)16U; point_double(nq1, tmp1, tmp2); point_double(nq1, tmp1, tmp2); point_double(nq1, tmp1, tmp2); memcpy(out, p0, (u32)8U * sizeof(p0[0U])); memzero_explicit(tmp2, sizeof(tmp2)); memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap)); } static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1) { u32 i; fsqr(o, inp, tmp); for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U) fsqr(o, o, tmp); } static void finv(u64 *o, const u64 *i, u64 *tmp) { u64 t1[16U] = { 0U }; u64 *a0 = t1; u64 *b = t1 + (u32)4U; u64 *c = t1 + (u32)8U; u64 *t00 = t1 + (u32)12U; u64 *tmp1 = tmp; u64 *a; u64 *t0; fsquare_times(a0, i, tmp1, (u32)1U); fsquare_times(t00, a0, tmp1, (u32)2U); fmul(b, t00, i, tmp); fmul(a0, b, a0, tmp); fsquare_times(t00, a0, tmp1, (u32)1U); fmul(b, t00, b, tmp); fsquare_times(t00, b, tmp1, (u32)5U); fmul(b, t00, b, tmp); fsquare_times(t00, b, tmp1, (u32)10U); fmul(c, t00, b, tmp); fsquare_times(t00, c, tmp1, (u32)20U); fmul(t00, t00, c, tmp); fsquare_times(t00, t00, tmp1, (u32)10U); fmul(b, t00, b, tmp); fsquare_times(t00, b, tmp1, (u32)50U); fmul(c, t00, b, tmp); fsquare_times(t00, c, tmp1, (u32)100U); fmul(t00, t00, c, tmp); fsquare_times(t00, t00, tmp1, (u32)50U); fmul(t00, t00, b, tmp); fsquare_times(t00, t00, tmp1, (u32)5U); a = t1; t0 = t1 + (u32)12U; fmul(o, t0, a, tmp); } static void store_felem(u64 *b, u64 *f) { u64 f30 = f[3U]; u64 top_bit0 = f30 >> (u32)63U; u64 f31; u64 top_bit; u64 f0; u64 f1; u64 f2; u64 f3; u64 m0; u64 m1; u64 m2; u64 m3; u64 mask; u64 f0_; u64 f1_; u64 f2_; u64 f3_; u64 o0; u64 o1; u64 o2; u64 o3; f[3U] = f30 & (u64)0x7fffffffffffffffU; add_scalar(f, f, (u64)19U * top_bit0); f31 = f[3U]; top_bit = f31 >> (u32)63U; f[3U] = f31 & (u64)0x7fffffffffffffffU; add_scalar(f, f, (u64)19U * top_bit); f0 = f[0U]; f1 = f[1U]; f2 = f[2U]; f3 = f[3U]; m0 = gte_mask(f0, (u64)0xffffffffffffffedU); m1 = eq_mask(f1, (u64)0xffffffffffffffffU); m2 = eq_mask(f2, (u64)0xffffffffffffffffU); m3 = eq_mask(f3, (u64)0x7fffffffffffffffU); mask = ((m0 & m1) & m2) & m3; f0_ = f0 - (mask & (u64)0xffffffffffffffedU); f1_ = f1 - (mask & (u64)0xffffffffffffffffU); f2_ = f2 - (mask & (u64)0xffffffffffffffffU); f3_ = f3 - (mask & (u64)0x7fffffffffffffffU); o0 = f0_; o1 = f1_; o2 = f2_; o3 = f3_; b[0U] = o0; b[1U] = o1; b[2U] = o2; b[3U] = o3; } static void encode_point(u8 *o, const u64 *i) { const u64 *x = i; const u64 *z = i + (u32)4U; u64 tmp[4U] = { 0U }; u64 tmp_w[16U] = { 0U }; finv(tmp, z, tmp_w); fmul(tmp, tmp, x, tmp_w); store_felem((u64 *)o, tmp); } static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub) { u64 init1[8U] = { 0U }; u64 tmp[4U] = { 0U }; u64 tmp3; u64 *x; u64 *z; { u32 i; for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) { u64 *os = tmp; const u8 *bj = pub + i * (u32)8U; u64 u = *(u64 *)bj; u64 r = u; u64 x0 = r; os[i] = x0; } } tmp3 = tmp[3U]; tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU; x = init1; z = init1 + (u32)4U; z[0U] = (u64)1U; z[1U] = (u64)0U; z[2U] = (u64)0U; z[3U] = (u64)0U; x[0U] = tmp[0U]; x[1U] = tmp[1U]; x[2U] = tmp[2U]; x[3U] = tmp[3U]; montgomery_ladder(init1, priv, init1); encode_point(out, init1); } /* The below constants were generated using this sage script: * * #!/usr/bin/env sage * import sys * from sage.all import * * def limbs(n): * n = int(n) * l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64) * return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0]) * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0] * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s)) * print("static const u64 table_ladder[] = {") * p = ec.lift_x(9) * for i in range(252): * l = (p[0] + p[2]) / (p[0] - p[2]) * print(("\t%s" + ("," if i != 251 else "")) % limbs(l)) * p = p * 2 * print("};") * */ static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL }; static const u64 table_ladder[] = { 0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL, 0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL, 0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL, 0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL, 0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL, 0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL, 0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL, 0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL, 0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL, 0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL, 0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL, 0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL, 0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL, 0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL, 0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL, 0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL, 0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL, 0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL, 0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL, 0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL, 0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL, 0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL, 0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL, 0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL, 0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL, 0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL, 0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL, 0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL, 0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL, 0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL, 0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL, 0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL, 0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL, 0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL, 0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL, 0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL, 0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL, 0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL, 0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL, 0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL, 0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL, 0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL, 0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL, 0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL, 0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL, 0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL, 0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL, 0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL, 0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL, 0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL, 0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL, 0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL, 0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL, 0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL, 0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL, 0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL, 0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL, 0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL, 0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL, 0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL, 0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL, 0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL, 0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL, 0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL, 0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL, 0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL, 0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL, 0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL, 0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL, 0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL, 0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL, 0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL, 0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL, 0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL, 0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL, 0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL, 0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL, 0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL, 0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL, 0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL, 0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL, 0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL, 0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL, 0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL, 0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL, 0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL, 0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL, 0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL, 0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL, 0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL, 0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL, 0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL, 0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL, 0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL, 0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL, 0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL, 0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL, 0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL, 0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL, 0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL, 0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL, 0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL, 0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL, 0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL, 0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL, 0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL, 0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL, 0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL, 0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL, 0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL, 0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL, 0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL, 0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL, 0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL, 0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL, 0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL, 0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL, 0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL, 0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL, 0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL, 0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL, 0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL, 0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL, 0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL, 0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL, 0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL, 0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL, 0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL, 0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL, 0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL, 0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL, 0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL, 0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL, 0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL, 0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL, 0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL, 0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL, 0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL, 0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL, 0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL, 0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL, 0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL, 0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL, 0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL, 0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL, 0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL, 0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL, 0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL, 0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL, 0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL, 0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL, 0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL, 0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL, 0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL, 0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL, 0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL, 0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL, 0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL, 0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL, 0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL, 0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL, 0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL, 0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL, 0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL, 0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL, 0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL, 0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL, 0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL, 0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL, 0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL, 0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL, 0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL, 0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL, 0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL, 0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL, 0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL, 0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL, 0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL, 0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL, 0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL, 0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL, 0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL, 0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL, 0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL, 0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL, 0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL, 0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL, 0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL, 0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL, 0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL, 0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL, 0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL, 0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL, 0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL, 0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL, 0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL, 0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL, 0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL, 0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL, 0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL, 0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL, 0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL, 0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL, 0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL, 0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL, 0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL, 0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL, 0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL, 0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL, 0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL, 0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL, 0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL, 0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL, 0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL, 0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL, 0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL, 0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL, 0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL, 0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL, 0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL, 0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL, 0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL, 0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL, 0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL, 0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL, 0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL, 0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL, 0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL, 0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL, 0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL, 0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL, 0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL, 0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL, 0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL, 0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL, 0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL, 0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL, 0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL, 0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL, 0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL, 0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL, 0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL, 0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL, 0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL, 0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL, 0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL, 0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL, 0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL, 0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL, 0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL, 0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL, 0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL }; static void curve25519_ever64_base(u8 *out, const u8 *priv) { u64 swap = 1; int i, j, k; u64 tmp[16 + 32 + 4]; u64 *x1 = &tmp[0]; u64 *z1 = &tmp[4]; u64 *x2 = &tmp[8]; u64 *z2 = &tmp[12]; u64 *xz1 = &tmp[0]; u64 *xz2 = &tmp[8]; u64 *a = &tmp[0 + 16]; u64 *b = &tmp[4 + 16]; u64 *c = &tmp[8 + 16]; u64 *ab = &tmp[0 + 16]; u64 *abcd = &tmp[0 + 16]; u64 *ef = &tmp[16 + 16]; u64 *efgh = &tmp[16 + 16]; u64 *key = &tmp[0 + 16 + 32]; memcpy(key, priv, 32); ((u8 *)key)[0] &= 248; ((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64; x1[0] = 1, x1[1] = x1[2] = x1[3] = 0; z1[0] = 1, z1[1] = z1[2] = z1[3] = 0; z2[0] = 1, z2[1] = z2[2] = z2[3] = 0; memcpy(x2, p_minus_s, sizeof(p_minus_s)); j = 3; for (i = 0; i < 4; ++i) { while (j < (const int[]){ 64, 64, 64, 63 }[i]) { u64 bit = (key[i] >> j) & 1; k = (64 * i + j - 3); swap = swap ^ bit; cswap2(swap, xz1, xz2); swap = bit; fsub(b, x1, z1); fadd(a, x1, z1); fmul(c, &table_ladder[4 * k], b, ef); fsub(b, a, c); fadd(a, a, c); fsqr2(ab, ab, efgh); fmul2(xz1, xz2, ab, efgh); ++j; } j = 0; } point_double(xz1, abcd, efgh); point_double(xz1, abcd, efgh); point_double(xz1, abcd, efgh); encode_point(out, xz1); memzero_explicit(tmp, sizeof(tmp)); } static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx); static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE], const u8 basepoint[CURVE25519_KEY_SIZE]) { if (static_branch_likely(&curve25519_use_bmi2_adx)) curve25519_ever64(mypublic, secret, basepoint); else curve25519_generic(mypublic, secret, basepoint); } static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE]) { if (static_branch_likely(&curve25519_use_bmi2_adx)) curve25519_ever64_base(pub, secret); else curve25519_generic(pub, secret, curve25519_base_point); } #define curve25519_mod_init_arch curve25519_mod_init_arch static void curve25519_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX)) static_branch_enable(&curve25519_use_bmi2_adx); }
48 1 38 3 6 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 // SPDX-License-Identifier: GPL-2.0-only /* iptables module for the packet checksum mangling * * (C) 2002 by Harald Welte <laforge@netfilter.org> * (C) 2010 Red Hat, Inc. * * Author: Michael S. Tsirkin <mst@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_CHECKSUM.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michael S. Tsirkin <mst@redhat.com>"); MODULE_DESCRIPTION("Xtables: checksum modification"); MODULE_ALIAS("ipt_CHECKSUM"); MODULE_ALIAS("ip6t_CHECKSUM"); static unsigned int checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) { if (skb->ip_summed == CHECKSUM_PARTIAL && !skb_is_gso(skb)) skb_checksum_help(skb); return XT_CONTINUE; } static int checksum_tg_check(const struct xt_tgchk_param *par) { const struct xt_CHECKSUM_info *einfo = par->targinfo; const struct ip6t_ip6 *i6 = par->entryinfo; const struct ipt_ip *i4 = par->entryinfo; if (einfo->operation & ~XT_CHECKSUM_OP_FILL) { pr_info_ratelimited("unsupported CHECKSUM operation %x\n", einfo->operation); return -EINVAL; } if (!einfo->operation) return -EINVAL; switch (par->family) { case NFPROTO_IPV4: if (i4->proto == IPPROTO_UDP && (i4->invflags & XT_INV_PROTO) == 0) return 0; break; case NFPROTO_IPV6: if ((i6->flags & IP6T_F_PROTO) && i6->proto == IPPROTO_UDP && (i6->invflags & XT_INV_PROTO) == 0) return 0; break; } pr_warn_once("CHECKSUM should be avoided. If really needed, restrict with \"-p udp\" and only use in OUTPUT\n"); return 0; } static struct xt_target checksum_tg_reg[] __read_mostly = { { .name = "CHECKSUM", .family = NFPROTO_IPV4, .target = checksum_tg, .targetsize = sizeof(struct xt_CHECKSUM_info), .table = "mangle", .checkentry = checksum_tg_check, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "CHECKSUM", .family = NFPROTO_IPV6, .target = checksum_tg, .targetsize = sizeof(struct xt_CHECKSUM_info), .table = "mangle", .checkentry = checksum_tg_check, .me = THIS_MODULE, }, #endif }; static int __init checksum_tg_init(void) { return xt_register_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg)); } static void __exit checksum_tg_exit(void) { xt_unregister_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg)); } module_init(checksum_tg_init); module_exit(checksum_tg_exit);
8552 483 10 7 310 305 10 10 10 10 87 87 87 2 72 5 5 776 17 760 5907 5912 5871 5663 218 562 213 5 181 7 34 3 183 41 673 16 657 647 28 8 672 670 3 639 41 34 645 673 667 8 8 674 673 2 674 674 673 674 674 674 674 674 41 645 674 674 1687 1689 1690 966 804 915 10 873 1688 1689 1688 1100 1076 713 103 619 713 681 49 713 52 674 71 608 673 635 44 674 674 674 645 674 674 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #include <linux/dcache.h> #include <linux/fs.h> #include <linux/gfp.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/srcu.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" /* * Clear all of the marks on an inode when it is being evicted from core */ void __fsnotify_inode_delete(struct inode *inode) { fsnotify_clear_marks_by_inode(inode); } EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); void __fsnotify_vfsmount_delete(struct vfsmount *mnt) { fsnotify_clear_marks_by_mount(mnt); } void __fsnotify_mntns_delete(struct mnt_namespace *mntns) { fsnotify_clear_marks_by_mntns(mntns); } /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @sb: superblock being unmounted. * * Called during unmount with no locks held, so needs to be safe against * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ static void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *iput_inode = NULL; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. */ spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } /* * If i_count is zero, the inode cannot have any watches and * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. * However, we should have been called /after/ evict_inodes * removed all zero refcount inodes, in any case. Test to * be sure. */ if (!icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); /* for each watch, send FS_UNMOUNT and then remove it */ fsnotify_inode(inode, FS_UNMOUNT); fsnotify_inode_delete(inode); iput_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); } void fsnotify_sb_delete(struct super_block *sb) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); /* Were any marks ever added to any object on this sb? */ if (!sbinfo) return; fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ wait_var_event(fsnotify_sb_watched_objects(sb), !atomic_long_read(fsnotify_sb_watched_objects(sb))); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT)); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT)); } void fsnotify_sb_free(struct super_block *sb) { kfree(sb->s_fsnotify_info); } /* * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. */ void fsnotify_set_children_dentry_flags(struct inode *inode) { struct dentry *alias; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { struct dentry *child; /* run all of the children of the original inode and fix their * d_flags to indicate parental interest (their parent is the * original inode) */ spin_lock(&alias->d_lock); hlist_for_each_entry(child, &alias->d_children, d_sib) { if (!child->d_inode) continue; spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&child->d_lock); } spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); } /* * Lazily clear false positive PARENT_WATCHED flag for child whose parent had * stopped watching children. */ static void fsnotify_clear_child_dentry_flag(struct inode *pinode, struct dentry *dentry) { spin_lock(&dentry->d_lock); /* * d_lock is a sufficient barrier to prevent observing a non-watched * parent state from before the fsnotify_set_children_dentry_flags() * or fsnotify_update_flags() call that had set PARENT_WATCHED. */ if (!fsnotify_inode_watches_children(pinode)) dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&dentry->d_lock); } /* Are inode/sb/mount interested in parent and name info with this event? */ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = 0; /* We only send parent/name to inode/sb/mount for events on non-dir */ if (mask & FS_ISDIR) return false; /* * All events that are possible on child can also may be reported with * parent/name info to inode/sb/mount. Otherwise, a watching parent * could result in events reported with unexpected name info to sb/mount. */ BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT); /* Did either inode/sb/mount subscribe for events with parent/name? */ marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_sb->s_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask(mnt_mask); /* Did they subscribe for this event with parent/name info? */ return mask & marks_mask; } /* Are there any inode/mount/sb objects that watch for these events? */ static inline __u32 fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask | READ_ONCE(inode->i_sb->s_fsnotify_mask); return mask & marks_mask & ALL_FSNOTIFY_EVENTS; } /* Report pre-content event with optional range info */ int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count) { struct file_range range; /* Report page aligned range only when pos is known */ if (!ppos) return fsnotify_path(path, FS_PRE_ACCESS); range.path = path; range.pos = PAGE_ALIGN_DOWN(*ppos); range.count = PAGE_ALIGN(*ppos + count) - range.pos; return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range, FSNOTIFY_EVENT_FILE_RANGE); } /* * Notify this dentry's parent about a child's events with child name info * if parent is watching or if inode/sb/mount are interested in events with * parent and name info. * * Notify only the child without name info if parent is not watching and * inode/sb/mount are not interested in events with parent and name info. */ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { const struct path *path = fsnotify_data_path(data, data_type); __u32 mnt_mask = path ? READ_ONCE(real_mount(path->mnt)->mnt_fsnotify_mask) : 0; struct inode *inode = d_inode(dentry); struct dentry *parent; bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED; bool parent_needed, parent_interested; __u32 p_mask; struct inode *p_inode = NULL; struct name_snapshot name; struct qstr *file_name = NULL; int ret = 0; /* Optimize the likely case of nobody watching this path */ if (likely(!parent_watched && !fsnotify_object_watched(inode, mnt_mask, mask))) return 0; parent = NULL; parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask); if (!parent_watched && !parent_needed) goto notify; /* Does parent inode care about events on children? */ parent = dget_parent(dentry); p_inode = parent->d_inode; p_mask = fsnotify_inode_watches_children(p_inode); if (unlikely(parent_watched && !p_mask)) fsnotify_clear_child_dentry_flag(p_inode, dentry); /* * Include parent/name in notification either if some notification * groups require parent info or the parent is interested in this event. * The parent interest in ACCESS/MODIFY events does not apply to special * files, where read/write are not on the filesystem of the parent and * events can provide an undesirable side-channel for information * exfiltration. */ parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS && !(data_type == FSNOTIFY_EVENT_PATH && d_is_special(dentry) && (mask & (FS_ACCESS | FS_MODIFY))); if (parent_needed || parent_interested) { /* When notifying parent, child should be passed as data */ WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type)); /* Notify both parent and child with child name info */ take_dentry_name_snapshot(&name, dentry); file_name = &name.name; if (parent_interested) mask |= FS_EVENT_ON_CHILD; } notify: ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0); if (file_name) release_dentry_name_snapshot(&name); dput(parent); return ret; } EXPORT_SYMBOL_GPL(__fsnotify_parent); static int fsnotify_handle_inode_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct inode *inode = fsnotify_data_inode(data, data_type); const struct fsnotify_ops *ops = group->ops; if (WARN_ON_ONCE(!ops->handle_inode_event)) return 0; if (WARN_ON_ONCE(!inode && !dir)) return 0; if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) && path && d_unlinked(path->dentry)) return 0; /* Check interest of this mark in case event was sent with two marks */ if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)) return 0; return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie); } static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info); int ret; if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; /* * For FS_RENAME, 'dir' is old dir and 'data' is new dentry. * The only ->handle_inode_event() backend that supports FS_RENAME is * dnotify, where it means file was renamed within same parent. */ if (mask & FS_RENAME) { struct dentry *moved = fsnotify_data_dentry(data, data_type); if (dir != moved->d_parent->d_inode) return 0; } if (parent_mark) { ret = fsnotify_handle_inode_event(group, parent_mark, mask, data, data_type, dir, name, 0); if (ret) return ret; } if (!inode_mark) return 0; /* * Some events can be sent on both parent dir and child marks (e.g. * FS_ATTRIB). If both parent dir and child are watching, report the * event once to parent dir with name (if interested) and once to child * without name (if interested). * * In any case regardless whether the parent is watching or not, the * child watcher is expecting an event without the FS_EVENT_ON_CHILD * flag. The file name is expected if and only if this is a directory * event. */ mask &= ~FS_EVENT_ON_CHILD; if (!(mask & ALL_FSNOTIFY_DIRENT_EVENTS)) { dir = NULL; name = NULL; } return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, dir, name, cookie); } static int send_to_group(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; __u32 marks_ignore_mask = 0; bool is_dir = mask & FS_ISDIR; struct fsnotify_mark *mark; int type; if (!iter_info->report_mask) return 0; /* clear ignored on inode modification */ if (mask & FS_MODIFY) { fsnotify_foreach_iter_mark_type(iter_info, mark, type) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mark->ignore_mask = 0; } } /* Are any of the group marks interested in this event? */ fsnotify_foreach_iter_mark_type(iter_info, mark, type) { group = mark->group; marks_mask |= mark->mask; marks_ignore_mask |= fsnotify_effective_ignore_mask(mark, is_dir, type); } pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", __func__, group, mask, marks_mask, marks_ignore_mask, data, data_type, dir, cookie); if (!(test_mask & marks_mask & ~marks_ignore_mask)) return 0; if (group->ops->handle_event) { return group->ops->handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } return fsnotify_handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp) { struct fsnotify_mark_connector *conn; struct hlist_node *node = NULL; conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (conn) node = srcu_dereference(conn->list.first, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark) { struct hlist_node *node = NULL; if (mark) node = srcu_dereference(mark->obj_list.next, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } /* * iter_info is a multi head priority queue of marks. * Pick a subset of marks from queue heads, all with the same group * and set the report_mask to a subset of the selected marks. * Returns false if there are no more groups to iterate. */ static bool fsnotify_iter_select_report_types( struct fsnotify_iter_info *iter_info) { struct fsnotify_group *max_prio_group = NULL; struct fsnotify_mark *mark; int type; /* Choose max prio group among groups of all queue heads */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && fsnotify_compare_groups(max_prio_group, mark->group) > 0) max_prio_group = mark->group; } if (!max_prio_group) return false; /* Set the report mask for marks from same group as max prio group */ iter_info->current_group = max_prio_group; iter_info->report_mask = 0; fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) { /* * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode * is watching children and interested in this event, * which is an event possible on child. * But is *this mark* watching children? */ if (type == FSNOTIFY_ITER_TYPE_PARENT && !(mark->mask & FS_EVENT_ON_CHILD) && !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD)) continue; fsnotify_iter_set_report_type(iter_info, type); } } return true; } /* * Pop from iter_info multi head queue, the marks that belong to the group of * current iteration step. */ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *mark; int type; /* * We cannot use fsnotify_foreach_iter_mark_type() here because we * may need to advance a mark of type X that belongs to current_group * but was not selected for reporting. */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) iter_info->marks[type] = fsnotify_next_mark(iter_info->marks[type]); } } /* * fsnotify - This is the main call to fsnotify. * * The VFS calls into hook specific functions in linux/fsnotify.h. * Those functions then in turn call here. Here will call out to all of the * registered fsnotify_group. Those groups can then use the notification event * in whatever means they feel necessary. * * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @inode: optional inode associated with event - * If @dir and @inode are both non-NULL, event may be * reported to both. * @cookie: inotify rename cookie */ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, struct inode *inode, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); struct fsnotify_sb_info *sbinfo = sb ? fsnotify_sb_info(sb) : NULL; struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; struct dentry *moved; int inode2_type; int ret = 0; __u32 test_mask, marks_mask = 0; if (path) mnt = real_mount(path->mnt); if (!inode) { /* Dirent event - report on TYPE_INODE to dir */ inode = dir; /* For FS_RENAME, inode is old_dir and inode2 is new_dir */ if (mask & FS_RENAME) { moved = fsnotify_data_dentry(data, data_type); inode2 = moved->d_parent->d_inode; inode2_type = FSNOTIFY_ITER_TYPE_INODE2; } } else if (mask & FS_EVENT_ON_CHILD) { /* * Event on child - report on TYPE_PARENT to dir if it is * watching children and on TYPE_INODE to child. */ inode2 = dir; inode2_type = FSNOTIFY_ITER_TYPE_PARENT; } /* * Optimization: srcu_read_lock() has a memory barrier which can * be expensive. It protects walking the *_fsnotify_marks lists. * However, if we do not walk the lists, we do not have to do * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && (!inode2 || !inode2->i_fsnotify_marks) && (!mnt_data || !mnt_data->ns->n_fsnotify_marks)) return 0; if (sb) marks_mask |= READ_ONCE(sb->s_fsnotify_mask); if (mnt) marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask); if (inode) marks_mask |= READ_ONCE(inode->i_fsnotify_mask); if (inode2) marks_mask |= READ_ONCE(inode2->i_fsnotify_mask); if (mnt_data) marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask); /* * If this is a modify event we may need to clear some ignore masks. * In that case, the object with ignore masks will have the FS_MODIFY * event in its mask. * Otherwise, return if none of the marks care about this type of event. */ test_mask = (mask & ALL_FSNOTIFY_EVENTS); if (!(test_mask & marks_mask)) return 0; iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); if (sbinfo) { iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = fsnotify_first_mark(&sbinfo->sb_marks); } if (mnt) { iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); } if (inode) { iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] = fsnotify_first_mark(&inode->i_fsnotify_marks); } if (inode2) { iter_info.marks[inode2_type] = fsnotify_first_mark(&inode2->i_fsnotify_marks); } if (mnt_data) { iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] = fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks); } /* * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark * ignore masks are properly reflected for mount/sb mark notifications. * That's why this traversal is so complicated... */ while (fsnotify_iter_select_report_types(&iter_info)) { ret = send_to_group(mask, data, data_type, dir, file_name, cookie, &iter_info); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; fsnotify_iter_next(&iter_info); } ret = 0; out: srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx); return ret; } EXPORT_SYMBOL_GPL(fsnotify); #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * At open time we check fsnotify_sb_has_priority_watchers(), call the open perm * hook and set the FMODE_NONOTIFY_ mode bits accordignly. * Later, fsnotify permission hooks do not check if there are permission event * watches, but that there were permission event watches at open time. */ int fsnotify_open_perm_and_set_mode(struct file *file) { struct dentry *dentry = file->f_path.dentry, *parent; struct super_block *sb = dentry->d_sb; __u32 mnt_mask, p_mask = 0; /* Is it a file opened by fanotify? */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; /* * Permission events is a super set of pre-content events, so if there * are no permission event watchers, there are also no pre-content event * watchers and this is implied from the single FMODE_NONOTIFY_PERM bit. */ if (likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT))) { file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return 0; } /* * OK, there are some permission event watchers. Check if anybody is * watching for permission events on *this* file. */ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); p_mask = fsnotify_object_watched(d_inode(dentry), mnt_mask, ALL_FSNOTIFY_PERM_EVENTS); if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { parent = dget_parent(dentry); p_mask |= fsnotify_inode_watches_children(d_inode(parent)); dput(parent); } /* * Legacy FAN_ACCESS_PERM events have very high performance overhead, * so unlikely to be used in the wild. If they are used there will be * no optimizations at all. */ if (unlikely(p_mask & FS_ACCESS_PERM)) { /* Enable all permission and pre-content events */ file_set_fsnotify_mode(file, 0); goto open_perm; } /* * Pre-content events are only supported on regular files. * If there are pre-content event watchers and no permission access * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. * That is the common case with HSM service. */ if (d_is_reg(dentry) && (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)) { file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); goto open_perm; } /* Nobody watching permission and pre-content events on this file */ file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); open_perm: /* * Send open perm events depending on object masks and regardless of * FMODE_NONOTIFY_PERM. */ if (file->f_flags & __FMODE_EXEC && p_mask & FS_OPEN_EXEC_PERM) { int ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM); if (ret) return ret; } if (p_mask & FS_OPEN_PERM) return fsnotify_path(&file->f_path, FS_OPEN_PERM); return 0; } #endif void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) { struct fsnotify_mnt data = { .ns = ns, .mnt_id = real_mount(mnt)->mnt_id_unique, }; if (WARN_ON_ONCE(!ns)) return; /* * This is an optimization as well as making sure fsnotify_init() has * been called. */ if (!ns->n_fsnotify_marks) return; fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0); } static __init int fsnotify_init(void) { int ret; BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) panic("initializing fsnotify_mark_srcu"); fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, SLAB_PANIC); return 0; } core_initcall(fsnotify_init);
38 38 38 38 104 104 104 104 104 20 20 20 20 146 104 104 104 146 104 20 20 20 20 6 6 6 6 6 6 6 5 6 6 146 146 146 146 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 48