Total coverage: 73408 (5%)of 1820781
52 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us> */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/filter.h> #include <linux/bpf.h> #include <net/netlink.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <linux/tc_act/tc_bpf.h> #include <net/tc_act/tc_bpf.h> #include <net/tc_wrapper.h> #define ACT_BPF_NAME_LEN 256 struct tcf_bpf_cfg { struct bpf_prog *filter; struct sock_filter *bpf_ops; const char *bpf_name; u16 bpf_num_ops; bool is_ebpf; }; static struct tc_action_ops act_bpf_ops; TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { bool at_ingress = skb_at_tc_ingress(skb); struct tcf_bpf *prog = to_bpf(act); struct bpf_prog *filter; int action, filter_res; tcf_lastuse_update(&prog->tcf_tm); bstats_update(this_cpu_ptr(prog->common.cpu_bstats), skb); filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(filter, skb); __skb_pull(skb, skb->mac_len); } else { bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(filter, skb); } if (unlikely(!skb->tstamp && skb->tstamp_type)) skb->tstamp_type = SKB_CLOCK_REALTIME; if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); /* A BPF program may overwrite the default action opcode. * Similarly as in cls_bpf, if filter_res == -1 we use the * default action specified from tc. * * In case a different well-known TC_ACT opcode has been * returned, it will overwrite the default one. * * For everything else that is unknown, TC_ACT_UNSPEC is * returned. */ switch (filter_res) { case TC_ACT_PIPE: case TC_ACT_RECLASSIFY: case TC_ACT_OK: case TC_ACT_REDIRECT: action = filter_res; break; case TC_ACT_SHOT: action = filter_res; qstats_drop_inc(this_cpu_ptr(prog->common.cpu_qstats)); break; case TC_ACT_UNSPEC: action = prog->tcf_action; break; default: action = TC_ACT_UNSPEC; break; } return action; } static bool tcf_bpf_is_ebpf(const struct tcf_bpf *prog) { return !prog->bpf_ops; } static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog, struct sk_buff *skb) { struct nlattr *nla; if (nla_put_u16(skb, TCA_ACT_BPF_OPS_LEN, prog->bpf_num_ops)) return -EMSGSIZE; nla = nla_reserve(skb, TCA_ACT_BPF_OPS, prog->bpf_num_ops * sizeof(struct sock_filter)); if (nla == NULL) return -EMSGSIZE; memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); return 0; } static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog, struct sk_buff *skb) { struct nlattr *nla; if (prog->bpf_name && nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; if (nla_put_u32(skb, TCA_ACT_BPF_ID, prog->filter->aux->id)) return -EMSGSIZE; nla = nla_reserve(skb, TCA_ACT_BPF_TAG, sizeof(prog->filter->tag)); if (nla == NULL) return -EMSGSIZE; memcpy(nla_data(nla), prog->filter->tag, nla_len(nla)); return 0; } static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) { unsigned char *tp = skb_tail_pointer(skb); struct tcf_bpf *prog = to_bpf(act); struct tc_act_bpf opt = { .index = prog->tcf_index, .refcnt = refcount_read(&prog->tcf_refcnt) - ref, .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind, }; struct tcf_t tm; int ret; spin_lock_bh(&prog->tcf_lock); opt.action = prog->tcf_action; if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (tcf_bpf_is_ebpf(prog)) ret = tcf_bpf_dump_ebpf_info(prog, skb); else ret = tcf_bpf_dump_bpf_info(prog, skb); if (ret) goto nla_put_failure; tcf_tm_dump(&tm, &prog->tcf_tm); if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm, TCA_ACT_BPF_PAD)) goto nla_put_failure; spin_unlock_bh(&prog->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&prog->tcf_lock); nlmsg_trim(skb, tp); return -1; } static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = { [TCA_ACT_BPF_PARMS] = { .len = sizeof(struct tc_act_bpf) }, [TCA_ACT_BPF_FD] = { .type = NLA_U32 }, [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING, .len = ACT_BPF_NAME_LEN }, [TCA_ACT_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_ACT_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, }; static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg) { struct sock_filter *bpf_ops; struct sock_fprog_kern fprog_tmp; struct bpf_prog *fp; u16 bpf_size, bpf_num_ops; int ret; bpf_num_ops = nla_get_u16(tb[TCA_ACT_BPF_OPS_LEN]); if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) return -EINVAL; bpf_size = bpf_num_ops * sizeof(*bpf_ops); if (bpf_size != nla_len(tb[TCA_ACT_BPF_OPS])) return -EINVAL; bpf_ops = kmemdup(nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size, GFP_KERNEL); if (bpf_ops == NULL) return -ENOMEM; fprog_tmp.len = bpf_num_ops; fprog_tmp.filter = bpf_ops; ret = bpf_prog_create(&fp, &fprog_tmp); if (ret < 0) { kfree(bpf_ops); return ret; } cfg->bpf_ops = bpf_ops; cfg->bpf_num_ops = bpf_num_ops; cfg->filter = fp; cfg->is_ebpf = false; return 0; } static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg) { struct bpf_prog *fp; char *name = NULL; u32 bpf_fd; bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]); fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_ACT); if (IS_ERR(fp)) return PTR_ERR(fp); if (tb[TCA_ACT_BPF_NAME]) { name = nla_memdup(tb[TCA_ACT_BPF_NAME], GFP_KERNEL); if (!name) { bpf_prog_put(fp); return -ENOMEM; } } cfg->bpf_name = name; cfg->filter = fp; cfg->is_ebpf = true; return 0; } static void tcf_bpf_cfg_cleanup(const struct tcf_bpf_cfg *cfg) { struct bpf_prog *filter = cfg->filter; if (filter) { if (cfg->is_ebpf) bpf_prog_put(filter); else bpf_prog_destroy(filter); } kfree(cfg->bpf_ops); kfree(cfg->bpf_name); } static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, struct tcf_bpf_cfg *cfg) { cfg->is_ebpf = tcf_bpf_is_ebpf(prog); /* updates to prog->filter are prevented, since it's called either * with tcf lock or during final cleanup in rcu callback */ cfg->filter = rcu_dereference_protected(prog->filter, 1); cfg->bpf_ops = prog->bpf_ops; cfg->bpf_name = prog->bpf_name; } static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_bpf_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tcf_bpf_cfg cfg, old; struct tc_act_bpf *parm; struct tcf_bpf *prog; bool is_bpf, is_ebpf; int ret, res = 0; u32 index; if (!nla) return -EINVAL; ret = nla_parse_nested_deprecated(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy, NULL); if (ret < 0) return ret; if (!tb[TCA_ACT_BPF_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_ACT_BPF_PARMS]); index = parm->index; ret = tcf_idr_check_alloc(tn, &index, act, bind); if (!ret) { ret = tcf_idr_create(tn, index, est, act, &act_bpf_ops, bind, true, flags); if (ret < 0) { tcf_idr_cleanup(tn, index); return ret; } res = ACT_P_CREATED; } else if (ret > 0) { /* Don't override defaults. */ if (bind) return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*act, bind); return -EEXIST; } } else { return ret; } ret = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (ret < 0) goto release_idr; is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; is_ebpf = tb[TCA_ACT_BPF_FD]; if (is_bpf == is_ebpf) { ret = -EINVAL; goto put_chain; } memset(&cfg, 0, sizeof(cfg)); ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) : tcf_bpf_init_from_efd(tb, &cfg); if (ret < 0) goto put_chain; prog = to_bpf(*act); spin_lock_bh(&prog->tcf_lock); if (res != ACT_P_CREATED) tcf_bpf_prog_fill_cfg(prog, &old); prog->bpf_ops = cfg.bpf_ops; prog->bpf_name = cfg.bpf_name; if (cfg.bpf_num_ops) prog->bpf_num_ops = cfg.bpf_num_ops; goto_ch = tcf_action_set_ctrlact(*act, parm->action, goto_ch); rcu_assign_pointer(prog->filter, cfg.filter); spin_unlock_bh(&prog->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (res != ACT_P_CREATED) { /* make sure the program being replaced is no longer executing */ synchronize_rcu(); tcf_bpf_cfg_cleanup(&old); } return res; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*act, bind); return ret; } static void tcf_bpf_cleanup(struct tc_action *act) { struct tcf_bpf_cfg tmp; tcf_bpf_prog_fill_cfg(to_bpf(act), &tmp); tcf_bpf_cfg_cleanup(&tmp); } static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", .id = TCA_ID_BPF, .owner = THIS_MODULE, .act = tcf_bpf_act, .dump = tcf_bpf_dump, .cleanup = tcf_bpf_cleanup, .init = tcf_bpf_init, .size = sizeof(struct tcf_bpf), }; MODULE_ALIAS_NET_ACT("bpf"); static __net_init int bpf_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_bpf_ops.net_id); return tc_action_net_init(net, tn, &act_bpf_ops); } static void __net_exit bpf_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_bpf_ops.net_id); } static struct pernet_operations bpf_net_ops = { .init = bpf_init_net, .exit_batch = bpf_exit_net, .id = &act_bpf_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init bpf_init_module(void) { return tcf_register_action(&act_bpf_ops, &bpf_net_ops); } static void __exit bpf_cleanup_module(void) { tcf_unregister_action(&act_bpf_ops, &bpf_net_ops); } module_init(bpf_init_module); module_exit(bpf_cleanup_module); MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>"); MODULE_DESCRIPTION("TC BPF based action"); MODULE_LICENSE("GPL v2");
40 203 122 123 122 122 118 118 115 41 41 40 32 8 40 59 59 58 58 2 2 22 22 13 2 11 10 2 11 11 35 35 34 29 29 1 1 58 56 1 55 56 55 1 55 52 52 52 52 72 161 2 1 1 1 1 1 1 1 1 1 2 161 121 121 1 120 52 52 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: GRE over IP protocol decoder. * * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/ip_tunnels.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/gre.h> #include <net/dst_metadata.h> #include <net/erspan.h> #include <net/inet_dscp.h> /* Problems & solutions -------------------- 1. The most important issue is detecting local dead loops. They would cause complete host lockup in transmit, which would be "resolved" by stack overflow or, if queueing is enabled, with infinite looping in net_bh. We cannot track such dead loops during route installation, it is infeasible task. The most general solutions would be to keep skb->encapsulation counter (sort of local ttl), and silently drop packet when it expires. It is a good solution, but it supposes maintaining new variable in ALL skb, even if no tunneling is used. Current solution: xmit_recursion breaks dead loops. This is a percpu counter, since when we enter the first ndo_xmit(), cpu migration is forbidden. We force an exit if this counter reaches RECURSION_LIMIT 2. Networking dead loops would not kill routers, but would really kill network. IP hop limit plays role of "t->recursion" in this case, if we copy it from packet being encapsulated to upper header. It is very good solution, but it introduces two problems: - Routing protocols, using packets with ttl=1 (OSPF, RIP2), do not work over tunnels. - traceroute does not work. I planned to relay ICMP from tunnel, so that this problem would be solved and traceroute output would even more informative. This idea appeared to be wrong: only Linux complies to rfc1812 now (yes, guys, Linux is the only true router now :-)), all routers (at least, in neighbourhood of mine) return only 8 bytes of payload. It is the end. Hence, if we want that OSPF worked or traceroute said something reasonable, we should search for another solution. One of them is to parse packet trying to detect inner encapsulation made by our node. It is difficult or even impossible, especially, taking into account fragmentation. TO be short, ttl is not solution at all. Current solution: The solution was UNEXPECTEDLY SIMPLE. We force DF flag on tunnels with preconfigured hop limit, that is ALL. :-) Well, it does not remove the problem completely, but exponential growth of network traffic is changed to linear (branches, that exceed pmtu are pruned) and tunnel mtu rapidly degrades to value <68, where looping stops. Yes, it is not good if there exists a router in the loop, which does not force DF, even when encapsulating packets have DF set. But it is not our problem! Nobody could accuse us, we made all that we could make. Even if it is your gated who injected fatal route to network, even if it were you who configured fatal static route: you are innocent. :-) Alexey Kuznetsov. */ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static const struct header_ops ipgre_header_ops; static int ipgre_tunnel_init(struct net_device *dev); static void erspan_build_header(struct sk_buff *skb, u32 id, u32 index, bool truncate, bool is_ipv4); static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; static unsigned int erspan_net_id __read_mostly; static int ipgre_err(struct sk_buff *skb, u32 info, const struct tnl_ptk_info *tpi) { /* All the routers (except for Linux) return only 8 bytes of packet payload. It means, that precise relaying of ICMP in the real Internet is absolutely infeasible. Moreover, Cisco "wise men" put GRE key to the third word in GRE header. It makes impossible maintaining even soft state for keyed GRE tunnels with enabled checksum. Tell them "thank you". Well, I wonder, rfc1812 was written by Cisco employee, what the hell these idiots break standards established by themselves??? */ struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn; const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; unsigned int data_len = 0; struct ip_tunnel *t; if (tpi->proto == htons(ETH_P_TEB)) itn = net_generic(net, gre_tap_net_id); else if (tpi->proto == htons(ETH_P_ERSPAN) || tpi->proto == htons(ETH_P_ERSPAN2)) itn = net_generic(net, erspan_net_id); else itn = net_generic(net, ipgre_net_id); iph = (const struct iphdr *)(icmp_hdr(skb) + 1); t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->daddr, iph->saddr, tpi->key); if (!t) return -ENOENT; switch (type) { default: case ICMP_PARAMETERPROB: return 0; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, I believe they are just ether pollution. --ANK */ break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: break; } #if IS_ENABLED(CONFIG_IPV6) if (tpi->proto == htons(ETH_P_IPV6) && !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, type, data_len)) return 0; #endif if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) return 0; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) return 0; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; return 0; } static void gre_err(struct sk_buff *skb, u32 info) { /* All the routers (except for Linux) return only * 8 bytes of packet payload. It means, that precise relaying of * ICMP in the real Internet is absolutely infeasible. * * Moreover, Cisco "wise men" put GRE key to the third word * in GRE header. It makes impossible maintaining even soft * state for keyed * GRE tunnels with enabled checksum. Tell them "thank you". * * Well, I wonder, rfc1812 was written by Cisco employee, * what the hell these idiots break standards established * by themselves??? */ const struct iphdr *iph = (struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct tnl_ptk_info tpi; if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP), iph->ihl * 4) < 0) return; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, skb->dev->ifindex, IPPROTO_GRE); return; } if (type == ICMP_REDIRECT) { ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, IPPROTO_GRE); return; } ipgre_err(skb, info, &tpi); } static bool is_erspan_type1(int gre_hdr_len) { /* Both ERSPAN type I (version 0) and type II (version 1) use * protocol 0x88BE, but the type I has only 4-byte GRE header, * while type II has 8-byte. */ return gre_hdr_len == 4; } static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int gre_hdr_len) { struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; struct erspan_base_hdr *ershdr; IP_TUNNEL_DECLARE_FLAGS(flags); struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; const struct iphdr *iph; struct erspan_md2 *md2; int ver; int len; ip_tunnel_flags_copy(flags, tpi->flags); itn = net_generic(net, erspan_net_id); iph = ip_hdr(skb); if (is_erspan_type1(gre_hdr_len)) { ver = 0; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); } else { if (unlikely(!pskb_may_pull(skb, gre_hdr_len + sizeof(*ershdr)))) return PACKET_REJECT; ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); ver = ershdr->ver; iph = ip_hdr(skb); __set_bit(IP_TUNNEL_KEY_BIT, flags); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, tpi->key); } if (tunnel) { if (is_erspan_type1(gre_hdr_len)) len = gre_hdr_len; else len = gre_hdr_len + erspan_hdr_len(ver); if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; if (__iptunnel_pull_header(skb, len, htons(ETH_P_TEB), false, false) < 0) goto drop; if (tunnel->collect_md) { struct erspan_metadata *pkt_md, *md; struct ip_tunnel_info *info; unsigned char *gh; __be64 tun_id; __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags); ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, tun_id, sizeof(*md)); if (!tun_dst) return PACKET_REJECT; /* skb can be uncloned in __iptunnel_pull_header, so * old pkt_md is no longer valid and we need to reset * it */ gh = skb_network_header(skb) + skb_network_header_len(skb); pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len + sizeof(*ershdr)); md = ip_tunnel_info_opts(&tun_dst->u.tun_info); md->version = ver; md2 = &md->u.md2; memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); info = &tun_dst->u.tun_info; __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); info->options_len = sizeof(*md); } skb_reset_mac_header(skb); ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } return PACKET_REJECT; drop: kfree_skb(skb); return PACKET_RCVD; } static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) { struct metadata_dst *tun_dst = NULL; const struct iphdr *iph; struct ip_tunnel *tunnel; iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->saddr, iph->daddr, tpi->key); if (tunnel) { const struct iphdr *tnl_params; if (__iptunnel_pull_header(skb, hdr_len, tpi->proto, raw_proto, false) < 0) goto drop; /* Special case for ipgre_header_parse(), which expects the * mac_header to point to the outer IP header. */ if (tunnel->dev->header_ops == &ipgre_header_ops) skb_pop_mac_header(skb); else skb_reset_mac_header(skb); tnl_params = &tunnel->parms.iph; if (tunnel->collect_md || tnl_params->daddr == 0) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __be64 tun_id; __set_bit(IP_TUNNEL_CSUM_BIT, flags); __set_bit(IP_TUNNEL_KEY_BIT, flags); ip_tunnel_flags_and(flags, tpi->flags, flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); if (!tun_dst) return PACKET_REJECT; } ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } return PACKET_NEXT; drop: kfree_skb(skb); return PACKET_RCVD; } static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, int hdr_len) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn; int res; if (tpi->proto == htons(ETH_P_TEB)) itn = net_generic(net, gre_tap_net_id); else itn = net_generic(net, ipgre_net_id); res = __ipgre_rcv(skb, tpi, itn, hdr_len, false); if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) { /* ipgre tunnels in collect metadata mode should receive * also ETH_P_TEB traffic. */ itn = net_generic(net, ipgre_net_id); res = __ipgre_rcv(skb, tpi, itn, hdr_len, true); } return res; } static int gre_rcv(struct sk_buff *skb) { struct tnl_ptk_info tpi; bool csum_err = false; int hdr_len; #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { /* Looped back packet, drop it! */ if (rt_is_output_route(skb_rtable(skb))) goto drop; } #endif hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0); if (hdr_len < 0) goto drop; if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || tpi.proto == htons(ETH_P_ERSPAN2))) { if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; goto out; } if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); IP_TUNNEL_DECLARE_FLAGS(flags); ip_tunnel_flags_copy(flags, tunnel->parms.o_flags); /* Push GRE header. */ gre_build_header(skb, tunnel->tun_hlen, flags, proto, tunnel->parms.o_key, test_bit(IP_TUNNEL_SEQ_BIT, flags) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } static int gre_handle_offloads(struct sk_buff *skb, bool csum) { return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; int tunnel_hlen; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET)) goto err_free_skb; key = &tun_info->key; tunnel_hlen = gre_calc_hlen(key->tun_flags); if (skb_cow_head(skb, dev->needed_headroom)) goto err_free_skb; /* Push Tunnel header. */ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags))) goto err_free_skb; __set_bit(IP_TUNNEL_CSUM_BIT, flags); __set_bit(IP_TUNNEL_KEY_BIT, flags); __set_bit(IP_TUNNEL_SEQ_BIT, flags); ip_tunnel_flags_and(flags, tun_info->key.tun_flags, flags); gre_build_header(skb, tunnel_hlen, flags, proto, tunnel_id_to_key32(tun_info->key.tun_id), test_bit(IP_TUNNEL_SEQ_BIT, flags) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); return; err_free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); } static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct erspan_metadata *md; bool truncate = false; __be16 proto; int tunnel_hlen; int version; int nhoff; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET)) goto err_free_skb; key = &tun_info->key; if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) goto err_free_skb; if (tun_info->options_len < sizeof(*md)) goto err_free_skb; md = ip_tunnel_info_opts(tun_info); /* ERSPAN has fixed 8 byte GRE header */ version = md->version; tunnel_hlen = 8 + erspan_hdr_len(version); if (skb_cow_head(skb, dev->needed_headroom)) goto err_free_skb; if (gre_handle_offloads(skb, false)) goto err_free_skb; if (skb->len > dev->mtu + dev->hard_header_len) { if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) goto err_free_skb; truncate = true; } nhoff = skb_network_offset(skb); if (skb->protocol == htons(ETH_P_IP) && (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) truncate = true; if (skb->protocol == htons(ETH_P_IPV6)) { int thoff; if (skb_transport_header_was_set(skb)) thoff = skb_transport_offset(skb); else thoff = nhoff + sizeof(struct ipv6hdr); if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff) truncate = true; } if (version == 1) { erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)), ntohl(md->u.index), truncate, true); proto = htons(ETH_P_ERSPAN); } else if (version == 2) { erspan_build_header_v2(skb, ntohl(tunnel_id_to_key32(key->tun_id)), md->u.md2.dir, get_hwid(&md->u.md2), truncate, true); proto = htons(ETH_P_ERSPAN2); } else { goto err_free_skb; } __set_bit(IP_TUNNEL_SEQ_BIT, flags); gre_build_header(skb, 8, flags, proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno))); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); return; err_free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); } static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct ip_tunnel_key *key; struct rtable *rt; struct flowi4 fl4; if (ip_tunnel_info_af(info) != AF_INET) return -EINVAL; key = &info->key; ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), key->tos & ~INET_ECN_MASK, dev_net(dev), 0, skb->mark, skb_get_hash(skb), key->flow_flags); rt = ip_route_output_key(dev_net(dev), &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); info->key.u.ipv4.src = fl4.saddr; return 0; } static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; if (!pskb_inet_may_pull(skb)) goto free_skb; if (tunnel->collect_md) { gre_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; } if (dev->header_ops) { int pull_len = tunnel->hlen + sizeof(struct iphdr); if (skb_cow_head(skb, 0)) goto free_skb; if (!pskb_may_pull(skb, pull_len)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; /* ip_tunnel_xmit() needs skb->data pointing to gre header. */ skb_pull(skb, pull_len); skb_reset_mac_header(skb); if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start(skb) < skb->data) goto free_skb; } else { if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; tnl_params = &tunnel->parms.iph; } if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags))) goto free_skb; __gre_xmit(skb, dev, tnl_params, skb->protocol); return NETDEV_TX_OK; free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); bool truncate = false; __be16 proto; if (!pskb_inet_may_pull(skb)) goto free_skb; if (tunnel->collect_md) { erspan_fb_xmit(skb, dev); return NETDEV_TX_OK; } if (gre_handle_offloads(skb, false)) goto free_skb; if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; if (skb->len > dev->mtu + dev->hard_header_len) { if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) goto free_skb; truncate = true; } /* Push ERSPAN header */ if (tunnel->erspan_ver == 0) { proto = htons(ETH_P_ERSPAN); __clear_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags); } else if (tunnel->erspan_ver == 1) { erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, truncate, true); proto = htons(ETH_P_ERSPAN); } else if (tunnel->erspan_ver == 2) { erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key), tunnel->dir, tunnel->hwid, truncate, true); proto = htons(ETH_P_ERSPAN2); } else { goto free_skb; } __clear_bit(IP_TUNNEL_KEY_BIT, tunnel->parms.o_flags); __gre_xmit(skb, dev, &tunnel->parms.iph, proto); return NETDEV_TX_OK; free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); if (!pskb_inet_may_pull(skb)) goto free_skb; if (tunnel->collect_md) { gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); return NETDEV_TX_OK; } if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags))) goto free_skb; if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); return NETDEV_TX_OK; free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } static void ipgre_link_update(struct net_device *dev, bool set_mtu) { struct ip_tunnel *tunnel = netdev_priv(dev); int len; len = tunnel->tun_hlen; tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); len = tunnel->tun_hlen - len; tunnel->hlen = tunnel->hlen + len; if (dev->header_ops) dev->hard_header_len += len; else dev->needed_headroom += len; if (set_mtu) WRITE_ONCE(dev->mtu, max_t(int, dev->mtu - len, 68)); if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags) || (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && tunnel->encap.type != TUNNEL_ENCAP_NONE)) { dev->features &= ~NETIF_F_GSO_SOFTWARE; dev->hw_features &= ~NETIF_F_GSO_SOFTWARE; } else { dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; } } static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { __be16 i_flags, o_flags; int err; if (!ip_tunnel_flags_is_be16_compat(p->i_flags) || !ip_tunnel_flags_is_be16_compat(p->o_flags)) return -EOVERFLOW; i_flags = ip_tunnel_flags_to_be16(p->i_flags); o_flags = ip_tunnel_flags_to_be16(p->o_flags); if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE || p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) || ((i_flags | o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } gre_flags_to_tnl_flags(p->i_flags, i_flags); gre_flags_to_tnl_flags(p->o_flags, o_flags); err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd == SIOCCHGTUNNEL) { struct ip_tunnel *t = netdev_priv(dev); ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags); ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags); if (strcmp(dev->rtnl_link_ops->kind, "erspan")) ipgre_link_update(dev, true); } i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); ip_tunnel_flags_from_be16(p->i_flags, i_flags); o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); ip_tunnel_flags_from_be16(p->o_flags, o_flags); return 0; } /* Nice toy. Unfortunately, useless in real life :-) It allows to construct virtual multiprotocol broadcast "LAN" over the Internet, provided multicast routing is tuned. I have no idea was this bicycle invented before me, so that I had to set ARPHRD_IPGRE to a random value. I have an impression, that Cisco could make something similar, but this feature is apparently missing in IOS<=11.2(8). I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks with broadcast 224.66.66.66. If you have access to mbone, play with me :-) ping -t 255 224.66.66.66 If nobody answers, mbone does not work. ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255 ip addr add 10.66.66.<somewhat>/24 dev Universe ifconfig Universe up ifconfig Universe add fe80::<Your_real_addr>/10 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96 ftp 10.66.66.66 ... ftp fec0:6666:6666::193.233.7.65 ... */ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ip_tunnel *t = netdev_priv(dev); struct iphdr *iph; struct gre_base_hdr *greh; iph = skb_push(skb, t->hlen + sizeof(*iph)); greh = (struct gre_base_hdr *)(iph+1); greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags); greh->protocol = htons(type); memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); /* Set the source hardware address. */ if (saddr) memcpy(&iph->saddr, saddr, 4); if (daddr) memcpy(&iph->daddr, daddr, 4); if (iph->daddr) return t->hlen + sizeof(*iph); return -(t->hlen + sizeof(*iph)); } static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) { const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb); memcpy(haddr, &iph->saddr, 4); return 4; } static const struct header_ops ipgre_header_ops = { .create = ipgre_header, .parse = ipgre_header_parse, }; #ifdef CONFIG_NET_IPGRE_BROADCAST static int ipgre_open(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr)) { struct flowi4 fl4; struct rtable *rt; rt = ip_route_output_gre(t->net, &fl4, t->parms.iph.daddr, t->parms.iph.saddr, t->parms.o_key, t->parms.iph.tos & INET_DSCP_MASK, t->parms.link); if (IS_ERR(rt)) return -EADDRNOTAVAIL; dev = rt->dst.dev; ip_rt_put(rt); if (!__in_dev_get_rtnl(dev)) return -EADDRNOTAVAIL; t->mlink = dev->ifindex; ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr); } return 0; } static int ipgre_close(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { struct in_device *in_dev; in_dev = inetdev_by_index(t->net, t->mlink); if (in_dev) ip_mc_dec_group(in_dev, t->parms.iph.daddr); } return 0; } #endif static const struct net_device_ops ipgre_netdev_ops = { .ndo_init = ipgre_tunnel_init, .ndo_uninit = ip_tunnel_uninit, #ifdef CONFIG_NET_IPGRE_BROADCAST .ndo_open = ipgre_open, .ndo_stop = ipgre_close, #endif .ndo_start_xmit = ipgre_xmit, .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipgre_tunnel_ctl, }; #define GRE_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_HW_CSUM) static void ipgre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipgre_netdev_ops; dev->type = ARPHRD_IPGRE; ip_tunnel_setup(dev, ipgre_net_id); } static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; tunnel = netdev_priv(dev); tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph); dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; /* TCP offload with GRE SEQ is not supported, nor can we support 2 * levels of outer headers requiring an update. */ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags)) return; if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && tunnel->encap.type != TUNNEL_ENCAP_NONE) return; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->lltx = true; } static int ipgre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; __gre_tunnel_init(dev); __dev_addr_set(dev, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; netif_keep_dst(dev); dev->addr_len = 4; if (iph->daddr && !tunnel->collect_md) { #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { if (!iph->saddr) return -EINVAL; dev->flags = IFF_BROADCAST; dev->header_ops = &ipgre_header_ops; dev->hard_header_len = tunnel->hlen + sizeof(*iph); dev->needed_headroom = 0; } #endif } else if (!tunnel->collect_md) { dev->header_ops = &ipgre_header_ops; dev->hard_header_len = tunnel->hlen + sizeof(*iph); dev->needed_headroom = 0; } return ip_tunnel_init(dev); } static const struct gre_protocol ipgre_protocol = { .handler = gre_rcv, .err_handler = gre_err, }; static int __net_init ipgre_init_net(struct net *net) { return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } static void __net_exit ipgre_exit_batch_rtnl(struct list_head *list_net, struct list_head *dev_to_kill) { ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops, dev_to_kill); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, .exit_batch_rtnl = ipgre_exit_batch_rtnl, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { __be16 flags; if (!data) return 0; flags = 0; if (data[IFLA_GRE_IFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); if (data[IFLA_GRE_OFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); if (flags & (GRE_VERSION|GRE_ROUTING)) return -EINVAL; if (data[IFLA_GRE_COLLECT_METADATA] && data[IFLA_GRE_ENCAP_TYPE] && nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE) return -EINVAL; return 0; } static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { __be32 daddr; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (!data) goto out; if (data[IFLA_GRE_REMOTE]) { memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4); if (!daddr) return -EINVAL; } out: return ipgre_tunnel_validate(tb, data, extack); } static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { __be16 flags = 0; int ret; if (!data) return 0; ret = ipgre_tap_validate(tb, data, extack); if (ret) return ret; if (data[IFLA_GRE_ERSPAN_VER] && nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0) return 0; /* ERSPAN type II/III should only have GRE sequence and key flag */ if (data[IFLA_GRE_OFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); if (data[IFLA_GRE_IFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); if (!data[IFLA_GRE_COLLECT_METADATA] && flags != (GRE_SEQ | GRE_KEY)) return -EINVAL; /* ERSPAN Session ID only has 10-bit. Since we reuse * 32-bit key field as ID, check it's range. */ if (data[IFLA_GRE_IKEY] && (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) return -EINVAL; if (data[IFLA_GRE_OKEY] && (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) return -EINVAL; return 0; } static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); memset(parms, 0, sizeof(*parms)); parms->iph.protocol = IPPROTO_GRE; if (!data) return 0; if (data[IFLA_GRE_LINK]) parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) gre_flags_to_tnl_flags(parms->i_flags, nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) gre_flags_to_tnl_flags(parms->o_flags, nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); if (data[IFLA_GRE_OKEY]) parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); if (data[IFLA_GRE_LOCAL]) parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]); if (data[IFLA_GRE_REMOTE]) parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]); if (data[IFLA_GRE_TTL]) parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]); if (data[IFLA_GRE_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) { if (t->ignore_df) return -EINVAL; parms->iph.frag_off = htons(IP_DF); } if (data[IFLA_GRE_COLLECT_METADATA]) { t->collect_md = true; if (dev->type == ARPHRD_IPGRE) dev->type = ARPHRD_NONE; } if (data[IFLA_GRE_IGNORE_DF]) { if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) && (parms->iph.frag_off & htons(IP_DF))) return -EINVAL; t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]); } if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); return 0; } static int erspan_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); int err; err = ipgre_netlink_parms(dev, data, tb, parms, fwmark); if (err) return err; if (!data) return 0; if (data[IFLA_GRE_ERSPAN_VER]) { t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); if (t->erspan_ver > 2) return -EINVAL; } if (t->erspan_ver == 1) { if (data[IFLA_GRE_ERSPAN_INDEX]) { t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); if (t->index & ~INDEX_MASK) return -EINVAL; } } else if (t->erspan_ver == 2) { if (data[IFLA_GRE_ERSPAN_DIR]) { t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); if (t->dir & ~(DIR_MASK >> DIR_OFFSET)) return -EINVAL; } if (data[IFLA_GRE_ERSPAN_HWID]) { t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); if (t->hwid & ~(HWID_MASK >> HWID_OFFSET)) return -EINVAL; } } return 0; } /* This function returns true when ENCAP attributes are present in the nl msg */ static bool ipgre_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *ipencap) { bool ret = false; memset(ipencap, 0, sizeof(*ipencap)); if (!data) return ret; if (data[IFLA_GRE_ENCAP_TYPE]) { ret = true; ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); } if (data[IFLA_GRE_ENCAP_FLAGS]) { ret = true; ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); } if (data[IFLA_GRE_ENCAP_SPORT]) { ret = true; ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]); } if (data[IFLA_GRE_ENCAP_DPORT]) { ret = true; ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]); } return ret; } static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); return ip_tunnel_init(dev); } static const struct net_device_ops gre_tap_netdev_ops = { .ndo_init = gre_tap_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = gre_tap_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; static int erspan_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); if (tunnel->erspan_ver == 0) tunnel->tun_hlen = 4; /* 4-byte GRE hdr. */ else tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */ tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + erspan_hdr_len(tunnel->erspan_ver); dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); return ip_tunnel_init(dev); } static const struct net_device_ops erspan_netdev_ops = { .ndo_init = erspan_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = erspan_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); dev->max_mtu = 0; dev->netdev_ops = &gre_tap_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip_tunnel_setup(dev, gre_tap_net_id); } static int ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[]) { struct ip_tunnel_encap ipencap; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } return 0; } static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; err = ipgre_newlink_encap_setup(dev, data); if (err) return err; err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; return ip_tunnel_newlink(dev, tb, &p, fwmark); } static int erspan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; err = ipgre_newlink_encap_setup(dev, data); if (err) return err; err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); if (err) return err; return ip_tunnel_newlink(dev, tb, &p, fwmark); } static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; int err; err = ipgre_newlink_encap_setup(dev, data); if (err) return err; err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; err = ip_tunnel_changelink(dev, tb, &p, fwmark); if (err < 0) return err; ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags); ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags); ipgre_link_update(dev, !tb[IFLA_MTU]); return 0; } static int erspan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; int err; err = ipgre_newlink_encap_setup(dev, data); if (err) return err; err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; err = ip_tunnel_changelink(dev, tb, &p, fwmark); if (err < 0) return err; ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags); ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags); return 0; } static size_t ipgre_get_size(const struct net_device *dev) { return /* IFLA_GRE_LINK */ nla_total_size(4) + /* IFLA_GRE_IFLAGS */ nla_total_size(2) + /* IFLA_GRE_OFLAGS */ nla_total_size(2) + /* IFLA_GRE_IKEY */ nla_total_size(4) + /* IFLA_GRE_OKEY */ nla_total_size(4) + /* IFLA_GRE_LOCAL */ nla_total_size(4) + /* IFLA_GRE_REMOTE */ nla_total_size(4) + /* IFLA_GRE_TTL */ nla_total_size(1) + /* IFLA_GRE_TOS */ nla_total_size(1) + /* IFLA_GRE_PMTUDISC */ nla_total_size(1) + /* IFLA_GRE_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_GRE_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_GRE_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_GRE_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_GRE_IGNORE_DF */ nla_total_size(1) + /* IFLA_GRE_FWMARK */ nla_total_size(4) + /* IFLA_GRE_ERSPAN_INDEX */ nla_total_size(4) + /* IFLA_GRE_ERSPAN_VER */ nla_total_size(1) + /* IFLA_GRE_ERSPAN_DIR */ nla_total_size(1) + /* IFLA_GRE_ERSPAN_HWID */ nla_total_size(2) + 0; } static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm_kern *p = &t->parms; IP_TUNNEL_DECLARE_FLAGS(o_flags); ip_tunnel_flags_copy(o_flags, p->o_flags); if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, gre_tnl_flags_to_gre_flags(p->i_flags)) || nla_put_be16(skb, IFLA_GRE_OFLAGS, gre_tnl_flags_to_gre_flags(o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) || nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) || nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || nla_put_u8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF))) || nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, t->encap.type) || nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT, t->encap.sport) || nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT, t->encap.dport) || nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, t->encap.flags)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df)) goto nla_put_failure; if (t->collect_md) { if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); if (t->erspan_ver <= 2) { if (t->erspan_ver != 0 && !t->collect_md) __set_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags); if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) goto nla_put_failure; if (t->erspan_ver == 1) { if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) goto nla_put_failure; } else if (t->erspan_ver == 2) { if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) goto nla_put_failure; } } return ipgre_fill_info(skb, dev); nla_put_failure: return -EMSGSIZE; } static void erspan_setup(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); ether_setup(dev); dev->max_mtu = 0; dev->netdev_ops = &erspan_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip_tunnel_setup(dev, erspan_net_id); t->erspan_ver = 1; } static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_LINK] = { .type = NLA_U32 }, [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, [IFLA_GRE_IKEY] = { .type = NLA_U32 }, [IFLA_GRE_OKEY] = { .type = NLA_U32 }, [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_GRE_TTL] = { .type = NLA_U8 }, [IFLA_GRE_TOS] = { .type = NLA_U8 }, [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { .kind = "gre", .maxtype = IFLA_GRE_MAX, .policy = ipgre_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipgre_tunnel_setup, .validate = ipgre_tunnel_validate, .newlink = ipgre_newlink, .changelink = ipgre_changelink, .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = ipgre_fill_info, .get_link_net = ip_tunnel_get_link_net, }; static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { .kind = "gretap", .maxtype = IFLA_GRE_MAX, .policy = ipgre_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipgre_tap_setup, .validate = ipgre_tap_validate, .newlink = ipgre_newlink, .changelink = ipgre_changelink, .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = ipgre_fill_info, .get_link_net = ip_tunnel_get_link_net, }; static struct rtnl_link_ops erspan_link_ops __read_mostly = { .kind = "erspan", .maxtype = IFLA_GRE_MAX, .policy = ipgre_policy, .priv_size = sizeof(struct ip_tunnel), .setup = erspan_setup, .validate = erspan_validate, .newlink = erspan_newlink, .changelink = erspan_changelink, .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = erspan_fill_info, .get_link_net = ip_tunnel_get_link_net, }; struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type) { struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; LIST_HEAD(list_kill); struct ip_tunnel *t; int err; memset(&tb, 0, sizeof(tb)); dev = rtnl_create_link(net, name, name_assign_type, &ipgre_tap_ops, tb, NULL); if (IS_ERR(dev)) return dev; /* Configure flow based GRE device. */ t = netdev_priv(dev); t->collect_md = true; err = ipgre_newlink(net, dev, tb, NULL, NULL); if (err < 0) { free_netdev(dev); return ERR_PTR(err); } /* openvswitch users expect packet sizes to be unrestricted, * so set the largest MTU we can. */ err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false); if (err) goto out; err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto out; return dev; out: ip_tunnel_dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(gretap_fb_dev_create); static int __net_init ipgre_tap_init_net(struct net *net) { return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } static void __net_exit ipgre_tap_exit_batch_rtnl(struct list_head *list_net, struct list_head *dev_to_kill) { ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops, dev_to_kill); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, .exit_batch_rtnl = ipgre_tap_exit_batch_rtnl, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; static int __net_init erspan_init_net(struct net *net) { return ip_tunnel_init_net(net, erspan_net_id, &erspan_link_ops, "erspan0"); } static void __net_exit erspan_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops, dev_to_kill); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, .exit_batch_rtnl = erspan_exit_batch_rtnl, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; static int __init ipgre_init(void) { int err; pr_info("GRE over IPv4 tunneling driver\n"); err = register_pernet_device(&ipgre_net_ops); if (err < 0) return err; err = register_pernet_device(&ipgre_tap_net_ops); if (err < 0) goto pnet_tap_failed; err = register_pernet_device(&erspan_net_ops); if (err < 0) goto pnet_erspan_failed; err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { pr_info("%s: can't add protocol\n", __func__); goto add_proto_failed; } err = rtnl_link_register(&ipgre_link_ops); if (err < 0) goto rtnl_link_failed; err = rtnl_link_register(&ipgre_tap_ops); if (err < 0) goto tap_ops_failed; err = rtnl_link_register(&erspan_link_ops); if (err < 0) goto erspan_link_failed; return 0; erspan_link_failed: rtnl_link_unregister(&ipgre_tap_ops); tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: unregister_pernet_device(&erspan_net_ops); pnet_erspan_failed: unregister_pernet_device(&ipgre_tap_net_ops); pnet_tap_failed: unregister_pernet_device(&ipgre_net_ops); return err; } static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); rtnl_link_unregister(&erspan_link_ops); gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_net_ops); unregister_pernet_device(&erspan_net_ops); } module_init(ipgre_init); module_exit(ipgre_fini); MODULE_DESCRIPTION("IPv4 GRE tunnels over IP library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); MODULE_ALIAS_RTNL_LINK("erspan"); MODULE_ALIAS_NETDEV("gre0"); MODULE_ALIAS_NETDEV("gretap0"); MODULE_ALIAS_NETDEV("erspan0");
220 219 219 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/cgroup-defs.h - basic definitions for cgroup * * This file provides basic type and interface. Include this file directly * only if necessary to avoid cyclic dependencies. */ #ifndef _LINUX_CGROUP_DEFS_H #define _LINUX_CGROUP_DEFS_H #include <linux/limits.h> #include <linux/list.h> #include <linux/idr.h> #include <linux/wait.h> #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/percpu-refcount.h> #include <linux/percpu-rwsem.h> #include <linux/u64_stats_sync.h> #include <linux/workqueue.h> #include <linux/bpf-cgroup-defs.h> #include <linux/psi_types.h> #ifdef CONFIG_CGROUPS struct cgroup; struct cgroup_root; struct cgroup_subsys; struct cgroup_taskset; struct kernfs_node; struct kernfs_ops; struct kernfs_open_file; struct seq_file; struct poll_table_struct; #define MAX_CGROUP_TYPE_NAMELEN 32 #define MAX_CGROUP_ROOT_NAMELEN 64 #define MAX_CFTYPE_NAME 64 /* define the enumeration of all cgroup subsystems */ #define SUBSYS(_x) _x ## _cgrp_id, enum cgroup_subsys_id { #include <linux/cgroup_subsys.h> CGROUP_SUBSYS_COUNT, }; #undef SUBSYS /* bits in struct cgroup_subsys_state flags field */ enum { CSS_NO_REF = (1 << 0), /* no reference counting for this css */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ CSS_VISIBLE = (1 << 3), /* css is visible to userland */ CSS_DYING = (1 << 4), /* css is dying */ }; /* bits in struct cgroup flags field */ enum { /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, /* * Clone the parent's configuration when creating a new child * cpuset cgroup. For historical reasons, this option can be * specified at mount time and thus is implemented here. */ CGRP_CPUSET_CLONE_CHILDREN, /* Control group has to be frozen. */ CGRP_FREEZE, /* Cgroup is frozen. */ CGRP_FROZEN, /* Control group has to be killed. */ CGRP_KILL, }; /* cgroup_root->flags */ enum { CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ /* * Consider namespaces as delegation boundaries. If this flag is * set, controller specific interface files in a namespace root * aren't writeable from inside the namespace. */ CGRP_ROOT_NS_DELEGATE = (1 << 3), /* * Reduce latencies on dynamic cgroup modifications such as task * migrations and controller on/offs by disabling percpu operation on * cgroup_threadgroup_rwsem. This makes hot path operations such as * forks and exits into the slow path and more expensive. * * The static usage pattern of creating a cgroup, enabling controllers, * and then seeding it with CLONE_INTO_CGROUP doesn't require write * locking cgroup_threadgroup_rwsem and thus doesn't benefit from * favordynmod. */ CGRP_ROOT_FAVOR_DYNMODS = (1 << 4), /* * Enable cpuset controller in v1 cgroup to use v2 behavior. */ CGRP_ROOT_CPUSET_V2_MODE = (1 << 16), /* * Enable legacy local memory.events. */ CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 17), /* * Enable recursive subtree protection */ CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18), /* * Enable hugetlb accounting for the memory controller. */ CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), /* * Enable legacy local pids.events. */ CGRP_ROOT_PIDS_LOCAL_EVENTS = (1 << 20), }; /* cftype->flags */ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ CFTYPE_NS_DELEGATABLE = (1 << 2), /* writeable beyond delegation boundaries */ CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ __CFTYPE_ADDED = (1 << 18), }; /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications. This can * be obtained by setting cftype->file_offset. */ struct cgroup_file { /* do not access any fields from outside cgroup core */ struct kernfs_node *kn; unsigned long notified_at; struct timer_list notify_timer; }; /* * Per-subsystem/per-cgroup state maintained by the system. This is the * fundamental structural building block that controllers deal with. * * Fields marked with "PI:" are public and immutable and may be accessed * directly without synchronization. */ struct cgroup_subsys_state { /* PI: the cgroup that this css is attached to */ struct cgroup *cgroup; /* PI: the cgroup subsystem that this css is attached to */ struct cgroup_subsys *ss; /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; /* * siblings list anchored at the parent's ->children * * linkage is protected by cgroup_mutex or RCU */ struct list_head sibling; struct list_head children; /* flush target list anchored at cgrp->rstat_css_list */ struct list_head rstat_css_node; /* * PI: Subsys-unique ID. 0 is unused and root is always 1. The * matching css can be looked up using css_from_id(). */ int id; unsigned int flags; /* * Monotonically increasing unique serial number which defines a * uniform order among all csses. It's guaranteed that all * ->children lists are in the ascending order of ->serial_nr and * used to allow interrupting and resuming iterations. */ u64 serial_nr; /* * Incremented by online self and children. Used to guarantee that * parents are not offlined before their children. */ atomic_t online_cnt; /* percpu_ref killing and RCU release */ struct work_struct destroy_work; struct rcu_work destroy_rwork; /* * PI: the parent css. Placed here for cache proximity to following * fields of the containing structure. */ struct cgroup_subsys_state *parent; /* * Keep track of total numbers of visible descendant CSSes. * The total number of dying CSSes is tracked in * css->cgroup->nr_dying_subsys[ssid]. * Protected by cgroup_mutex. */ int nr_descendants; }; /* * A css_set is a structure holding pointers to a set of * cgroup_subsys_state objects. This saves space in the task struct * object and speeds up fork()/exit(), since a single inc/dec and a * list_add()/del() can bump the reference count on the entire cgroup * set for a task. */ struct css_set { /* * Set of subsystem states, one for each subsystem. This array is * immutable after creation apart from the init_css_set during * subsystem registration (at boot time). */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; /* reference count */ refcount_t refcount; /* * For a domain cgroup, the following points to self. If threaded, * to the matching cset of the nearest domain ancestor. The * dom_cset provides access to the domain cgroup and its csses to * which domain level resource consumptions should be charged. */ struct css_set *dom_cset; /* the default cgroup associated with this css_set */ struct cgroup *dfl_cgrp; /* internal task count, protected by css_set_lock */ int nr_tasks; /* * Lists running through all tasks using this cgroup group. * mg_tasks lists tasks which belong to this cset but are in the * process of being migrated out or in. Protected by * css_set_lock, but, during migration, once tasks are moved to * mg_tasks, it can be read safely while holding cgroup_mutex. */ struct list_head tasks; struct list_head mg_tasks; struct list_head dying_tasks; /* all css_task_iters currently walking this cset */ struct list_head task_iters; /* * On the default hierarchy, ->subsys[ssid] may point to a css * attached to an ancestor instead of the cgroup this css_set is * associated with. The following node is anchored at * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to * iterate through all css's attached to a given cgroup. */ struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; /* all threaded csets whose ->dom_cset points to this cset */ struct list_head threaded_csets; struct list_head threaded_csets_node; /* * List running through all cgroup groups in the same hash * slot. Protected by css_set_lock */ struct hlist_node hlist; /* * List of cgrp_cset_links pointing at cgroups referenced from this * css_set. Protected by css_set_lock. */ struct list_head cgrp_links; /* * List of csets participating in the on-going migration either as * source or destination. Protected by cgroup_mutex. */ struct list_head mg_src_preload_node; struct list_head mg_dst_preload_node; struct list_head mg_node; /* * If this cset is acting as the source of migration the following * two fields are set. mg_src_cgrp and mg_dst_cgrp are * respectively the source and destination cgroups of the on-going * migration. mg_dst_cset is the destination cset the target tasks * on this cset should be migrated to. Protected by cgroup_mutex. */ struct cgroup *mg_src_cgrp; struct cgroup *mg_dst_cgrp; struct css_set *mg_dst_cset; /* dead and being drained, ignore for migration */ bool dead; /* For RCU-protected deletion */ struct rcu_head rcu_head; }; struct cgroup_base_stat { struct task_cputime cputime; #ifdef CONFIG_SCHED_CORE u64 forceidle_sum; #endif u64 ntime; }; /* * rstat - cgroup scalable recursive statistics. Accounting is done * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the * hierarchy on reads. * * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are * linked into the updated tree. On the following read, propagation only * considers and consumes the updated tree. This makes reading O(the * number of descendants which have been active since last read) instead of * O(the total number of descendants). * * This is important because there can be a lot of (draining) cgroups which * aren't active and stat may be read frequently. The combination can * become very expensive. By propagating selectively, increasing reading * frequency decreases the cost of each read. * * This struct hosts both the fields which implement the above - * updated_children and updated_next - and the fields which track basic * resource statistics on top of it - bsync, bstat and last_bstat. */ struct cgroup_rstat_cpu { /* * ->bsync protects ->bstat. These are the only fields which get * updated in the hot path. */ struct u64_stats_sync bsync; struct cgroup_base_stat bstat; /* * Snapshots at the last reading. These are used to calculate the * deltas to propagate to the global counters. */ struct cgroup_base_stat last_bstat; /* * This field is used to record the cumulative per-cpu time of * the cgroup and its descendants. Currently it can be read via * eBPF/drgn etc, and we are still trying to determine how to * expose it in the cgroupfs interface. */ struct cgroup_base_stat subtree_bstat; /* * Snapshots at the last reading. These are used to calculate the * deltas to propagate to the per-cpu subtree_bstat. */ struct cgroup_base_stat last_subtree_bstat; /* * Child cgroups with stat updates on this cpu since the last read * are linked on the parent's ->updated_children through * ->updated_next. * * In addition to being more compact, singly-linked list pointing * to the cgroup makes it unnecessary for each per-cpu struct to * point back to the associated cgroup. * * Protected by per-cpu cgroup_rstat_cpu_lock. */ struct cgroup *updated_children; /* terminated by self cgroup */ struct cgroup *updated_next; /* NULL iff not on the list */ }; struct cgroup_freezer_state { /* Should the cgroup and its descendants be frozen. */ bool freeze; /* Should the cgroup actually be frozen? */ bool e_freeze; /* Fields below are protected by css_set_lock */ /* Number of frozen descendant cgroups */ int nr_frozen_descendants; /* * Number of tasks, which are counted as frozen: * frozen, SIGSTOPped, and PTRACEd. */ int nr_frozen_tasks; }; struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; unsigned long flags; /* "unsigned long" so bitops work */ /* * The depth this cgroup is at. The root is at depth zero and each * step down the hierarchy increments the level. This along with * ancestors[] can determine whether a given cgroup is a * descendant of another without traversing the hierarchy. */ int level; /* Maximum allowed descent tree depth */ int max_depth; /* * Keep track of total numbers of visible and dying descent cgroups. * Dying cgroups are cgroups which were deleted by a user, * but are still existing because someone else is holding a reference. * max_descendants is a maximum allowed number of descent cgroups. * * nr_descendants and nr_dying_descendants are protected * by cgroup_mutex and css_set_lock. It's fine to read them holding * any of cgroup_mutex and css_set_lock; for writing both locks * should be held. */ int nr_descendants; int nr_dying_descendants; int max_descendants; /* * Each non-empty css_set associated with this cgroup contributes * one to nr_populated_csets. The counter is zero iff this cgroup * doesn't have any tasks. * * All children which have non-zero nr_populated_csets and/or * nr_populated_children of their own contribute one to either * nr_populated_domain_children or nr_populated_threaded_children * depending on their type. Each counter is zero iff all cgroups * of the type in the subtree proper don't have any tasks. */ int nr_populated_csets; int nr_populated_domain_children; int nr_populated_threaded_children; int nr_threaded_children; /* # of live threaded child cgroups */ struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ /* handles for "{cpu,memory,io,irq}.pressure" */ struct cgroup_file psi_files[NR_PSI_RESOURCES]; /* * The bitmask of subsystems enabled on the child cgroups. * ->subtree_control is the one configured through * "cgroup.subtree_control" while ->subtree_ss_mask is the effective * one which may have more subsystems enabled. Controller knobs * are made available iff it's enabled in ->subtree_control. */ u16 subtree_control; u16 subtree_ss_mask; u16 old_subtree_control; u16 old_subtree_ss_mask; /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; /* * Keep track of total number of dying CSSes at and below this cgroup. * Protected by cgroup_mutex. */ int nr_dying_subsys[CGROUP_SUBSYS_COUNT]; struct cgroup_root *root; /* * List of cgrp_cset_links pointing at css_sets with tasks in this * cgroup. Protected by css_set_lock. */ struct list_head cset_links; /* * On the default hierarchy, a css_set for a cgroup with some * susbsys disabled will point to css's which are associated with * the closest ancestor which has the subsys enabled. The * following lists all css_sets which point to this cgroup's css * for the given subsystem. */ struct list_head e_csets[CGROUP_SUBSYS_COUNT]; /* * If !threaded, self. If threaded, it points to the nearest * domain ancestor. Inside a threaded subtree, cgroups are exempt * from process granularity and no-internal-task constraint. * Domain level resource consumptions which aren't tied to a * specific task are charged to the dom_cgrp. */ struct cgroup *dom_cgrp; struct cgroup *old_dom_cgrp; /* used while enabling threaded */ /* per-cpu recursive resource statistics */ struct cgroup_rstat_cpu __percpu *rstat_cpu; struct list_head rstat_css_list; /* * Add padding to separate the read mostly rstat_cpu and * rstat_css_list into a different cacheline from the following * rstat_flush_next and *bstat fields which can have frequent updates. */ CACHELINE_PADDING(_pad_); /* * A singly-linked list of cgroup structures to be rstat flushed. * This is a scratch field to be used exclusively by * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock. */ struct cgroup *rstat_flush_next; /* cgroup basic resource statistics */ struct cgroup_base_stat last_bstat; struct cgroup_base_stat bstat; struct prev_cputime prev_cputime; /* for printing out cputime */ /* * list of pidlists, up to two for each namespace (one for procs, one * for tasks); created on demand. */ struct list_head pidlists; struct mutex pidlist_mutex; /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; /* used to schedule release agent */ struct work_struct release_agent_work; /* used to track pressure stalls */ struct psi_group *psi; /* used to store eBPF programs */ struct cgroup_bpf bpf; /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *bpf_cgrp_storage; #endif /* All ancestors including self */ struct cgroup *ancestors[]; }; /* * A cgroup_root represents the root of a cgroup hierarchy, and may be * associated with a kernfs_root to form an active hierarchy. This is * internal to cgroup core. Don't access directly from controllers. */ struct cgroup_root { struct kernfs_root *kf_root; /* The bitmask of subsystems attached to this hierarchy */ unsigned int subsys_mask; /* Unique id for this hierarchy. */ int hierarchy_id; /* A list running through the active hierarchies */ struct list_head root_list; struct rcu_head rcu; /* Must be near the top */ /* * The root cgroup. The containing cgroup_root will be destroyed on its * release. cgrp->ancestors[0] will be used overflowing into the * following field. cgrp_ancestor_storage must immediately follow. */ struct cgroup cgrp; /* must follow cgrp for cgrp->ancestors[0], see above */ struct cgroup *cgrp_ancestor_storage; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; /* Hierarchy-specific flags */ unsigned int flags; /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; /* The name for this hierarchy - may be empty */ char name[MAX_CGROUP_ROOT_NAMELEN]; }; /* * struct cftype: handler definitions for cgroup control files * * When reading/writing to a file: * - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata * - the 'cftype' of the file is file->f_path.dentry->d_fsdata */ struct cftype { /* * By convention, the name should begin with the name of the * subsystem, followed by a period. Zero length string indicates * end of cftype array. */ char name[MAX_CFTYPE_NAME]; unsigned long private; /* * The maximum length of string, excluding trailing nul, that can * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. */ size_t max_write_len; /* CFTYPE_* flags */ unsigned int flags; /* * If non-zero, should contain the offset from the start of css to * a struct cgroup_file field. cgroup will record the handle of * the created file into it. The recorded handle can be used as * long as the containing css remains accessible. */ unsigned int file_offset; /* * Fields used for internal bookkeeping. Initialized automatically * during registration. */ struct cgroup_subsys *ss; /* NULL for cgroup core files */ struct list_head node; /* anchored at ss->cfts */ struct kernfs_ops *kf_ops; int (*open)(struct kernfs_open_file *of); void (*release)(struct kernfs_open_file *of); /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() */ u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); /* * read_s64() is a signed version of read_u64() */ s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); /* generic seq_file read interface */ int (*seq_show)(struct seq_file *sf, void *v); /* optional ops, implement all or none */ void *(*seq_start)(struct seq_file *sf, loff_t *ppos); void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); void (*seq_stop)(struct seq_file *sf, void *v); /* * write_u64() is a shortcut for the common case of accepting * a single integer (as parsed by simple_strtoull) from * userspace. Use in place of write(); return 0 or error. */ int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, u64 val); /* * write_s64() is a signed version of write_u64() */ int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, s64 val); /* * write() is the generic write callback which maps directly to * kernfs write operation and overrides all other operations. * Maximum write size is determined by ->max_write_len. Use * of_css/cft() to access the associated css and cft. */ ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt); struct lock_class_key lockdep_key; }; /* * Control Group subsystem type. * See Documentation/admin-guide/cgroup-v1/cgroups.rst for details */ struct cgroup_subsys { struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); int (*css_online)(struct cgroup_subsys_state *css); void (*css_offline)(struct cgroup_subsys_state *css); void (*css_released)(struct cgroup_subsys_state *css); void (*css_free)(struct cgroup_subsys_state *css); void (*css_reset)(struct cgroup_subsys_state *css); void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu); int (*css_extra_stat_show)(struct seq_file *seq, struct cgroup_subsys_state *css); int (*css_local_stat_show)(struct seq_file *seq, struct cgroup_subsys_state *css); int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); void (*post_attach)(void); int (*can_fork)(struct task_struct *task, struct css_set *cset); void (*cancel_fork)(struct task_struct *task, struct css_set *cset); void (*fork)(struct task_struct *task); void (*exit)(struct task_struct *task); void (*release)(struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); bool early_init:1; /* * If %true, the controller, on the default hierarchy, doesn't show * up in "cgroup.controllers" or "cgroup.subtree_control", is * implicitly enabled on all cgroups on the default hierarchy, and * bypasses the "no internal process" constraint. This is for * utility type controllers which is transparent to userland. * * An implicit controller can be stolen from the default hierarchy * anytime and thus must be okay with offline csses from previous * hierarchies coexisting with csses for the current one. */ bool implicit_on_dfl:1; /* * If %true, the controller, supports threaded mode on the default * hierarchy. In a threaded subtree, both process granularity and * no-internal-process constraint are ignored and a threaded * controllers should be able to handle that. * * Note that as an implicit controller is automatically enabled on * all cgroups on the default hierarchy, it should also be * threaded. implicit && !threaded is not supported. */ bool threaded:1; /* the following two fields are initialized automatically during boot */ int id; const char *name; /* optional, initialized automatically during boot if not set */ const char *legacy_name; /* link to parent, protected by cgroup_lock() */ struct cgroup_root *root; /* idr for css->id */ struct idr css_idr; /* * List of cftypes. Each entry is the first entry of an array * terminated by zero length name. */ struct list_head cfts; /* * Base cftypes which are automatically registered. The two can * point to the same array. */ struct cftype *dfl_cftypes; /* for the default hierarchy */ struct cftype *legacy_cftypes; /* for the legacy hierarchies */ /* * A subsystem may depend on other subsystems. When such subsystem * is enabled on a cgroup, the depended-upon subsystems are enabled * together if available. Subsystems enabled due to dependency are * not visible to userland until explicitly enabled. The following * specifies the mask of subsystems that this one depends on. */ unsigned int depends_on; }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; struct cgroup_of_peak { unsigned long value; struct list_head list; }; /** * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups * @tsk: target task * * Allows cgroup operations to synchronize against threadgroup changes * using a percpu_rw_semaphore. */ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { percpu_down_read(&cgroup_threadgroup_rwsem); } /** * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups * @tsk: target task * * Counterpart of cgroup_threadcgroup_change_begin(). */ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) { percpu_up_read(&cgroup_threadgroup_rwsem); } #else /* CONFIG_CGROUPS */ #define CGROUP_SUBSYS_COUNT 0 static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { might_sleep(); } static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #endif /* CONFIG_CGROUPS */ #ifdef CONFIG_SOCK_CGROUP_DATA /* * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains * per-socket cgroup information except for memcg association. * * On legacy hierarchies, net_prio and net_cls controllers directly * set attributes on each sock which can then be tested by the network * layer. On the default hierarchy, each sock is associated with the * cgroup it was created in and the networking layer can match the * cgroup directly. */ struct sock_cgroup_data { struct cgroup *cgroup; /* v2 */ #ifdef CONFIG_CGROUP_NET_CLASSID u32 classid; /* v1 */ #endif #ifdef CONFIG_CGROUP_NET_PRIO u16 prioidx; /* v1 */ #endif }; static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) { #ifdef CONFIG_CGROUP_NET_PRIO return READ_ONCE(skcd->prioidx); #else return 1; #endif } static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { #ifdef CONFIG_CGROUP_NET_CLASSID return READ_ONCE(skcd->classid); #else return 0; #endif } static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { #ifdef CONFIG_CGROUP_NET_PRIO WRITE_ONCE(skcd->prioidx, prioidx); #endif } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { #ifdef CONFIG_CGROUP_NET_CLASSID WRITE_ONCE(skcd->classid, classid); #endif } #else /* CONFIG_SOCK_CGROUP_DATA */ struct sock_cgroup_data { }; #endif /* CONFIG_SOCK_CGROUP_DATA */ #endif /* _LINUX_CGROUP_DEFS_H */
8 8 201 200 246 137 137 133 136 137 137 136 137 137 224 223 201 200 201 201 221 223 2 5 5 223 32 224 218 218 220 10 8 8 219 27 221 224 220 220 219 217 8 35 36 36 223 31 32 32 32 30 27 8 8 27 27 224 218 36 36 36 36 224 224 223 224 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 2 7 23 26 26 26 3 26 26 26 26 5 5 26 26 26 5 24 24 1 24 24 5 24 26 26 26 5 26 26 26 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 Intel Corp. * * This file is part of the SCTP kernel implementation * * These functions implement the sctp_outq class. The outqueue handles * bundling and queueing of outgoing SCTP chunks. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Perry Melange <pmelange@null.cc.uic.edu> * Xingang Guo <xingang.guo@intel.com> * Hui Huang <hui.huang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Jon Grimm <jgrimm@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/list.h> /* For struct list_head */ #include <linux/socket.h> #include <linux/ip.h> #include <linux/slab.h> #include <net/sock.h> /* For skb_set_owner_w */ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> #include <trace/events/sctp.h> /* Declare internal functions here. */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); static void sctp_check_transmitted(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, union sctp_addr *saddr, struct sctp_sackhdr *sack, __u32 *highest_new_tsn); static void sctp_mark_missing(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, __u32 highest_new_tsn, int count_of_newacks); static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream_out_ext *oute; __u16 stream; list_add(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; stream = sctp_chunk_stream_no(ch); oute = SCTP_SO(&q->asoc->stream, stream)->ext; list_add(&ch->stream_list, &oute->outq); } /* Take data from the front of the queue. */ static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) { return q->sched->dequeue(q); } /* Add data chunk to the end of the queue. */ static inline void sctp_outq_tail_data(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream_out_ext *oute; __u16 stream; list_add_tail(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; stream = sctp_chunk_stream_no(ch); oute = SCTP_SO(&q->asoc->stream, stream)->ext; list_add_tail(&ch->stream_list, &oute->outq); } /* * SFR-CACC algorithm: * D) If count_of_newacks is greater than or equal to 2 * and t was not sent to the current primary then the * sender MUST NOT increment missing report count for t. */ static inline int sctp_cacc_skip_3_1_d(struct sctp_transport *primary, struct sctp_transport *transport, int count_of_newacks) { if (count_of_newacks >= 2 && transport != primary) return 1; return 0; } /* * SFR-CACC algorithm: * F) If count_of_newacks is less than 2, let d be the * destination to which t was sent. If cacc_saw_newack * is 0 for destination d, then the sender MUST NOT * increment missing report count for t. */ static inline int sctp_cacc_skip_3_1_f(struct sctp_transport *transport, int count_of_newacks) { if (count_of_newacks < 2 && (transport && !transport->cacc.cacc_saw_newack)) return 1; return 0; } /* * SFR-CACC algorithm: * 3.1) If CYCLING_CHANGEOVER is 0, the sender SHOULD * execute steps C, D, F. * * C has been implemented in sctp_outq_sack */ static inline int sctp_cacc_skip_3_1(struct sctp_transport *primary, struct sctp_transport *transport, int count_of_newacks) { if (!primary->cacc.cycling_changeover) { if (sctp_cacc_skip_3_1_d(primary, transport, count_of_newacks)) return 1; if (sctp_cacc_skip_3_1_f(transport, count_of_newacks)) return 1; return 0; } return 0; } /* * SFR-CACC algorithm: * 3.2) Else if CYCLING_CHANGEOVER is 1, and t is less * than next_tsn_at_change of the current primary, then * the sender MUST NOT increment missing report count * for t. */ static inline int sctp_cacc_skip_3_2(struct sctp_transport *primary, __u32 tsn) { if (primary->cacc.cycling_changeover && TSN_lt(tsn, primary->cacc.next_tsn_at_change)) return 1; return 0; } /* * SFR-CACC algorithm: * 3) If the missing report count for TSN t is to be * incremented according to [RFC2960] and * [SCTP_STEWART-2002], and CHANGEOVER_ACTIVE is set, * then the sender MUST further execute steps 3.1 and * 3.2 to determine if the missing report count for * TSN t SHOULD NOT be incremented. * * 3.3) If 3.1 and 3.2 do not dictate that the missing * report count for t should not be incremented, then * the sender SHOULD increment missing report count for * t (according to [RFC2960] and [SCTP_STEWART_2002]). */ static inline int sctp_cacc_skip(struct sctp_transport *primary, struct sctp_transport *transport, int count_of_newacks, __u32 tsn) { if (primary->cacc.changeover_active && (sctp_cacc_skip_3_1(primary, transport, count_of_newacks) || sctp_cacc_skip_3_2(primary, tsn))) return 1; return 0; } /* Initialize an existing sctp_outq. This does the boring stuff. * You still need to define handlers if you really want to DO * something with this structure... */ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) { memset(q, 0, sizeof(struct sctp_outq)); q->asoc = asoc; INIT_LIST_HEAD(&q->out_chunk_list); INIT_LIST_HEAD(&q->control_chunk_list); INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); sctp_sched_set_sched(asoc, sctp_sk(asoc->base.sk)->default_ss); } /* Free the outqueue structure and any related pending chunks. */ static void __sctp_outq_teardown(struct sctp_outq *q) { struct sctp_transport *transport; struct list_head *lchunk, *temp; struct sctp_chunk *chunk, *tmp; /* Throw away unacknowledged chunks. */ list_for_each_entry(transport, &q->asoc->peer.transport_addr_list, transports) { while ((lchunk = sctp_list_dequeue(&transport->transmitted)) != NULL) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); /* Mark as part of a failed message. */ sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } } /* Throw away chunks that have been gap ACKed. */ list_for_each_safe(lchunk, temp, &q->sacked) { list_del_init(lchunk); chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any chunks in the retransmit queue. */ list_for_each_safe(lchunk, temp, &q->retransmit) { list_del_init(lchunk); chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any chunks that are in the abandoned queue. */ list_for_each_safe(lchunk, temp, &q->abandoned) { list_del_init(lchunk); chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any leftover data chunks. */ while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { sctp_sched_dequeue_done(q, chunk); /* Mark as send failure. */ sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any leftover control chunks. */ list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) { list_del_init(&chunk->list); sctp_chunk_free(chunk); } } void sctp_outq_teardown(struct sctp_outq *q) { __sctp_outq_teardown(q); sctp_outq_init(q->asoc, q); } /* Free the outqueue structure and any related pending chunks. */ void sctp_outq_free(struct sctp_outq *q) { /* Throw away leftover chunks. */ __sctp_outq_teardown(q); } /* Put a new chunk in an sctp_outq. */ void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) { struct net *net = q->asoc->base.net; pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk"); /* If it is data, queue it up, otherwise, send it * immediately. */ if (sctp_chunk_is_data(chunk)) { pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk"); sctp_outq_tail_data(q, chunk); if (chunk->asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) chunk->asoc->sent_cnt_removable++; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); else SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS); } else { list_add_tail(&chunk->list, &q->control_chunk_list); SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); } if (!q->cork) sctp_outq_flush(q, 0, gfp); } /* Insert a chunk into the sorted list based on the TSNs. The retransmit list * and the abandoned list are in ascending order. */ static void sctp_insert_list(struct list_head *head, struct list_head *new) { struct list_head *pos; struct sctp_chunk *nchunk, *lchunk; __u32 ntsn, ltsn; int done = 0; nchunk = list_entry(new, struct sctp_chunk, transmitted_list); ntsn = ntohl(nchunk->subh.data_hdr->tsn); list_for_each(pos, head) { lchunk = list_entry(pos, struct sctp_chunk, transmitted_list); ltsn = ntohl(lchunk->subh.data_hdr->tsn); if (TSN_lt(ntsn, ltsn)) { list_add(new, pos->prev); done = 1; break; } } if (!done) list_add_tail(new, head); } static int sctp_prsctp_prune_sent(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct list_head *queue, int msg_len) { struct sctp_chunk *chk, *temp; list_for_each_entry_safe(chk, temp, queue, transmitted_list) { struct sctp_stream_out *streamout; if (!chk->msg->abandoned && (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; chk->msg->abandoned = 1; list_del_init(&chk->transmitted_list); sctp_insert_list(&asoc->outqueue.abandoned, &chk->transmitted_list); streamout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; if (queue != &asoc->outqueue.retransmit && !chk->tsn_gap_acked) { if (chk->transport) chk->transport->flight_size -= sctp_data_size(chk); asoc->outqueue.outstanding_bytes -= sctp_data_size(chk); } msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk); if (msg_len <= 0) break; } return msg_len; } static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, int msg_len) { struct sctp_outq *q = &asoc->outqueue; struct sctp_chunk *chk, *temp; struct sctp_stream_out *sout; q->sched->unsched_all(&asoc->stream); list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { if (!chk->msg->abandoned && (!(chk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) || !SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; chk->msg->abandoned = 1; sctp_sched_dequeue_common(q, chk); asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; sout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); sout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; /* clear out_curr if all frag chunks are pruned */ if (asoc->stream.out_curr == sout && list_is_last(&chk->frag_list, &chk->msg->chunks)) asoc->stream.out_curr = NULL; msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk); sctp_chunk_free(chk); if (msg_len <= 0) break; } q->sched->sched_all(&asoc->stream); return msg_len; } /* Abandon the chunks according their priorities */ void sctp_prsctp_prune(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, int msg_len) { struct sctp_transport *transport; if (!asoc->peer.prsctp_capable || !asoc->sent_cnt_removable) return; msg_len = sctp_prsctp_prune_sent(asoc, sinfo, &asoc->outqueue.retransmit, msg_len); if (msg_len <= 0) return; list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { msg_len = sctp_prsctp_prune_sent(asoc, sinfo, &transport->transmitted, msg_len); if (msg_len <= 0) return; } sctp_prsctp_prune_unsent(asoc, sinfo, msg_len); } /* Mark all the eligible packets on a transport for retransmission. */ void sctp_retransmit_mark(struct sctp_outq *q, struct sctp_transport *transport, __u8 reason) { struct list_head *lchunk, *ltemp; struct sctp_chunk *chunk; /* Walk through the specified transmitted queue. */ list_for_each_safe(lchunk, ltemp, &transport->transmitted) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); /* If the chunk is abandoned, move it to abandoned list. */ if (sctp_chunk_abandoned(chunk)) { list_del_init(lchunk); sctp_insert_list(&q->abandoned, lchunk); /* If this chunk has not been previousely acked, * stop considering it 'outstanding'. Our peer * will most likely never see it since it will * not be retransmitted */ if (!chunk->tsn_gap_acked) { if (chunk->transport) chunk->transport->flight_size -= sctp_data_size(chunk); q->outstanding_bytes -= sctp_data_size(chunk); q->asoc->peer.rwnd += sctp_data_size(chunk); } continue; } /* If we are doing retransmission due to a timeout or pmtu * discovery, only the chunks that are not yet acked should * be added to the retransmit queue. */ if ((reason == SCTP_RTXR_FAST_RTX && (chunk->fast_retransmit == SCTP_NEED_FRTX)) || (reason != SCTP_RTXR_FAST_RTX && !chunk->tsn_gap_acked)) { /* RFC 2960 6.2.1 Processing a Received SACK * * C) Any time a DATA chunk is marked for * retransmission (via either T3-rtx timer expiration * (Section 6.3.3) or via fast retransmit * (Section 7.2.4)), add the data size of those * chunks to the rwnd. */ q->asoc->peer.rwnd += sctp_data_size(chunk); q->outstanding_bytes -= sctp_data_size(chunk); if (chunk->transport) transport->flight_size -= sctp_data_size(chunk); /* sctpimpguide-05 Section 2.8.2 * M5) If a T3-rtx timer expires, the * 'TSN.Missing.Report' of all affected TSNs is set * to 0. */ chunk->tsn_missing_report = 0; /* If a chunk that is being used for RTT measurement * has to be retransmitted, we cannot use this chunk * anymore for RTT measurements. Reset rto_pending so * that a new RTT measurement is started when a new * data chunk is sent. */ if (chunk->rtt_in_progress) { chunk->rtt_in_progress = 0; transport->rto_pending = 0; } /* Move the chunk to the retransmit queue. The chunks * on the retransmit queue are always kept in order. */ list_del_init(lchunk); sctp_insert_list(&q->retransmit, lchunk); } } pr_debug("%s: transport:%p, reason:%d, cwnd:%d, ssthresh:%d, " "flight_size:%d, pba:%d\n", __func__, transport, reason, transport->cwnd, transport->ssthresh, transport->flight_size, transport->partial_bytes_acked); } /* Mark all the eligible packets on a transport for retransmission and force * one packet out. */ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, enum sctp_retransmit_reason reason) { struct net *net = q->asoc->base.net; switch (reason) { case SCTP_RTXR_T3_RTX: SCTP_INC_STATS(net, SCTP_MIB_T3_RETRANSMITS); sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_T3_RTX); /* Update the retran path if the T3-rtx timer has expired for * the current retran path. */ if (transport == transport->asoc->peer.retran_path) sctp_assoc_update_retran_path(transport->asoc); transport->asoc->rtx_data_chunks += transport->asoc->unack_data; if (transport->pl.state == SCTP_PL_COMPLETE && transport->asoc->unack_data) sctp_transport_reset_probe_timer(transport); break; case SCTP_RTXR_FAST_RTX: SCTP_INC_STATS(net, SCTP_MIB_FAST_RETRANSMITS); sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_FAST_RTX); q->fast_rtx = 1; break; case SCTP_RTXR_PMTUD: SCTP_INC_STATS(net, SCTP_MIB_PMTUD_RETRANSMITS); break; case SCTP_RTXR_T1_RTX: SCTP_INC_STATS(net, SCTP_MIB_T1_RETRANSMITS); transport->asoc->init_retries++; break; default: BUG(); } sctp_retransmit_mark(q, transport, reason); /* PR-SCTP A5) Any time the T3-rtx timer expires, on any destination, * the sender SHOULD try to advance the "Advanced.Peer.Ack.Point" by * following the procedures outlined in C1 - C5. */ if (reason == SCTP_RTXR_T3_RTX) q->asoc->stream.si->generate_ftsn(q, q->asoc->ctsn_ack_point); /* Flush the queues only on timeout, since fast_rtx is only * triggered during sack processing and the queue * will be flushed at the end. */ if (reason != SCTP_RTXR_FAST_RTX) sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC); } /* * Transmit DATA chunks on the retransmit queue. Upon return from * __sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which * need to be transmitted by the caller. * We assume that pkt->transport has already been set. * * The return value is a normal kernel error return value. */ static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, int rtx_timeout, int *start_timer, gfp_t gfp) { struct sctp_transport *transport = pkt->transport; struct sctp_chunk *chunk, *chunk1; struct list_head *lqueue; enum sctp_xmit status; int error = 0; int timer = 0; int done = 0; int fast_rtx; lqueue = &q->retransmit; fast_rtx = q->fast_rtx; /* This loop handles time-out retransmissions, fast retransmissions, * and retransmissions due to opening of whindow. * * RFC 2960 6.3.3 Handle T3-rtx Expiration * * E3) Determine how many of the earliest (i.e., lowest TSN) * outstanding DATA chunks for the address for which the * T3-rtx has expired will fit into a single packet, subject * to the MTU constraint for the path corresponding to the * destination transport address to which the retransmission * is being sent (this may be different from the address for * which the timer expires [see Section 6.4]). Call this value * K. Bundle and retransmit those K DATA chunks in a single * packet to the destination endpoint. * * [Just to be painfully clear, if we are retransmitting * because a timeout just happened, we should send only ONE * packet of retransmitted data.] * * For fast retransmissions we also send only ONE packet. However, * if we are just flushing the queue due to open window, we'll * try to send as much as possible. */ list_for_each_entry_safe(chunk, chunk1, lqueue, transmitted_list) { /* If the chunk is abandoned, move it to abandoned list. */ if (sctp_chunk_abandoned(chunk)) { list_del_init(&chunk->transmitted_list); sctp_insert_list(&q->abandoned, &chunk->transmitted_list); continue; } /* Make sure that Gap Acked TSNs are not retransmitted. A * simple approach is just to move such TSNs out of the * way and into a 'transmitted' queue and skip to the * next chunk. */ if (chunk->tsn_gap_acked) { list_move_tail(&chunk->transmitted_list, &transport->transmitted); continue; } /* If we are doing fast retransmit, ignore non-fast_rtransmit * chunks */ if (fast_rtx && !chunk->fast_retransmit) continue; redo: /* Attempt to append this chunk to the packet. */ status = sctp_packet_append_chunk(pkt, chunk); switch (status) { case SCTP_XMIT_PMTU_FULL: if (!pkt->has_data && !pkt->has_cookie_echo) { /* If this packet did not contain DATA then * retransmission did not happen, so do it * again. We'll ignore the error here since * control chunks are already freed so there * is nothing we can do. */ sctp_packet_transmit(pkt, gfp); goto redo; } /* Send this packet. */ error = sctp_packet_transmit(pkt, gfp); /* If we are retransmitting, we should only * send a single packet. * Otherwise, try appending this chunk again. */ if (rtx_timeout || fast_rtx) done = 1; else goto redo; /* Bundle next chunk in the next round. */ break; case SCTP_XMIT_RWND_FULL: /* Send this packet. */ error = sctp_packet_transmit(pkt, gfp); /* Stop sending DATA as there is no more room * at the receiver. */ done = 1; break; case SCTP_XMIT_DELAY: /* Send this packet. */ error = sctp_packet_transmit(pkt, gfp); /* Stop sending DATA because of nagle delay. */ done = 1; break; default: /* The append was successful, so add this chunk to * the transmitted list. */ list_move_tail(&chunk->transmitted_list, &transport->transmitted); /* Mark the chunk as ineligible for fast retransmit * after it is retransmitted. */ if (chunk->fast_retransmit == SCTP_NEED_FRTX) chunk->fast_retransmit = SCTP_DONT_FRTX; q->asoc->stats.rtxchunks++; break; } /* Set the timer if there were no errors */ if (!error && !timer) timer = 1; if (done) break; } /* If we are here due to a retransmit timeout or a fast * retransmit and if there are any chunks left in the retransmit * queue that could not fit in the PMTU sized packet, they need * to be marked as ineligible for a subsequent fast retransmit. */ if (rtx_timeout || fast_rtx) { list_for_each_entry(chunk1, lqueue, transmitted_list) { if (chunk1->fast_retransmit == SCTP_NEED_FRTX) chunk1->fast_retransmit = SCTP_DONT_FRTX; } } *start_timer = timer; /* Clear fast retransmit hint */ if (fast_rtx) q->fast_rtx = 0; return error; } /* Cork the outqueue so queued chunks are really queued. */ void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) { if (q->cork) q->cork = 0; sctp_outq_flush(q, 0, gfp); } static int sctp_packet_singleton(struct sctp_transport *transport, struct sctp_chunk *chunk, gfp_t gfp) { const struct sctp_association *asoc = transport->asoc; const __u16 sport = asoc->base.bind_addr.port; const __u16 dport = asoc->peer.port; const __u32 vtag = asoc->peer.i.init_tag; struct sctp_packet singleton; sctp_packet_init(&singleton, transport, sport, dport); sctp_packet_config(&singleton, vtag, 0); if (sctp_packet_append_chunk(&singleton, chunk) != SCTP_XMIT_OK) { list_del_init(&chunk->list); sctp_chunk_free(chunk); return -ENOMEM; } return sctp_packet_transmit(&singleton, gfp); } /* Struct to hold the context during sctp outq flush */ struct sctp_flush_ctx { struct sctp_outq *q; /* Current transport being used. It's NOT the same as curr active one */ struct sctp_transport *transport; /* These transports have chunks to send. */ struct list_head transport_list; struct sctp_association *asoc; /* Packet on the current transport above */ struct sctp_packet *packet; gfp_t gfp; }; /* transport: current transport */ static void sctp_outq_select_transport(struct sctp_flush_ctx *ctx, struct sctp_chunk *chunk) { struct sctp_transport *new_transport = chunk->transport; if (!new_transport) { if (!sctp_chunk_is_data(chunk)) { /* If we have a prior transport pointer, see if * the destination address of the chunk * matches the destination address of the * current transport. If not a match, then * try to look up the transport with a given * destination address. We do this because * after processing ASCONFs, we may have new * transports created. */ if (ctx->transport && sctp_cmp_addr_exact(&chunk->dest, &ctx->transport->ipaddr)) new_transport = ctx->transport; else new_transport = sctp_assoc_lookup_paddr(ctx->asoc, &chunk->dest); } /* if we still don't have a new transport, then * use the current active path. */ if (!new_transport) new_transport = ctx->asoc->peer.active_path; } else { __u8 type; switch (new_transport->state) { case SCTP_INACTIVE: case SCTP_UNCONFIRMED: case SCTP_PF: /* If the chunk is Heartbeat or Heartbeat Ack, * send it to chunk->transport, even if it's * inactive. * * 3.3.6 Heartbeat Acknowledgement: * ... * A HEARTBEAT ACK is always sent to the source IP * address of the IP datagram containing the * HEARTBEAT chunk to which this ack is responding. * ... * * ASCONF_ACKs also must be sent to the source. */ type = chunk->chunk_hdr->type; if (type != SCTP_CID_HEARTBEAT && type != SCTP_CID_HEARTBEAT_ACK && type != SCTP_CID_ASCONF_ACK) new_transport = ctx->asoc->peer.active_path; break; default: break; } } /* Are we switching transports? Take care of transport locks. */ if (new_transport != ctx->transport) { ctx->transport = new_transport; ctx->packet = &ctx->transport->packet; if (list_empty(&ctx->transport->send_ready)) list_add_tail(&ctx->transport->send_ready, &ctx->transport_list); sctp_packet_config(ctx->packet, ctx->asoc->peer.i.init_tag, ctx->asoc->peer.ecn_capable); /* We've switched transports, so apply the * Burst limit to the new transport. */ sctp_transport_burst_limited(ctx->transport); } } static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx) { struct sctp_chunk *chunk, *tmp; enum sctp_xmit status; int one_packet, error; list_for_each_entry_safe(chunk, tmp, &ctx->q->control_chunk_list, list) { one_packet = 0; /* RFC 5061, 5.3 * F1) This means that until such time as the ASCONF * containing the add is acknowledged, the sender MUST * NOT use the new IP address as a source for ANY SCTP * packet except on carrying an ASCONF Chunk. */ if (ctx->asoc->src_out_of_asoc_ok && chunk->chunk_hdr->type != SCTP_CID_ASCONF) continue; list_del_init(&chunk->list); /* Pick the right transport to use. Should always be true for * the first chunk as we don't have a transport by then. */ sctp_outq_select_transport(ctx, chunk); switch (chunk->chunk_hdr->type) { /* 6.10 Bundling * ... * An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN * COMPLETE with any other chunks. [Send them immediately.] */ case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: case SCTP_CID_SHUTDOWN_COMPLETE: error = sctp_packet_singleton(ctx->transport, chunk, ctx->gfp); if (error < 0) { ctx->asoc->base.sk->sk_err = -error; return; } ctx->asoc->stats.octrlchunks++; break; case SCTP_CID_ABORT: if (sctp_test_T_bit(chunk)) ctx->packet->vtag = ctx->asoc->c.my_vtag; fallthrough; /* The following chunks are "response" chunks, i.e. * they are generated in response to something we * received. If we are sending these, then we can * send only 1 packet containing these chunks. */ case SCTP_CID_HEARTBEAT_ACK: case SCTP_CID_SHUTDOWN_ACK: case SCTP_CID_COOKIE_ACK: case SCTP_CID_COOKIE_ECHO: case SCTP_CID_ERROR: case SCTP_CID_ECN_CWR: case SCTP_CID_ASCONF_ACK: one_packet = 1; fallthrough; case SCTP_CID_HEARTBEAT: if (chunk->pmtu_probe) { error = sctp_packet_singleton(ctx->transport, chunk, ctx->gfp); if (!error) ctx->asoc->stats.octrlchunks++; break; } fallthrough; case SCTP_CID_SACK: case SCTP_CID_SHUTDOWN: case SCTP_CID_ECN_ECNE: case SCTP_CID_ASCONF: case SCTP_CID_FWD_TSN: case SCTP_CID_I_FWD_TSN: case SCTP_CID_RECONF: status = sctp_packet_transmit_chunk(ctx->packet, chunk, one_packet, ctx->gfp); if (status != SCTP_XMIT_OK) { /* put the chunk back */ list_add(&chunk->list, &ctx->q->control_chunk_list); break; } ctx->asoc->stats.octrlchunks++; /* PR-SCTP C5) If a FORWARD TSN is sent, the * sender MUST assure that at least one T3-rtx * timer is running. */ if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN || chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) { sctp_transport_reset_t3_rtx(ctx->transport); ctx->transport->last_time_sent = jiffies; } if (chunk == ctx->asoc->strreset_chunk) sctp_transport_reset_reconf_timer(ctx->transport); break; default: /* We built a chunk with an illegal type! */ BUG(); } } } /* Returns false if new data shouldn't be sent */ static bool sctp_outq_flush_rtx(struct sctp_flush_ctx *ctx, int rtx_timeout) { int error, start_timer = 0; if (ctx->asoc->peer.retran_path->state == SCTP_UNCONFIRMED) return false; if (ctx->transport != ctx->asoc->peer.retran_path) { /* Switch transports & prepare the packet. */ ctx->transport = ctx->asoc->peer.retran_path; ctx->packet = &ctx->transport->packet; if (list_empty(&ctx->transport->send_ready)) list_add_tail(&ctx->transport->send_ready, &ctx->transport_list); sctp_packet_config(ctx->packet, ctx->asoc->peer.i.init_tag, ctx->asoc->peer.ecn_capable); } error = __sctp_outq_flush_rtx(ctx->q, ctx->packet, rtx_timeout, &start_timer, ctx->gfp); if (error < 0) ctx->asoc->base.sk->sk_err = -error; if (start_timer) { sctp_transport_reset_t3_rtx(ctx->transport); ctx->transport->last_time_sent = jiffies; } /* This can happen on COOKIE-ECHO resend. Only * one chunk can get bundled with a COOKIE-ECHO. */ if (ctx->packet->has_cookie_echo) return false; /* Don't send new data if there is still data * waiting to retransmit. */ if (!list_empty(&ctx->q->retransmit)) return false; return true; } static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, int rtx_timeout) { struct sctp_chunk *chunk; enum sctp_xmit status; /* Is it OK to send data chunks? */ switch (ctx->asoc->state) { case SCTP_STATE_COOKIE_ECHOED: /* Only allow bundling when this packet has a COOKIE-ECHO * chunk. */ if (!ctx->packet || !ctx->packet->has_cookie_echo) return; fallthrough; case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: case SCTP_STATE_SHUTDOWN_RECEIVED: break; default: /* Do nothing. */ return; } /* RFC 2960 6.1 Transmission of DATA Chunks * * C) When the time comes for the sender to transmit, * before sending new DATA chunks, the sender MUST * first transmit any outstanding DATA chunks which * are marked for retransmission (limited by the * current cwnd). */ if (!list_empty(&ctx->q->retransmit) && !sctp_outq_flush_rtx(ctx, rtx_timeout)) return; /* Apply Max.Burst limitation to the current transport in * case it will be used for new data. We are going to * rest it before we return, but we want to apply the limit * to the currently queued data. */ if (ctx->transport) sctp_transport_burst_limited(ctx->transport); /* Finally, transmit new packets. */ while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) { __u32 sid = ntohs(chunk->subh.data_hdr->stream); __u8 stream_state = SCTP_SO(&ctx->asoc->stream, sid)->state; /* Has this chunk expired? */ if (sctp_chunk_abandoned(chunk)) { sctp_sched_dequeue_done(ctx->q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; } if (stream_state == SCTP_STREAM_CLOSED) { sctp_outq_head_data(ctx->q, chunk); break; } sctp_outq_select_transport(ctx, chunk); pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p skb->users:%d\n", __func__, ctx->q, chunk, chunk && chunk->chunk_hdr ? sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk", ntohl(chunk->subh.data_hdr->tsn), chunk->skb ? chunk->skb->head : NULL, chunk->skb ? refcount_read(&chunk->skb->users) : -1); /* Add the chunk to the packet. */ status = sctp_packet_transmit_chunk(ctx->packet, chunk, 0, ctx->gfp); if (status != SCTP_XMIT_OK) { /* We could not append this chunk, so put * the chunk back on the output queue. */ pr_debug("%s: could not transmit tsn:0x%x, status:%d\n", __func__, ntohl(chunk->subh.data_hdr->tsn), status); sctp_outq_head_data(ctx->q, chunk); break; } /* The sender is in the SHUTDOWN-PENDING state, * The sender MAY set the I-bit in the DATA * chunk header. */ if (ctx->asoc->state == SCTP_STATE_SHUTDOWN_PENDING) chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) ctx->asoc->stats.ouodchunks++; else ctx->asoc->stats.oodchunks++; /* Only now it's safe to consider this * chunk as sent, sched-wise. */ sctp_sched_dequeue_done(ctx->q, chunk); list_add_tail(&chunk->transmitted_list, &ctx->transport->transmitted); sctp_transport_reset_t3_rtx(ctx->transport); ctx->transport->last_time_sent = jiffies; /* Only let one DATA chunk get bundled with a * COOKIE-ECHO chunk. */ if (ctx->packet->has_cookie_echo) break; } } static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx) { struct sock *sk = ctx->asoc->base.sk; struct list_head *ltransport; struct sctp_packet *packet; struct sctp_transport *t; int error = 0; while ((ltransport = sctp_list_dequeue(&ctx->transport_list)) != NULL) { t = list_entry(ltransport, struct sctp_transport, send_ready); packet = &t->packet; if (!sctp_packet_empty(packet)) { rcu_read_lock(); if (t->dst && __sk_dst_get(sk) != t->dst) { dst_hold(t->dst); sk_setup_caps(sk, t->dst); } rcu_read_unlock(); error = sctp_packet_transmit(packet, ctx->gfp); if (error < 0) ctx->q->asoc->base.sk->sk_err = -error; } /* Clear the burst limited state, if any */ sctp_transport_burst_reset(t); } } /* Try to flush an outqueue. * * Description: Send everything in q which we legally can, subject to * congestion limitations. * * Note: This function can be called from multiple contexts so appropriate * locking concerns must be made. Today we use the sock lock to protect * this function. */ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) { struct sctp_flush_ctx ctx = { .q = q, .transport = NULL, .transport_list = LIST_HEAD_INIT(ctx.transport_list), .asoc = q->asoc, .packet = NULL, .gfp = gfp, }; /* 6.10 Bundling * ... * When bundling control chunks with DATA chunks, an * endpoint MUST place control chunks first in the outbound * SCTP packet. The transmitter MUST transmit DATA chunks * within a SCTP packet in increasing order of TSN. * ... */ sctp_outq_flush_ctrl(&ctx); if (q->asoc->src_out_of_asoc_ok) goto sctp_flush_out; sctp_outq_flush_data(&ctx, rtx_timeout); sctp_flush_out: sctp_outq_flush_transports(&ctx); } /* Update unack_data based on the incoming SACK chunk */ static void sctp_sack_update_unack_data(struct sctp_association *assoc, struct sctp_sackhdr *sack) { union sctp_sack_variable *frags; __u16 unack_data; int i; unack_data = assoc->next_tsn - assoc->ctsn_ack_point - 1; frags = (union sctp_sack_variable *)(sack + 1); for (i = 0; i < ntohs(sack->num_gap_ack_blocks); i++) { unack_data -= ((ntohs(frags[i].gab.end) - ntohs(frags[i].gab.start) + 1)); } assoc->unack_data = unack_data; } /* This is where we REALLY process a SACK. * * Process the SACK against the outqueue. Mostly, this just frees * things off the transmitted queue. */ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) { struct sctp_association *asoc = q->asoc; struct sctp_sackhdr *sack = chunk->subh.sack_hdr; struct sctp_transport *transport; struct sctp_chunk *tchunk = NULL; struct list_head *lchunk, *transport_list, *temp; __u32 sack_ctsn, ctsn, tsn; __u32 highest_tsn, highest_new_tsn; __u32 sack_a_rwnd; unsigned int outstanding; struct sctp_transport *primary = asoc->peer.primary_path; int count_of_newacks = 0; int gap_ack_blocks; u8 accum_moved = 0; /* Grab the association's destination address list. */ transport_list = &asoc->peer.transport_addr_list; /* SCTP path tracepoint for congestion control debugging. */ if (trace_sctp_probe_path_enabled()) { list_for_each_entry(transport, transport_list, transports) trace_sctp_probe_path(transport, asoc); } sack_ctsn = ntohl(sack->cum_tsn_ack); gap_ack_blocks = ntohs(sack->num_gap_ack_blocks); asoc->stats.gapcnt += gap_ack_blocks; /* * SFR-CACC algorithm: * On receipt of a SACK the sender SHOULD execute the * following statements. * * 1) If the cumulative ack in the SACK passes next tsn_at_change * on the current primary, the CHANGEOVER_ACTIVE flag SHOULD be * cleared. The CYCLING_CHANGEOVER flag SHOULD also be cleared for * all destinations. * 2) If the SACK contains gap acks and the flag CHANGEOVER_ACTIVE * is set the receiver of the SACK MUST take the following actions: * * A) Initialize the cacc_saw_newack to 0 for all destination * addresses. * * Only bother if changeover_active is set. Otherwise, this is * totally suboptimal to do on every SACK. */ if (primary->cacc.changeover_active) { u8 clear_cycling = 0; if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) { primary->cacc.changeover_active = 0; clear_cycling = 1; } if (clear_cycling || gap_ack_blocks) { list_for_each_entry(transport, transport_list, transports) { if (clear_cycling) transport->cacc.cycling_changeover = 0; if (gap_ack_blocks) transport->cacc.cacc_saw_newack = 0; } } } /* Get the highest TSN in the sack. */ highest_tsn = sack_ctsn; if (gap_ack_blocks) { union sctp_sack_variable *frags = (union sctp_sack_variable *)(sack + 1); highest_tsn += ntohs(frags[gap_ack_blocks - 1].gab.end); } if (TSN_lt(asoc->highest_sacked, highest_tsn)) asoc->highest_sacked = highest_tsn; highest_new_tsn = sack_ctsn; /* Run through the retransmit queue. Credit bytes received * and free those chunks that we can. */ sctp_check_transmitted(q, &q->retransmit, NULL, NULL, sack, &highest_new_tsn); /* Run through the transmitted queue. * Credit bytes received and free those chunks which we can. * * This is a MASSIVE candidate for optimization. */ list_for_each_entry(transport, transport_list, transports) { sctp_check_transmitted(q, &transport->transmitted, transport, &chunk->source, sack, &highest_new_tsn); /* * SFR-CACC algorithm: * C) Let count_of_newacks be the number of * destinations for which cacc_saw_newack is set. */ if (transport->cacc.cacc_saw_newack) count_of_newacks++; } /* Move the Cumulative TSN Ack Point if appropriate. */ if (TSN_lt(asoc->ctsn_ack_point, sack_ctsn)) { asoc->ctsn_ack_point = sack_ctsn; accum_moved = 1; } if (gap_ack_blocks) { if (asoc->fast_recovery && accum_moved) highest_new_tsn = highest_tsn; list_for_each_entry(transport, transport_list, transports) sctp_mark_missing(q, &transport->transmitted, transport, highest_new_tsn, count_of_newacks); } /* Update unack_data field in the assoc. */ sctp_sack_update_unack_data(asoc, sack); ctsn = asoc->ctsn_ack_point; /* Throw away stuff rotting on the sack queue. */ list_for_each_safe(lchunk, temp, &q->sacked) { tchunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(tchunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(&tchunk->transmitted_list); if (asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) asoc->sent_cnt_removable--; sctp_chunk_free(tchunk); } } /* ii) Set rwnd equal to the newly received a_rwnd minus the * number of bytes still outstanding after processing the * Cumulative TSN Ack and the Gap Ack Blocks. */ sack_a_rwnd = ntohl(sack->a_rwnd); asoc->peer.zero_window_announced = !sack_a_rwnd; outstanding = q->outstanding_bytes; if (outstanding < sack_a_rwnd) sack_a_rwnd -= outstanding; else sack_a_rwnd = 0; asoc->peer.rwnd = sack_a_rwnd; asoc->stream.si->generate_ftsn(q, sack_ctsn); pr_debug("%s: sack cumulative tsn ack:0x%x\n", __func__, sack_ctsn); pr_debug("%s: cumulative tsn ack of assoc:%p is 0x%x, " "advertised peer ack point:0x%x\n", __func__, asoc, ctsn, asoc->adv_peer_ack_point); return sctp_outq_is_empty(q); } /* Is the outqueue empty? * The queue is empty when we have not pending data, no in-flight data * and nothing pending retransmissions. */ int sctp_outq_is_empty(const struct sctp_outq *q) { return q->out_qlen == 0 && q->outstanding_bytes == 0 && list_empty(&q->retransmit); } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Go through a transport's transmitted list or the association's retransmit * list and move chunks that are acked by the Cumulative TSN Ack to q->sacked. * The retransmit list will not have an associated transport. * * I added coherent debug information output. --xguo * * Instead of printing 'sacked' or 'kept' for each TSN on the * transmitted_queue, we print a range: SACKED: TSN1-TSN2, TSN3, TSN4-TSN5. * KEPT TSN6-TSN7, etc. */ static void sctp_check_transmitted(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, union sctp_addr *saddr, struct sctp_sackhdr *sack, __u32 *highest_new_tsn_in_sack) { struct list_head *lchunk; struct sctp_chunk *tchunk; struct list_head tlist; __u32 tsn; __u32 sack_ctsn; __u32 rtt; __u8 restart_timer = 0; int bytes_acked = 0; int migrate_bytes = 0; bool forward_progress = false; sack_ctsn = ntohl(sack->cum_tsn_ack); INIT_LIST_HEAD(&tlist); /* The while loop will skip empty transmitted queues. */ while (NULL != (lchunk = sctp_list_dequeue(transmitted_queue))) { tchunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); if (sctp_chunk_abandoned(tchunk)) { /* Move the chunk to abandoned list. */ sctp_insert_list(&q->abandoned, lchunk); /* If this chunk has not been acked, stop * considering it as 'outstanding'. */ if (transmitted_queue != &q->retransmit && !tchunk->tsn_gap_acked) { if (tchunk->transport) tchunk->transport->flight_size -= sctp_data_size(tchunk); q->outstanding_bytes -= sctp_data_size(tchunk); } continue; } tsn = ntohl(tchunk->subh.data_hdr->tsn); if (sctp_acked(sack, tsn)) { /* If this queue is the retransmit queue, the * retransmit timer has already reclaimed * the outstanding bytes for this chunk, so only * count bytes associated with a transport. */ if (transport && !tchunk->tsn_gap_acked) { /* If this chunk is being used for RTT * measurement, calculate the RTT and update * the RTO using this value. * * 6.3.1 C5) Karn's algorithm: RTT measurements * MUST NOT be made using packets that were * retransmitted (and thus for which it is * ambiguous whether the reply was for the * first instance of the packet or a later * instance). */ if (!sctp_chunk_retransmitted(tchunk) && tchunk->rtt_in_progress) { tchunk->rtt_in_progress = 0; rtt = jiffies - tchunk->sent_at; sctp_transport_update_rto(transport, rtt); } if (TSN_lte(tsn, sack_ctsn)) { /* * SFR-CACC algorithm: * 2) If the SACK contains gap acks * and the flag CHANGEOVER_ACTIVE is * set the receiver of the SACK MUST * take the following action: * * B) For each TSN t being acked that * has not been acked in any SACK so * far, set cacc_saw_newack to 1 for * the destination that the TSN was * sent to. */ if (sack->num_gap_ack_blocks && q->asoc->peer.primary_path->cacc. changeover_active) transport->cacc.cacc_saw_newack = 1; } } /* If the chunk hasn't been marked as ACKED, * mark it and account bytes_acked if the * chunk had a valid transport (it will not * have a transport if ASCONF had deleted it * while DATA was outstanding). */ if (!tchunk->tsn_gap_acked) { tchunk->tsn_gap_acked = 1; if (TSN_lt(*highest_new_tsn_in_sack, tsn)) *highest_new_tsn_in_sack = tsn; bytes_acked += sctp_data_size(tchunk); if (!tchunk->transport) migrate_bytes += sctp_data_size(tchunk); forward_progress = true; } if (TSN_lte(tsn, sack_ctsn)) { /* RFC 2960 6.3.2 Retransmission Timer Rules * * R3) Whenever a SACK is received * that acknowledges the DATA chunk * with the earliest outstanding TSN * for that address, restart T3-rtx * timer for that address with its * current RTO. */ restart_timer = 1; forward_progress = true; list_add_tail(&tchunk->transmitted_list, &q->sacked); } else { /* RFC2960 7.2.4, sctpimpguide-05 2.8.2 * M2) Each time a SACK arrives reporting * 'Stray DATA chunk(s)' record the highest TSN * reported as newly acknowledged, call this * value 'HighestTSNinSack'. A newly * acknowledged DATA chunk is one not * previously acknowledged in a SACK. * * When the SCTP sender of data receives a SACK * chunk that acknowledges, for the first time, * the receipt of a DATA chunk, all the still * unacknowledged DATA chunks whose TSN is * older than that newly acknowledged DATA * chunk, are qualified as 'Stray DATA chunks'. */ list_add_tail(lchunk, &tlist); } } else { if (tchunk->tsn_gap_acked) { pr_debug("%s: receiver reneged on data TSN:0x%x\n", __func__, tsn); tchunk->tsn_gap_acked = 0; if (tchunk->transport) bytes_acked -= sctp_data_size(tchunk); /* RFC 2960 6.3.2 Retransmission Timer Rules * * R4) Whenever a SACK is received missing a * TSN that was previously acknowledged via a * Gap Ack Block, start T3-rtx for the * destination address to which the DATA * chunk was originally * transmitted if it is not already running. */ restart_timer = 1; } list_add_tail(lchunk, &tlist); } } if (transport) { if (bytes_acked) { struct sctp_association *asoc = transport->asoc; /* We may have counted DATA that was migrated * to this transport due to DEL-IP operation. * Subtract those bytes, since the were never * send on this transport and shouldn't be * credited to this transport. */ bytes_acked -= migrate_bytes; /* 8.2. When an outstanding TSN is acknowledged, * the endpoint shall clear the error counter of * the destination transport address to which the * DATA chunk was last sent. * The association's overall error counter is * also cleared. */ transport->error_count = 0; transport->asoc->overall_error_count = 0; forward_progress = true; /* * While in SHUTDOWN PENDING, we may have started * the T5 shutdown guard timer after reaching the * retransmission limit. Stop that timer as soon * as the receiver acknowledged any data. */ if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING && del_timer(&asoc->timers [SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD])) sctp_association_put(asoc); /* Mark the destination transport address as * active if it is not so marked. */ if ((transport->state == SCTP_INACTIVE || transport->state == SCTP_UNCONFIRMED) && sctp_cmp_addr_exact(&transport->ipaddr, saddr)) { sctp_assoc_control_transport( transport->asoc, transport, SCTP_TRANSPORT_UP, SCTP_RECEIVED_SACK); } sctp_transport_raise_cwnd(transport, sack_ctsn, bytes_acked); transport->flight_size -= bytes_acked; if (transport->flight_size == 0) transport->partial_bytes_acked = 0; q->outstanding_bytes -= bytes_acked + migrate_bytes; } else { /* RFC 2960 6.1, sctpimpguide-06 2.15.2 * When a sender is doing zero window probing, it * should not timeout the association if it continues * to receive new packets from the receiver. The * reason is that the receiver MAY keep its window * closed for an indefinite time. * A sender is doing zero window probing when the * receiver's advertised window is zero, and there is * only one data chunk in flight to the receiver. * * Allow the association to timeout while in SHUTDOWN * PENDING or SHUTDOWN RECEIVED in case the receiver * stays in zero window mode forever. */ if (!q->asoc->peer.rwnd && !list_empty(&tlist) && (sack_ctsn+2 == q->asoc->next_tsn) && q->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) { pr_debug("%s: sack received for zero window " "probe:%u\n", __func__, sack_ctsn); q->asoc->overall_error_count = 0; transport->error_count = 0; } } /* RFC 2960 6.3.2 Retransmission Timer Rules * * R2) Whenever all outstanding data sent to an address have * been acknowledged, turn off the T3-rtx timer of that * address. */ if (!transport->flight_size) { if (del_timer(&transport->T3_rtx_timer)) sctp_transport_put(transport); } else if (restart_timer) { if (!mod_timer(&transport->T3_rtx_timer, jiffies + transport->rto)) sctp_transport_hold(transport); } if (forward_progress) { if (transport->dst) sctp_transport_dst_confirm(transport); } } list_splice(&tlist, transmitted_queue); } /* Mark chunks as missing and consequently may get retransmitted. */ static void sctp_mark_missing(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, __u32 highest_new_tsn_in_sack, int count_of_newacks) { struct sctp_chunk *chunk; __u32 tsn; char do_fast_retransmit = 0; struct sctp_association *asoc = q->asoc; struct sctp_transport *primary = asoc->peer.primary_path; list_for_each_entry(chunk, transmitted_queue, transmitted_list) { tsn = ntohl(chunk->subh.data_hdr->tsn); /* RFC 2960 7.2.4, sctpimpguide-05 2.8.2 M3) Examine all * 'Unacknowledged TSN's', if the TSN number of an * 'Unacknowledged TSN' is smaller than the 'HighestTSNinSack' * value, increment the 'TSN.Missing.Report' count on that * chunk if it has NOT been fast retransmitted or marked for * fast retransmit already. */ if (chunk->fast_retransmit == SCTP_CAN_FRTX && !chunk->tsn_gap_acked && TSN_lt(tsn, highest_new_tsn_in_sack)) { /* SFR-CACC may require us to skip marking * this chunk as missing. */ if (!transport || !sctp_cacc_skip(primary, chunk->transport, count_of_newacks, tsn)) { chunk->tsn_missing_report++; pr_debug("%s: tsn:0x%x missing counter:%d\n", __func__, tsn, chunk->tsn_missing_report); } } /* * M4) If any DATA chunk is found to have a * 'TSN.Missing.Report' * value larger than or equal to 3, mark that chunk for * retransmission and start the fast retransmit procedure. */ if (chunk->tsn_missing_report >= 3) { chunk->fast_retransmit = SCTP_NEED_FRTX; do_fast_retransmit = 1; } } if (transport) { if (do_fast_retransmit) sctp_retransmit(q, transport, SCTP_RTXR_FAST_RTX); pr_debug("%s: transport:%p, cwnd:%d, ssthresh:%d, " "flight_size:%d, pba:%d\n", __func__, transport, transport->cwnd, transport->ssthresh, transport->flight_size, transport->partial_bytes_acked); } } /* Is the given TSN acked by this packet? */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) { __u32 ctsn = ntohl(sack->cum_tsn_ack); union sctp_sack_variable *frags; __u16 tsn_offset, blocks; int i; if (TSN_lte(tsn, ctsn)) goto pass; /* 3.3.4 Selective Acknowledgment (SACK) (3): * * Gap Ack Blocks: * These fields contain the Gap Ack Blocks. They are repeated * for each Gap Ack Block up to the number of Gap Ack Blocks * defined in the Number of Gap Ack Blocks field. All DATA * chunks with TSNs greater than or equal to (Cumulative TSN * Ack + Gap Ack Block Start) and less than or equal to * (Cumulative TSN Ack + Gap Ack Block End) of each Gap Ack * Block are assumed to have been received correctly. */ frags = (union sctp_sack_variable *)(sack + 1); blocks = ntohs(sack->num_gap_ack_blocks); tsn_offset = tsn - ctsn; for (i = 0; i < blocks; ++i) { if (tsn_offset >= ntohs(frags[i].gab.start) && tsn_offset <= ntohs(frags[i].gab.end)) goto pass; } return 0; pass: return 1; } static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist, int nskips, __be16 stream) { int i; for (i = 0; i < nskips; i++) { if (skiplist[i].stream == stream) return i; } return i; } /* Create and add a fwdtsn chunk to the outq's control queue if needed. */ void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) { struct sctp_association *asoc = q->asoc; struct sctp_chunk *ftsn_chunk = NULL; struct sctp_fwdtsn_skip ftsn_skip_arr[10]; int nskips = 0; int skip_pos = 0; __u32 tsn; struct sctp_chunk *chunk; struct list_head *lchunk, *temp; if (!asoc->peer.prsctp_capable) return; /* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the * received SACK. * * If (Advanced.Peer.Ack.Point < SackCumAck), then update * Advanced.Peer.Ack.Point to be equal to SackCumAck. */ if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) asoc->adv_peer_ack_point = ctsn; /* PR-SCTP C2) Try to further advance the "Advanced.Peer.Ack.Point" * locally, that is, to move "Advanced.Peer.Ack.Point" up as long as * the chunk next in the out-queue space is marked as "abandoned" as * shown in the following example: * * Assuming that a SACK arrived with the Cumulative TSN ACK 102 * and the Advanced.Peer.Ack.Point is updated to this value: * * out-queue at the end of ==> out-queue after Adv.Ack.Point * normal SACK processing local advancement * ... ... * Adv.Ack.Pt-> 102 acked 102 acked * 103 abandoned 103 abandoned * 104 abandoned Adv.Ack.P-> 104 abandoned * 105 105 * 106 acked 106 acked * ... ... * * In this example, the data sender successfully advanced the * "Advanced.Peer.Ack.Point" from 102 to 104 locally. */ list_for_each_safe(lchunk, temp, &q->abandoned) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(chunk->subh.data_hdr->tsn); /* Remove any chunks in the abandoned queue that are acked by * the ctsn. */ if (TSN_lte(tsn, ctsn)) { list_del_init(lchunk); sctp_chunk_free(chunk); } else { if (TSN_lte(tsn, asoc->adv_peer_ack_point+1)) { asoc->adv_peer_ack_point = tsn; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) continue; skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips, chunk->subh.data_hdr->stream); ftsn_skip_arr[skip_pos].stream = chunk->subh.data_hdr->stream; ftsn_skip_arr[skip_pos].ssn = chunk->subh.data_hdr->ssn; if (skip_pos == nskips) nskips++; if (nskips == 10) break; } else break; } } /* PR-SCTP C3) If, after step C1 and C2, the "Advanced.Peer.Ack.Point" * is greater than the Cumulative TSN ACK carried in the received * SACK, the data sender MUST send the data receiver a FORWARD TSN * chunk containing the latest value of the * "Advanced.Peer.Ack.Point". * * C4) For each "abandoned" TSN the sender of the FORWARD TSN SHOULD * list each stream and sequence number in the forwarded TSN. This * information will enable the receiver to easily find any * stranded TSN's waiting on stream reorder queues. Each stream * SHOULD only be reported once; this means that if multiple * abandoned messages occur in the same stream then only the * highest abandoned stream sequence number is reported. If the * total size of the FORWARD TSN does NOT fit in a single MTU then * the sender of the FORWARD TSN SHOULD lower the * Advanced.Peer.Ack.Point to the last TSN that will fit in a * single MTU. */ if (asoc->adv_peer_ack_point > ctsn) ftsn_chunk = sctp_make_fwdtsn(asoc, asoc->adv_peer_ack_point, nskips, &ftsn_skip_arr[0]); if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } }
2979 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/dsa.h - Driver for Distributed Switch Architecture switch chips * Copyright (c) 2008-2009 Marvell Semiconductor */ #ifndef __LINUX_NET_DSA_H #define __LINUX_NET_DSA_H #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/of.h> #include <linux/ethtool.h> #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/platform_data/dsa.h> #include <linux/phylink.h> #include <net/devlink.h> #include <net/switchdev.h> struct dsa_8021q_context; struct tc_action; #define DSA_TAG_PROTO_NONE_VALUE 0 #define DSA_TAG_PROTO_BRCM_VALUE 1 #define DSA_TAG_PROTO_BRCM_PREPEND_VALUE 2 #define DSA_TAG_PROTO_DSA_VALUE 3 #define DSA_TAG_PROTO_EDSA_VALUE 4 #define DSA_TAG_PROTO_GSWIP_VALUE 5 #define DSA_TAG_PROTO_KSZ9477_VALUE 6 #define DSA_TAG_PROTO_KSZ9893_VALUE 7 #define DSA_TAG_PROTO_LAN9303_VALUE 8 #define DSA_TAG_PROTO_MTK_VALUE 9 #define DSA_TAG_PROTO_QCA_VALUE 10 #define DSA_TAG_PROTO_TRAILER_VALUE 11 #define DSA_TAG_PROTO_8021Q_VALUE 12 #define DSA_TAG_PROTO_SJA1105_VALUE 13 #define DSA_TAG_PROTO_KSZ8795_VALUE 14 #define DSA_TAG_PROTO_OCELOT_VALUE 15 #define DSA_TAG_PROTO_AR9331_VALUE 16 #define DSA_TAG_PROTO_RTL4_A_VALUE 17 #define DSA_TAG_PROTO_HELLCREEK_VALUE 18 #define DSA_TAG_PROTO_XRS700X_VALUE 19 #define DSA_TAG_PROTO_OCELOT_8021Q_VALUE 20 #define DSA_TAG_PROTO_SEVILLE_VALUE 21 #define DSA_TAG_PROTO_BRCM_LEGACY_VALUE 22 #define DSA_TAG_PROTO_SJA1110_VALUE 23 #define DSA_TAG_PROTO_RTL8_4_VALUE 24 #define DSA_TAG_PROTO_RTL8_4T_VALUE 25 #define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26 #define DSA_TAG_PROTO_LAN937X_VALUE 27 #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE, DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, DSA_TAG_PROTO_GSWIP = DSA_TAG_PROTO_GSWIP_VALUE, DSA_TAG_PROTO_KSZ9477 = DSA_TAG_PROTO_KSZ9477_VALUE, DSA_TAG_PROTO_KSZ9893 = DSA_TAG_PROTO_KSZ9893_VALUE, DSA_TAG_PROTO_LAN9303 = DSA_TAG_PROTO_LAN9303_VALUE, DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE, DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, DSA_TAG_PROTO_KSZ8795 = DSA_TAG_PROTO_KSZ8795_VALUE, DSA_TAG_PROTO_OCELOT = DSA_TAG_PROTO_OCELOT_VALUE, DSA_TAG_PROTO_AR9331 = DSA_TAG_PROTO_AR9331_VALUE, DSA_TAG_PROTO_RTL4_A = DSA_TAG_PROTO_RTL4_A_VALUE, DSA_TAG_PROTO_HELLCREEK = DSA_TAG_PROTO_HELLCREEK_VALUE, DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE, DSA_TAG_PROTO_OCELOT_8021Q = DSA_TAG_PROTO_OCELOT_8021Q_VALUE, DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE, DSA_TAG_PROTO_SJA1110 = DSA_TAG_PROTO_SJA1110_VALUE, DSA_TAG_PROTO_RTL8_4 = DSA_TAG_PROTO_RTL8_4_VALUE, DSA_TAG_PROTO_RTL8_4T = DSA_TAG_PROTO_RTL8_4T_VALUE, DSA_TAG_PROTO_RZN1_A5PSW = DSA_TAG_PROTO_RZN1_A5PSW_VALUE, DSA_TAG_PROTO_LAN937X = DSA_TAG_PROTO_LAN937X_VALUE, DSA_TAG_PROTO_VSC73XX_8021Q = DSA_TAG_PROTO_VSC73XX_8021Q_VALUE, }; struct dsa_switch; struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); int (*connect)(struct dsa_switch *ds); void (*disconnect)(struct dsa_switch *ds); unsigned int needed_headroom; unsigned int needed_tailroom; const char *name; enum dsa_tag_protocol proto; /* Some tagging protocols either mangle or shift the destination MAC * address, in which case the DSA conduit would drop packets on ingress * if what it understands out of the destination MAC address is not in * its RX filter. */ bool promisc_on_conduit; }; struct dsa_lag { struct net_device *dev; unsigned int id; struct mutex fdb_lock; struct list_head fdbs; refcount_t refcount; }; struct dsa_switch_tree { struct list_head list; /* List of switch ports */ struct list_head ports; /* Notifier chain for switch-wide events */ struct raw_notifier_head nh; /* Tree identifier */ unsigned int index; /* Number of switches attached to this tree */ struct kref refcount; /* Maps offloaded LAG netdevs to a zero-based linear ID for * drivers that need it. */ struct dsa_lag **lags; /* Tagging protocol operations */ const struct dsa_device_ops *tag_ops; /* Default tagging protocol preferred by the switches in this * tree. */ enum dsa_tag_protocol default_proto; /* Has this tree been applied to the hardware? */ bool setup; /* * Configuration data for the platform device that owns * this dsa switch tree instance. */ struct dsa_platform_data *pd; /* List of DSA links composing the routing table */ struct list_head rtable; /* Length of "lags" array */ unsigned int lags_len; /* Track the largest switch index within a tree */ unsigned int last_switch; }; /* LAG IDs are one-based, the dst->lags array is zero-based */ #define dsa_lags_foreach_id(_id, _dst) \ for ((_id) = 1; (_id) <= (_dst)->lags_len; (_id)++) \ if ((_dst)->lags[(_id) - 1]) #define dsa_lag_foreach_port(_dp, _dst, _lag) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_offloads_lag((_dp), (_lag))) #define dsa_hsr_foreach_port(_dp, _ds, _hsr) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr)) static inline struct dsa_lag *dsa_lag_by_id(struct dsa_switch_tree *dst, unsigned int id) { /* DSA LAG IDs are one-based, dst->lags is zero-based */ return dst->lags[id - 1]; } static inline int dsa_lag_id(struct dsa_switch_tree *dst, struct net_device *lag_dev) { unsigned int id; dsa_lags_foreach_id(id, dst) { struct dsa_lag *lag = dsa_lag_by_id(dst, id); if (lag->dev == lag_dev) return lag->id; } return -ENODEV; } /* TC matchall action types */ enum dsa_port_mall_action_type { DSA_PORT_MALL_MIRROR, DSA_PORT_MALL_POLICER, }; /* TC mirroring entry */ struct dsa_mall_mirror_tc_entry { u8 to_local_port; bool ingress; }; /* TC port policer entry */ struct dsa_mall_policer_tc_entry { u32 burst; u64 rate_bytes_per_sec; }; /* TC matchall entry */ struct dsa_mall_tc_entry { struct list_head list; unsigned long cookie; enum dsa_port_mall_action_type type; union { struct dsa_mall_mirror_tc_entry mirror; struct dsa_mall_policer_tc_entry policer; }; }; struct dsa_bridge { struct net_device *dev; unsigned int num; bool tx_fwd_offload; refcount_t refcount; }; struct dsa_port { /* A CPU port is physically connected to a conduit device. A user port * exposes a network device to user-space, called 'user' here. */ union { struct net_device *conduit; struct net_device *user; }; /* Copy of the tagging protocol operations, for quicker access * in the data path. Valid only for the CPU ports. */ const struct dsa_device_ops *tag_ops; /* Copies for faster access in conduit receive hot path */ struct dsa_switch_tree *dst; struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); struct dsa_switch *ds; unsigned int index; enum { DSA_PORT_TYPE_UNUSED = 0, DSA_PORT_TYPE_CPU, DSA_PORT_TYPE_DSA, DSA_PORT_TYPE_USER, } type; const char *name; struct dsa_port *cpu_dp; u8 mac[ETH_ALEN]; u8 stp_state; /* Warning: the following bit fields are not atomic, and updating them * can only be done from code paths where concurrency is not possible * (probe time or under rtnl_lock). */ u8 vlan_filtering:1; /* Managed by DSA on user ports and by drivers on CPU and DSA ports */ u8 learning:1; u8 lag_tx_enabled:1; /* conduit state bits, valid only on CPU ports */ u8 conduit_admin_up:1; u8 conduit_oper_up:1; /* Valid only on user ports */ u8 cpu_port_in_lag:1; u8 setup:1; struct device_node *dn; unsigned int ageing_time; struct dsa_bridge *bridge; struct devlink_port devlink_port; struct phylink *pl; struct phylink_config pl_config; struct dsa_lag *lag; struct net_device *hsr_dev; struct list_head list; /* * Original copy of the conduit netdev ethtool_ops */ const struct ethtool_ops *orig_ethtool_ops; /* List of MAC addresses that must be forwarded on this port. * These are only valid on CPU ports and DSA links. */ struct mutex addr_lists_lock; struct list_head fdbs; struct list_head mdbs; struct mutex vlans_lock; union { /* List of VLANs that CPU and DSA ports are members of. * Access to this is serialized by the sleepable @vlans_lock. */ struct list_head vlans; /* List of VLANs that user ports are members of. * Access to this is serialized by netif_addr_lock_bh(). */ struct list_head user_vlans; }; }; static inline struct dsa_port * dsa_phylink_to_port(struct phylink_config *config) { return container_of(config, struct dsa_port, pl_config); } /* TODO: ideally DSA ports would have a single dp->link_dp member, * and no dst->rtable nor this struct dsa_link would be needed, * but this would require some more complex tree walking, * so keep it stupid at the moment and list them all. */ struct dsa_link { struct dsa_port *dp; struct dsa_port *link_dp; struct list_head list; }; enum dsa_db_type { DSA_DB_PORT, DSA_DB_LAG, DSA_DB_BRIDGE, }; struct dsa_db { enum dsa_db_type type; union { const struct dsa_port *dp; struct dsa_lag lag; struct dsa_bridge bridge; }; }; struct dsa_mac_addr { unsigned char addr[ETH_ALEN]; u16 vid; refcount_t refcount; struct list_head list; struct dsa_db db; }; struct dsa_vlan { u16 vid; refcount_t refcount; struct list_head list; }; struct dsa_switch { struct device *dev; /* * Parent switch tree, and switch index. */ struct dsa_switch_tree *dst; unsigned int index; /* Warning: the following bit fields are not atomic, and updating them * can only be done from code paths where concurrency is not possible * (probe time or under rtnl_lock). */ u32 setup:1; /* Disallow bridge core from requesting different VLAN awareness * settings on ports if not hardware-supported */ u32 vlan_filtering_is_global:1; /* Keep VLAN filtering enabled on ports not offloading any upper */ u32 needs_standalone_vlan_filtering:1; /* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges * that have vlan_filtering=0. All drivers should ideally set this (and * then the option would get removed), but it is unknown whether this * would break things or not. */ u32 configure_vlan_while_not_filtering:1; /* Pop the default_pvid of VLAN-unaware bridge ports from tagged frames. * DEPRECATED: Do NOT set this field in new drivers. Instead look at * the dsa_software_vlan_untag() comments. */ u32 untag_bridge_pvid:1; /* Pop the default_pvid of VLAN-aware bridge ports from tagged frames. * Useful if the switch cannot preserve the VLAN tag as seen on the * wire for user port ingress, and chooses to send all frames as * VLAN-tagged to the CPU, including those which were originally * untagged. */ u32 untag_vlan_aware_bridge_pvid:1; /* Let DSA manage the FDB entries towards the * CPU, based on the software bridge database. */ u32 assisted_learning_on_cpu_port:1; /* In case vlan_filtering_is_global is set, the VLAN awareness state * should be retrieved from here and not from the per-port settings. */ u32 vlan_filtering:1; /* For switches that only have the MRU configurable. To ensure the * configured MTU is not exceeded, normalization of MRU on all bridged * interfaces is needed. */ u32 mtu_enforcement_ingress:1; /* Drivers that isolate the FDBs of multiple bridges must set this * to true to receive the bridge as an argument in .port_fdb_{add,del} * and .port_mdb_{add,del}. Otherwise, the bridge.num will always be * passed as zero. */ u32 fdb_isolation:1; /* Drivers that have global DSCP mapping settings must set this to * true to automatically apply the settings to all ports. */ u32 dscp_prio_mapping_is_global:1; /* Listener for switch fabric events */ struct notifier_block nb; /* * Give the switch driver somewhere to hang its private data * structure. */ void *priv; void *tagger_data; /* * Configuration data for this switch. */ struct dsa_chip_data *cd; /* * The switch operations. */ const struct dsa_switch_ops *ops; /* * Allow a DSA switch driver to override the phylink MAC ops */ const struct phylink_mac_ops *phylink_mac_ops; /* * User mii_bus and devices for the individual ports. */ u32 phys_mii_mask; struct mii_bus *user_mii_bus; /* Ageing Time limits in msecs */ unsigned int ageing_time_min; unsigned int ageing_time_max; /* Storage for drivers using tag_8021q */ struct dsa_8021q_context *tag_8021q_ctx; /* devlink used to represent this switch device */ struct devlink *devlink; /* Number of switch port queues */ unsigned int num_tx_queues; /* Drivers that benefit from having an ID associated with each * offloaded LAG should set this to the maximum number of * supported IDs. DSA will then maintain a mapping of _at * least_ these many IDs, accessible to drivers via * dsa_lag_id(). */ unsigned int num_lag_ids; /* Drivers that support bridge forwarding offload or FDB isolation * should set this to the maximum number of bridges spanning the same * switch tree (or all trees, in the case of cross-tree bridging * support) that can be offloaded. */ unsigned int max_num_bridges; unsigned int num_ports; }; static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { struct dsa_switch_tree *dst = ds->dst; struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dp->ds == ds && dp->index == p) return dp; return NULL; } static inline bool dsa_port_is_dsa(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_port_is_cpu(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_port_is_user(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_USER; } static inline bool dsa_port_is_unused(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_port_conduit_is_operational(struct dsa_port *dp) { return dsa_port_is_cpu(dp) && dp->conduit_admin_up && dp->conduit_oper_up; } static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_USER; } #define dsa_tree_for_each_user_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) #define dsa_tree_for_each_user_port_continue_reverse(_dp, _dst) \ list_for_each_entry_continue_reverse((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) #define dsa_tree_for_each_cpu_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_cpu((_dp))) #define dsa_switch_for_each_port(_dp, _ds) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_safe(_dp, _next, _ds) \ list_for_each_entry_safe((_dp), (_next), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_continue_reverse(_dp, _ds) \ list_for_each_entry_continue_reverse((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_available_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (!dsa_port_is_unused((_dp))) #define dsa_switch_for_each_user_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_user_port_continue_reverse(_dp, _ds) \ dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_cpu_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) #define dsa_switch_for_each_cpu_port_continue_reverse(_dp, _ds) \ dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) static inline u32 dsa_user_ports(struct dsa_switch *ds) { struct dsa_port *dp; u32 mask = 0; dsa_switch_for_each_user_port(dp, ds) mask |= BIT(dp->index); return mask; } static inline u32 dsa_cpu_ports(struct dsa_switch *ds) { struct dsa_port *cpu_dp; u32 mask = 0; dsa_switch_for_each_cpu_port(cpu_dp, ds) mask |= BIT(cpu_dp->index); return mask; } /* Return the local port used to reach an arbitrary switch device */ static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device) { struct dsa_switch_tree *dst = ds->dst; struct dsa_link *dl; list_for_each_entry(dl, &dst->rtable, list) if (dl->dp->ds == ds && dl->link_dp->ds->index == device) return dl->dp->index; return ds->num_ports; } /* Return the local port used to reach an arbitrary switch port */ static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device, int port) { if (device == ds->index) return port; else return dsa_routing_port(ds, device); } /* Return the local port used to reach the dedicated CPU port */ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port) { const struct dsa_port *dp = dsa_to_port(ds, port); const struct dsa_port *cpu_dp = dp->cpu_dp; if (!cpu_dp) return port; return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index); } /* Return true if this is the local port used to reach the CPU port */ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port) { if (dsa_is_unused_port(ds, port)) return false; return port == dsa_upstream_port(ds, port); } /* Return true if this is a DSA port leading away from the CPU */ static inline bool dsa_is_downstream_port(struct dsa_switch *ds, int port) { return dsa_is_dsa_port(ds, port) && !dsa_is_upstream_port(ds, port); } /* Return the local port used to reach the CPU port */ static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds) { struct dsa_port *dp; dsa_switch_for_each_available_port(dp, ds) { return dsa_upstream_port(ds, dp->index); } return ds->num_ports; } /* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning * that the routing port from @downstream_ds to @upstream_ds is also the port * which @downstream_ds uses to reach its dedicated CPU. */ static inline bool dsa_switch_is_upstream_of(struct dsa_switch *upstream_ds, struct dsa_switch *downstream_ds) { int routing_port; if (upstream_ds == downstream_ds) return true; routing_port = dsa_routing_port(downstream_ds, upstream_ds->index); return dsa_is_upstream_port(downstream_ds, routing_port); } static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) { const struct dsa_switch *ds = dp->ds; if (ds->vlan_filtering_is_global) return ds->vlan_filtering; else return dp->vlan_filtering; } static inline unsigned int dsa_port_lag_id_get(struct dsa_port *dp) { return dp->lag ? dp->lag->id : 0; } static inline struct net_device *dsa_port_lag_dev_get(struct dsa_port *dp) { return dp->lag ? dp->lag->dev : NULL; } static inline bool dsa_port_offloads_lag(struct dsa_port *dp, const struct dsa_lag *lag) { return dsa_port_lag_dev_get(dp) == lag->dev; } static inline struct net_device *dsa_port_to_conduit(const struct dsa_port *dp) { if (dp->cpu_port_in_lag) return dsa_port_lag_dev_get(dp->cpu_dp); return dp->cpu_dp->conduit; } static inline struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp) { if (!dp->bridge) return NULL; if (dp->lag) return dp->lag->dev; else if (dp->hsr_dev) return dp->hsr_dev; return dp->user; } static inline struct net_device * dsa_port_bridge_dev_get(const struct dsa_port *dp) { return dp->bridge ? dp->bridge->dev : NULL; } static inline unsigned int dsa_port_bridge_num_get(struct dsa_port *dp) { return dp->bridge ? dp->bridge->num : 0; } static inline bool dsa_port_bridge_same(const struct dsa_port *a, const struct dsa_port *b) { struct net_device *br_a = dsa_port_bridge_dev_get(a); struct net_device *br_b = dsa_port_bridge_dev_get(b); /* Standalone ports are not in the same bridge with one another */ return (!br_a || !br_b) ? false : (br_a == br_b); } static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp, const struct net_device *dev) { return dsa_port_to_bridge_port(dp) == dev; } static inline bool dsa_port_offloads_bridge_dev(struct dsa_port *dp, const struct net_device *bridge_dev) { /* DSA ports connected to a bridge, and event was emitted * for the bridge. */ return dsa_port_bridge_dev_get(dp) == bridge_dev; } static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, const struct dsa_bridge *bridge) { return dsa_port_bridge_dev_get(dp) == bridge->dev; } /* Returns true if any port of this tree offloads the given net_device */ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, const struct net_device *dev) { struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dsa_port_offloads_bridge_port(dp, dev)) return true; return false; } /* Returns true if any port of this tree offloads the given bridge */ static inline bool dsa_tree_offloads_bridge_dev(struct dsa_switch_tree *dst, const struct net_device *bridge_dev) { struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dsa_port_offloads_bridge_dev(dp, bridge_dev)) return true; return false; } static inline bool dsa_port_tree_same(const struct dsa_port *a, const struct dsa_port *b) { return a->ds->dst == b->ds->dst; } typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { /* * Tagging protocol helpers called for the CPU ports and DSA links. * @get_tag_protocol retrieves the initial tagging protocol and is * mandatory. Switches which can operate using multiple tagging * protocols should implement @change_tag_protocol and report in * @get_tag_protocol the tagger in current use. */ enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, int port, enum dsa_tag_protocol mprot); int (*change_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); /* * Method for switch drivers to connect to the tagging protocol driver * in current use. The switch driver can provide handlers for certain * types of packets for switch management. */ int (*connect_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); int (*port_change_conduit)(struct dsa_switch *ds, int port, struct net_device *conduit, struct netlink_ext_ack *extack); /* Optional switch-wide initialization and destruction methods */ int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); /* Per-port initialization and destruction methods. Mandatory if the * driver registers devlink port regions, optional otherwise. */ int (*port_setup)(struct dsa_switch *ds, int port); void (*port_teardown)(struct dsa_switch *ds, int port); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* * Access to the switch's PHY registers. */ int (*phy_read)(struct dsa_switch *ds, int port, int regnum); int (*phy_write)(struct dsa_switch *ds, int port, int regnum, u16 val); /* * PHYLINK integration */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, struct phylink_config *config); void (*phylink_fixed_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); /* * Port statistics counters. */ void (*get_strings)(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data); void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); int (*get_sset_count)(struct dsa_switch *ds, int port, int sset); void (*get_ethtool_phy_stats)(struct dsa_switch *ds, int port, uint64_t *data); void (*get_eth_phy_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_phy_stats *phy_stats); void (*get_eth_mac_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_mac_stats *mac_stats); void (*get_eth_ctrl_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_ctrl_stats *ctrl_stats); void (*get_rmon_stats)(struct dsa_switch *ds, int port, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges); void (*get_stats64)(struct dsa_switch *ds, int port, struct rtnl_link_stats64 *s); void (*get_pause_stats)(struct dsa_switch *ds, int port, struct ethtool_pause_stats *pause_stats); void (*self_test)(struct dsa_switch *ds, int port, struct ethtool_test *etest, u64 *data); /* * ethtool Wake-on-LAN */ void (*get_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); int (*set_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); /* * ethtool timestamp info */ int (*get_ts_info)(struct dsa_switch *ds, int port, struct kernel_ethtool_ts_info *ts); /* * ethtool MAC merge layer */ int (*get_mm)(struct dsa_switch *ds, int port, struct ethtool_mm_state *state); int (*set_mm)(struct dsa_switch *ds, int port, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack); void (*get_mm_stats)(struct dsa_switch *ds, int port, struct ethtool_mm_stats *stats); /* * DCB ops */ int (*port_get_default_prio)(struct dsa_switch *ds, int port); int (*port_set_default_prio)(struct dsa_switch *ds, int port, u8 prio); int (*port_get_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp); int (*port_add_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); int (*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); int (*port_set_apptrust)(struct dsa_switch *ds, int port, const u8 *sel, int nsel); int (*port_get_apptrust)(struct dsa_switch *ds, int port, u8 *sel, int *nsel); /* * Suspend and resume */ int (*suspend)(struct dsa_switch *ds); int (*resume)(struct dsa_switch *ds); /* * Port enable/disable */ int (*port_enable)(struct dsa_switch *ds, int port, struct phy_device *phy); void (*port_disable)(struct dsa_switch *ds, int port); /* * Notification for MAC address changes on user ports. Drivers can * currently only veto operations. They should not use the method to * program the hardware, since the operation is not rolled back in case * of other errors. */ int (*port_set_mac_address)(struct dsa_switch *ds, int port, const unsigned char *addr); /* * Compatibility between device trees defining multiple CPU ports and * drivers which are not OK to use by default the numerically smallest * CPU port of a switch for its local ports. This can return NULL, * meaning "don't know/don't care". */ struct dsa_port *(*preferred_default_local_cpu_port)(struct dsa_switch *ds); /* * Port's MAC EEE settings */ int (*set_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_keee *e); int (*get_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_keee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); int (*get_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); int (*set_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); /* * Register access. */ int (*get_regs_len)(struct dsa_switch *ds, int port); void (*get_regs)(struct dsa_switch *ds, int port, struct ethtool_regs *regs, void *p); /* * Upper device tracking. */ int (*port_prechangeupper)(struct dsa_switch *ds, int port, struct netdev_notifier_changeupper_info *info); /* * Bridge integration */ int (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs); int (*port_bridge_join)(struct dsa_switch *ds, int port, struct dsa_bridge bridge, bool *tx_fwd_offload, struct netlink_ext_ack *extack); void (*port_bridge_leave)(struct dsa_switch *ds, int port, struct dsa_bridge bridge); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); int (*port_mst_state_set)(struct dsa_switch *ds, int port, const struct switchdev_mst_state *state); void (*port_fast_age)(struct dsa_switch *ds, int port); int (*port_vlan_fast_age)(struct dsa_switch *ds, int port, u16 vid); int (*port_pre_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); int (*port_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); void (*port_set_host_flood)(struct dsa_switch *ds, int port, bool uc, bool mc); /* * VLAN support */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, bool vlan_filtering, struct netlink_ext_ack *extack); int (*port_vlan_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); int (*vlan_msti_set)(struct dsa_switch *ds, struct dsa_bridge bridge, const struct switchdev_vlan_msti *msti); /* * Forwarding database */ int (*port_fdb_add)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); int (*port_fdb_del)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); int (*port_fdb_dump)(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); int (*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag, const unsigned char *addr, u16 vid, struct dsa_db db); int (*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag, const unsigned char *addr, u16 vid, struct dsa_db db); /* * Multicast database */ int (*port_mdb_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); int (*port_mdb_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); /* * RXNFC */ int (*get_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc, u32 *rule_locs); int (*set_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc); /* * TC integration */ int (*cls_flower_add)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_del)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_stats)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*port_mirror_add)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress, struct netlink_ext_ack *extack); void (*port_mirror_del)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); int (*port_policer_add)(struct dsa_switch *ds, int port, struct dsa_mall_policer_tc_entry *policer); void (*port_policer_del)(struct dsa_switch *ds, int port); int (*port_setup_tc)(struct dsa_switch *ds, int port, enum tc_setup_type type, void *type_data); /* * Cross-chip operations */ int (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge, struct netlink_ext_ack *extack); void (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge); int (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index, int port); int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag, struct netdev_lag_upper_info *info, struct netlink_ext_ack *extack); int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag); /* * PTP functionality */ int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, struct ifreq *ifr); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, struct ifreq *ifr); void (*port_txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); /* Devlink parameters, etc */ int (*devlink_param_get)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_param_set)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_info_get)(struct dsa_switch *ds, struct devlink_info_req *req, struct netlink_ext_ack *extack); int (*devlink_sb_pool_get)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, struct devlink_sb_pool_info *pool_info); int (*devlink_sb_pool_set)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, u32 size, enum devlink_sb_threshold_type threshold_type, struct netlink_ext_ack *extack); int (*devlink_sb_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_threshold); int (*devlink_sb_port_pool_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_tc_pool_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 *p_pool_index, u32 *p_threshold); int (*devlink_sb_tc_pool_bind_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_occ_snapshot)(struct dsa_switch *ds, unsigned int sb_index); int (*devlink_sb_occ_max_clear)(struct dsa_switch *ds, unsigned int sb_index); int (*devlink_sb_occ_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_cur, u32 *p_max); int (*devlink_sb_occ_tc_port_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u32 *p_cur, u32 *p_max); /* * MTU change functionality. Switches can also adjust their MRU through * this method. By MTU, one understands the SDU (L2 payload) length. * If the switch needs to account for the DSA tag on the CPU port, this * method needs to do so privately. */ int (*port_change_mtu)(struct dsa_switch *ds, int port, int new_mtu); int (*port_max_mtu)(struct dsa_switch *ds, int port); /* * LAG integration */ int (*port_lag_change)(struct dsa_switch *ds, int port); int (*port_lag_join)(struct dsa_switch *ds, int port, struct dsa_lag lag, struct netdev_lag_upper_info *info, struct netlink_ext_ack *extack); int (*port_lag_leave)(struct dsa_switch *ds, int port, struct dsa_lag lag); /* * HSR integration */ int (*port_hsr_join)(struct dsa_switch *ds, int port, struct net_device *hsr, struct netlink_ext_ack *extack); int (*port_hsr_leave)(struct dsa_switch *ds, int port, struct net_device *hsr); /* * MRP integration */ int (*port_mrp_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_add_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); /* * tag_8021q operations */ int (*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid, u16 flags); int (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid); /* * DSA conduit tracking operations */ void (*conduit_state_change)(struct dsa_switch *ds, const struct net_device *conduit, bool operational); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes, \ dsa_devlink_param_get, dsa_devlink_param_set, NULL) int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx); int dsa_devlink_param_set(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack); int dsa_devlink_params_register(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); void dsa_devlink_params_unregister(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); int dsa_devlink_resource_register(struct dsa_switch *ds, const char *resource_name, u64 resource_size, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params); void dsa_devlink_resources_unregister(struct dsa_switch *ds); void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv); void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, u64 resource_id); struct devlink_region * dsa_devlink_region_create(struct dsa_switch *ds, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size); struct devlink_region * dsa_devlink_port_region_create(struct dsa_switch *ds, int port, const struct devlink_port_region_ops *ops, u32 region_max_snapshots, u64 region_size); void dsa_devlink_region_destroy(struct devlink_region *region); struct dsa_port *dsa_port_from_netdev(struct net_device *netdev); struct dsa_devlink_priv { struct dsa_switch *ds; }; static inline struct dsa_switch *dsa_devlink_to_ds(struct devlink *dl) { struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline struct dsa_switch *dsa_devlink_port_to_ds(struct devlink_port *port) { struct devlink *dl = port->devlink; struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline int dsa_devlink_port_to_port(struct devlink_port *port) { return port->index; } struct dsa_switch_driver { struct list_head list; const struct dsa_switch_ops *ops; }; bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) { #if IS_ENABLED(CONFIG_NET_DSA) return dev->dsa_ptr && dev->dsa_ptr->rcv; #endif return false; } /* All DSA tags that push the EtherType to the right (basically all except tail * tags, which don't break dissection) can be treated the same from the * perspective of the flow dissector. * * We need to return: * - offset: the (B - A) difference between: * A. the position of the real EtherType and * B. the current skb->data (aka ETH_HLEN bytes into the frame, aka 2 bytes * after the normal EtherType was supposed to be) * The offset in bytes is exactly equal to the tagger overhead (and half of * that, in __be16 shorts). * * - proto: the value of the real EtherType. */ static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb, __be16 *proto, int *offset) { #if IS_ENABLED(CONFIG_NET_DSA) const struct dsa_device_ops *ops = skb->dev->dsa_ptr->tag_ops; int tag_len = ops->needed_headroom; *offset = tag_len; *proto = ((__be16 *)skb->data)[(tag_len / 2) - 1]; #endif } void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); void dsa_flush_workqueue(void); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); #else static inline int dsa_switch_suspend(struct dsa_switch *ds) { return 0; } static inline int dsa_switch_resume(struct dsa_switch *ds) { return 0; } #endif /* CONFIG_PM_SLEEP */ #if IS_ENABLED(CONFIG_NET_DSA) bool dsa_user_dev_check(const struct net_device *dev); #else static inline bool dsa_user_dev_check(const struct net_device *dev) { return false; } #endif netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); #endif
296 278 9 57 121 18 488 6 188 188 172 41 41 1 1 6 6 6 17 114 370 61 110 10 1 1 2 1 575 23 575 124 52 414 24 390 266 265 230 126 56 1 8 2 162 235 27 110 37 425 41 412 2 62 338 390 51 337 574 278 296 299 7 288 244 132 114 63 51 26 31 20 38 44 31 6 35 2 27 337 295 272 407 179 182 487 483 485 482 482 487 487 482 485 487 488 2 2 2329 2324 244 122 28 47 54 15 121 8 32 314 64 60 293 30 21 210 315 147 119 119 21 212 211 108 108 22 17 100 100 99 42 44 12 2 113 51 11 1 10 4 4 22 22 11 4 6 12 77 1 1 1 1 1 1 71 1 15 15 9 3 6 11 4 7 9 3 6 199 1 1 1 34 38 23 23 154 4 139 139 2 109 64 57 58 15 57 21 1 111 55 55 3 41 38 41 23 23 6 35 60 60 2 46 23 46 12 11 6 41 11 42 6 1 43 90 42 1 1 1 39 34 26 4 1 2 1 71 1 1 1 1 1 1 1 64 61 62 4 2 46 17 5 42 40 3 1 32 4 28 20 6 16 9 1 8 166 1 159 30 143 114 101 26 2 4 2 2 129 128 22 22 10 12 76 89 3 2 50 29 30 34 16 75 70 189 189 18 183 33 31 2 1 1 1 1 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 6 5 7 7 5 1 2 2 2 5 8 7 2 9 8 2 18 19 10 19 18 4 1 4 6 4 3 3 59 1 1 53 27 44 25 4 19 16 58 44 1 1 36 23 25 23 1 16 13 123 1 1 1 121 61 22 34 7 35 35 12 16 13 6 37 1 36 2 1 29 24 24 15 74 1 1 1 2 70 2 2 60 31 59 1 1 1 1 56 22 7 7 7 7 18 8 5 13 9 4 1 1 1 9 43 19 1 1 14 3 5 1 4 9 29 1 29 1 1 1 1 25 19 19 18 12 16 5 4 1 2 1 2 6 5 55 43 2 39 20 7 23 28 42 2 25 45 3 31 26 23 31 12 10 24 3 18 4 3 9 574 572 409 586 182 408 185 82 63 17 111 111 15 312 316 315 315 123 32 185 251 384 382 290 244 98 585 1 3 586 384 597 1 3 589 604 608 607 2 2 1 2 597 658 20 3 628 7 2 630 11 6 6 606 8 188 188 188 188 187 186 187 191 188 189 191 72 14 62 9 71 72 188 662 1 5 660 66 8 400 4 187 6 182 478 5 486 121 122 3 3 3 3 3 3 3 2 1 2 667 672 491 182 180 1 258 160 101 110 21 1 2 4 9 5 12 14 14 14 14 12 1 1 17 101 1 1 9 7 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018 Facebook */ #include <uapi/linux/btf.h> #include <uapi/linux/bpf.h> #include <uapi/linux/bpf_perf_event.h> #include <uapi/linux/types.h> #include <linux/seq_file.h> #include <linux/compiler.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/anon_inodes.h> #include <linux/file.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/idr.h> #include <linux/sort.h> #include <linux/bpf_verifier.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/bpf.h> #include <linux/bpf_lsm.h> #include <linux/skmsg.h> #include <linux/perf_event.h> #include <linux/bsearch.h> #include <linux/kobject.h> #include <linux/sysfs.h> #include <net/netfilter/nf_bpf_link.h> #include <net/sock.h> #include <net/xdp.h> #include "../tools/lib/bpf/relo_core.h" /* BTF (BPF Type Format) is the meta data format which describes * the data types of BPF program/map. Hence, it basically focus * on the C programming language which the modern BPF is primary * using. * * ELF Section: * ~~~~~~~~~~~ * The BTF data is stored under the ".BTF" ELF section * * struct btf_type: * ~~~~~~~~~~~~~~~ * Each 'struct btf_type' object describes a C data type. * Depending on the type it is describing, a 'struct btf_type' * object may be followed by more data. F.e. * To describe an array, 'struct btf_type' is followed by * 'struct btf_array'. * * 'struct btf_type' and any extra data following it are * 4 bytes aligned. * * Type section: * ~~~~~~~~~~~~~ * The BTF type section contains a list of 'struct btf_type' objects. * Each one describes a C type. Recall from the above section * that a 'struct btf_type' object could be immediately followed by extra * data in order to describe some particular C types. * * type_id: * ~~~~~~~ * Each btf_type object is identified by a type_id. The type_id * is implicitly implied by the location of the btf_type object in * the BTF type section. The first one has type_id 1. The second * one has type_id 2...etc. Hence, an earlier btf_type has * a smaller type_id. * * A btf_type object may refer to another btf_type object by using * type_id (i.e. the "type" in the "struct btf_type"). * * NOTE that we cannot assume any reference-order. * A btf_type object can refer to an earlier btf_type object * but it can also refer to a later btf_type object. * * For example, to describe "const void *". A btf_type * object describing "const" may refer to another btf_type * object describing "void *". This type-reference is done * by specifying type_id: * * [1] CONST (anon) type_id=2 * [2] PTR (anon) type_id=0 * * The above is the btf_verifier debug log: * - Each line started with "[?]" is a btf_type object * - [?] is the type_id of the btf_type object. * - CONST/PTR is the BTF_KIND_XXX * - "(anon)" is the name of the type. It just * happens that CONST and PTR has no name. * - type_id=XXX is the 'u32 type' in btf_type * * NOTE: "void" has type_id 0 * * String section: * ~~~~~~~~~~~~~~ * The BTF string section contains the names used by the type section. * Each string is referred by an "offset" from the beginning of the * string section. * * Each string is '\0' terminated. * * The first character in the string section must be '\0' * which is used to mean 'anonymous'. Some btf_type may not * have a name. */ /* BTF verification: * * To verify BTF data, two passes are needed. * * Pass #1 * ~~~~~~~ * The first pass is to collect all btf_type objects to * an array: "btf->types". * * Depending on the C type that a btf_type is describing, * a btf_type may be followed by extra data. We don't know * how many btf_type is there, and more importantly we don't * know where each btf_type is located in the type section. * * Without knowing the location of each type_id, most verifications * cannot be done. e.g. an earlier btf_type may refer to a later * btf_type (recall the "const void *" above), so we cannot * check this type-reference in the first pass. * * In the first pass, it still does some verifications (e.g. * checking the name is a valid offset to the string section). * * Pass #2 * ~~~~~~~ * The main focus is to resolve a btf_type that is referring * to another type. * * We have to ensure the referring type: * 1) does exist in the BTF (i.e. in btf->types[]) * 2) does not cause a loop: * struct A { * struct B b; * }; * * struct B { * struct A a; * }; * * btf_type_needs_resolve() decides if a btf_type needs * to be resolved. * * The needs_resolve type implements the "resolve()" ops which * essentially does a DFS and detects backedge. * * During resolve (or DFS), different C types have different * "RESOLVED" conditions. * * When resolving a BTF_KIND_STRUCT, we need to resolve all its * members because a member is always referring to another * type. A struct's member can be treated as "RESOLVED" if * it is referring to a BTF_KIND_PTR. Otherwise, the * following valid C struct would be rejected: * * struct A { * int m; * struct A *a; * }; * * When resolving a BTF_KIND_PTR, it needs to keep resolving if * it is referring to another BTF_KIND_PTR. Otherwise, we cannot * detect a pointer loop, e.g.: * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR + * ^ | * +-----------------------------------------+ * */ #define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2) #define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) #define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) #define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) #define BTF_INFO_MASK 0x9f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) /* 16MB for 64k structs and each has 16 members and * a few MB spaces for the string section. * The hard limit is S32_MAX. */ #define BTF_MAX_SIZE (16 * 1024 * 1024) #define for_each_member_from(i, from, struct_type, member) \ for (i = from, member = btf_type_member(struct_type) + from; \ i < btf_type_vlen(struct_type); \ i++, member++) #define for_each_vsi_from(i, from, struct_type, member) \ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ i < btf_type_vlen(struct_type); \ i++, member++) DEFINE_IDR(btf_idr); DEFINE_SPINLOCK(btf_idr_lock); enum btf_kfunc_hook { BTF_KFUNC_HOOK_COMMON, BTF_KFUNC_HOOK_XDP, BTF_KFUNC_HOOK_TC, BTF_KFUNC_HOOK_STRUCT_OPS, BTF_KFUNC_HOOK_TRACING, BTF_KFUNC_HOOK_SYSCALL, BTF_KFUNC_HOOK_FMODRET, BTF_KFUNC_HOOK_CGROUP, BTF_KFUNC_HOOK_SCHED_ACT, BTF_KFUNC_HOOK_SK_SKB, BTF_KFUNC_HOOK_SOCKET_FILTER, BTF_KFUNC_HOOK_LWT, BTF_KFUNC_HOOK_NETFILTER, BTF_KFUNC_HOOK_KPROBE, BTF_KFUNC_HOOK_MAX, }; enum { BTF_KFUNC_SET_MAX_CNT = 256, BTF_DTOR_KFUNC_MAX_CNT = 256, BTF_KFUNC_FILTER_MAX_CNT = 16, }; struct btf_kfunc_hook_filter { btf_kfunc_filter_t filters[BTF_KFUNC_FILTER_MAX_CNT]; u32 nr_filters; }; struct btf_kfunc_set_tab { struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX]; struct btf_kfunc_hook_filter hook_filters[BTF_KFUNC_HOOK_MAX]; }; struct btf_id_dtor_kfunc_tab { u32 cnt; struct btf_id_dtor_kfunc dtors[]; }; struct btf_struct_ops_tab { u32 cnt; u32 capacity; struct bpf_struct_ops_desc ops[]; }; struct btf { void *data; struct btf_type **types; u32 *resolved_ids; u32 *resolved_sizes; const char *strings; void *nohdr_data; struct btf_header hdr; u32 nr_types; /* includes VOID for base BTF */ u32 types_size; u32 data_size; refcount_t refcnt; u32 id; struct rcu_head rcu; struct btf_kfunc_set_tab *kfunc_set_tab; struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab; struct btf_struct_metas *struct_meta_tab; struct btf_struct_ops_tab *struct_ops_tab; /* split BTF support */ struct btf *base_btf; u32 start_id; /* first type ID in this BTF (0 for base BTF) */ u32 start_str_off; /* first string offset (0 for base BTF) */ char name[MODULE_NAME_LEN]; bool kernel_btf; __u32 *base_id_map; /* map from distilled base BTF -> vmlinux BTF ids */ }; enum verifier_phase { CHECK_META, CHECK_TYPE, }; struct resolve_vertex { const struct btf_type *t; u32 type_id; u16 next_member; }; enum visit_state { NOT_VISITED, VISITED, RESOLVED, }; enum resolve_mode { RESOLVE_TBD, /* To Be Determined */ RESOLVE_PTR, /* Resolving for Pointer */ RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union * or array */ }; #define MAX_RESOLVE_DEPTH 32 struct btf_sec_info { u32 off; u32 len; }; struct btf_verifier_env { struct btf *btf; u8 *visit_states; struct resolve_vertex stack[MAX_RESOLVE_DEPTH]; struct bpf_verifier_log log; u32 log_type_id; u32 top_stack; enum verifier_phase phase; enum resolve_mode resolve_mode; }; static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_UNKN] = "UNKNOWN", [BTF_KIND_INT] = "INT", [BTF_KIND_PTR] = "PTR", [BTF_KIND_ARRAY] = "ARRAY", [BTF_KIND_STRUCT] = "STRUCT", [BTF_KIND_UNION] = "UNION", [BTF_KIND_ENUM] = "ENUM", [BTF_KIND_FWD] = "FWD", [BTF_KIND_TYPEDEF] = "TYPEDEF", [BTF_KIND_VOLATILE] = "VOLATILE", [BTF_KIND_CONST] = "CONST", [BTF_KIND_RESTRICT] = "RESTRICT", [BTF_KIND_FUNC] = "FUNC", [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", [BTF_KIND_TYPE_TAG] = "TYPE_TAG", [BTF_KIND_ENUM64] = "ENUM64", }; const char *btf_type_str(const struct btf_type *t) { return btf_kind_str[BTF_INFO_KIND(t->info)]; } /* Chunk size we use in safe copy of data to be shown. */ #define BTF_SHOW_OBJ_SAFE_SIZE 32 /* * This is the maximum size of a base type value (equivalent to a * 128-bit int); if we are at the end of our safe buffer and have * less than 16 bytes space we can't be assured of being able * to copy the next type safely, so in such cases we will initiate * a new copy. */ #define BTF_SHOW_OBJ_BASE_TYPE_SIZE 16 /* Type name size */ #define BTF_SHOW_NAME_SIZE 80 /* * The suffix of a type that indicates it cannot alias another type when * comparing BTF IDs for kfunc invocations. */ #define NOCAST_ALIAS_SUFFIX "___init" /* * Common data to all BTF show operations. Private show functions can add * their own data to a structure containing a struct btf_show and consult it * in the show callback. See btf_type_show() below. * * One challenge with showing nested data is we want to skip 0-valued * data, but in order to figure out whether a nested object is all zeros * we need to walk through it. As a result, we need to make two passes * when handling structs, unions and arrays; the first path simply looks * for nonzero data, while the second actually does the display. The first * pass is signalled by show->state.depth_check being set, and if we * encounter a non-zero value we set show->state.depth_to_show to * the depth at which we encountered it. When we have completed the * first pass, we will know if anything needs to be displayed if * depth_to_show > depth. See btf_[struct,array]_show() for the * implementation of this. * * Another problem is we want to ensure the data for display is safe to * access. To support this, the anonymous "struct {} obj" tracks the data * object and our safe copy of it. We copy portions of the data needed * to the object "copy" buffer, but because its size is limited to * BTF_SHOW_OBJ_COPY_LEN bytes, multiple copies may be required as we * traverse larger objects for display. * * The various data type show functions all start with a call to * btf_show_start_type() which returns a pointer to the safe copy * of the data needed (or if BTF_SHOW_UNSAFE is specified, to the * raw data itself). btf_show_obj_safe() is responsible for * using copy_from_kernel_nofault() to update the safe data if necessary * as we traverse the object's data. skbuff-like semantics are * used: * * - obj.head points to the start of the toplevel object for display * - obj.size is the size of the toplevel object * - obj.data points to the current point in the original data at * which our safe data starts. obj.data will advance as we copy * portions of the data. * * In most cases a single copy will suffice, but larger data structures * such as "struct task_struct" will require many copies. The logic in * btf_show_obj_safe() handles the logic that determines if a new * copy_from_kernel_nofault() is needed. */ struct btf_show { u64 flags; void *target; /* target of show operation (seq file, buffer) */ __printf(2, 0) void (*showfn)(struct btf_show *show, const char *fmt, va_list args); const struct btf *btf; /* below are used during iteration */ struct { u8 depth; u8 depth_to_show; u8 depth_check; u8 array_member:1, array_terminated:1; u16 array_encoding; u32 type_id; int status; /* non-zero for error */ const struct btf_type *type; const struct btf_member *member; char name[BTF_SHOW_NAME_SIZE]; /* space for member name/type */ } state; struct { u32 size; void *head; void *data; u8 safe[BTF_SHOW_OBJ_SAFE_SIZE]; } obj; }; struct btf_kind_operations { s32 (*check_meta)(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left); int (*resolve)(struct btf_verifier_env *env, const struct resolve_vertex *v); int (*check_member)(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type); int (*check_kflag_member)(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); void (*show)(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offsets, struct btf_show *show); }; static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; static struct btf_type btf_void; static int btf_resolve(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id); static int btf_func_check(struct btf_verifier_env *env, const struct btf_type *t); static bool btf_type_is_modifier(const struct btf_type *t) { /* Some of them is not strictly a C modifier * but they are grouped into the same bucket * for BTF concern: * A type (t) that refers to another * type through t->type AND its size cannot * be determined without following the t->type. * * ptr does not fall into this bucket * because its size is always sizeof(void *). */ switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_TYPEDEF: case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: case BTF_KIND_TYPE_TAG: return true; } return false; } bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void; } static bool btf_type_is_fwd(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; } static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } static bool btf_type_is_decl_tag(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG; } static bool btf_type_nosize(const struct btf_type *t) { return btf_type_is_void(t) || btf_type_is_fwd(t) || btf_type_is_func(t) || btf_type_is_func_proto(t) || btf_type_is_decl_tag(t); } static bool btf_type_nosize_or_null(const struct btf_type *t) { return !t || btf_type_nosize(t); } static bool btf_type_is_decl_tag_target(const struct btf_type *t) { return btf_type_is_func(t) || btf_type_is_struct(t) || btf_type_is_var(t) || btf_type_is_typedef(t); } bool btf_is_vmlinux(const struct btf *btf) { return btf->kernel_btf && !btf->base_btf; } u32 btf_nr_types(const struct btf *btf) { u32 total = 0; while (btf) { total += btf->nr_types; btf = btf->base_btf; } return total; } s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) { const struct btf_type *t; const char *tname; u32 i, total; total = btf_nr_types(btf); for (i = 1; i < total; i++) { t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != kind) continue; tname = btf_name_by_offset(btf, t->name_off); if (!strcmp(tname, name)) return i; } return -ENOENT; } s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) { struct btf *btf; s32 ret; int id; btf = bpf_get_btf_vmlinux(); if (IS_ERR(btf)) return PTR_ERR(btf); if (!btf) return -EINVAL; ret = btf_find_by_name_kind(btf, name, kind); /* ret is never zero, since btf_find_by_name_kind returns * positive btf_id or negative error. */ if (ret > 0) { btf_get(btf); *btf_p = btf; return ret; } /* If name is not found in vmlinux's BTF then search in module's BTFs */ spin_lock_bh(&btf_idr_lock); idr_for_each_entry(&btf_idr, btf, id) { if (!btf_is_module(btf)) continue; /* linear search could be slow hence unlock/lock * the IDR to avoiding holding it for too long */ btf_get(btf); spin_unlock_bh(&btf_idr_lock); ret = btf_find_by_name_kind(btf, name, kind); if (ret > 0) { *btf_p = btf; return ret; } btf_put(btf); spin_lock_bh(&btf_idr_lock); } spin_unlock_bh(&btf_idr_lock); return ret; } const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, u32 id, u32 *res_id) { const struct btf_type *t = btf_type_by_id(btf, id); while (btf_type_is_modifier(t)) { id = t->type; t = btf_type_by_id(btf, t->type); } if (res_id) *res_id = id; return t; } const struct btf_type *btf_type_resolve_ptr(const struct btf *btf, u32 id, u32 *res_id) { const struct btf_type *t; t = btf_type_skip_modifiers(btf, id, NULL); if (!btf_type_is_ptr(t)) return NULL; return btf_type_skip_modifiers(btf, t->type, res_id); } const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, u32 id, u32 *res_id) { const struct btf_type *ptype; ptype = btf_type_resolve_ptr(btf, id, res_id); if (ptype && btf_type_is_func_proto(ptype)) return ptype; return NULL; } /* Types that act only as a source, not sink or intermediate * type when resolving. */ static bool btf_type_is_resolve_source_only(const struct btf_type *t) { return btf_type_is_var(t) || btf_type_is_decl_tag(t) || btf_type_is_datasec(t); } /* What types need to be resolved? * * btf_type_is_modifier() is an obvious one. * * btf_type_is_struct() because its member refers to * another type (through member->type). * * btf_type_is_var() because the variable refers to * another type. btf_type_is_datasec() holds multiple * btf_type_is_var() types that need resolving. * * btf_type_is_array() because its element (array->type) * refers to another type. Array can be thought of a * special case of struct while array just has the same * member-type repeated by array->nelems of times. */ static bool btf_type_needs_resolve(const struct btf_type *t) { return btf_type_is_modifier(t) || btf_type_is_ptr(t) || btf_type_is_struct(t) || btf_type_is_array(t) || btf_type_is_var(t) || btf_type_is_func(t) || btf_type_is_decl_tag(t) || btf_type_is_datasec(t); } /* t->size can be used */ static bool btf_type_has_size(const struct btf_type *t) { switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_INT: case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_DATASEC: case BTF_KIND_FLOAT: case BTF_KIND_ENUM64: return true; } return false; } static const char *btf_int_encoding_str(u8 encoding) { if (encoding == 0) return "(none)"; else if (encoding == BTF_INT_SIGNED) return "SIGNED"; else if (encoding == BTF_INT_CHAR) return "CHAR"; else if (encoding == BTF_INT_BOOL) return "BOOL"; else return "UNKN"; } static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); } static const struct btf_array *btf_type_array(const struct btf_type *t) { return (const struct btf_array *)(t + 1); } static const struct btf_enum *btf_type_enum(const struct btf_type *t) { return (const struct btf_enum *)(t + 1); } static const struct btf_var *btf_type_var(const struct btf_type *t) { return (const struct btf_var *)(t + 1); } static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t) { return (const struct btf_decl_tag *)(t + 1); } static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t) { return (const struct btf_enum64 *)(t + 1); } static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; } static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { if (!BTF_STR_OFFSET_VALID(offset)) return false; while (offset < btf->start_str_off) btf = btf->base_btf; offset -= btf->start_str_off; return offset < btf->hdr.str_len; } static bool __btf_name_char_ok(char c, bool first) { if ((first ? !isalpha(c) : !isalnum(c)) && c != '_' && c != '.') return false; return true; } const char *btf_str_by_offset(const struct btf *btf, u32 offset) { while (offset < btf->start_str_off) btf = btf->base_btf; offset -= btf->start_str_off; if (offset < btf->hdr.str_len) return &btf->strings[offset]; return NULL; } static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); const char *src_limit; if (!__btf_name_char_ok(*src, true)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { if (!__btf_name_char_ok(*src, false)) return false; src++; } return !*src; } /* Allow any printable character in DATASEC names */ static bool btf_name_valid_section(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); const char *src_limit; if (!*src) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; while (*src && src < src_limit) { if (!isprint(*src)) return false; src++; } return !*src; } static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { const char *name; if (!offset) return "(anon)"; name = btf_str_by_offset(btf, offset); return name ?: "(invalid-name-offset)"; } const char *btf_name_by_offset(const struct btf *btf, u32 offset) { return btf_str_by_offset(btf, offset); } const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { while (type_id < btf->start_id) btf = btf->base_btf; type_id -= btf->start_id; if (type_id >= btf->nr_types) return NULL; return btf->types[type_id]; } EXPORT_SYMBOL_GPL(btf_type_by_id); /* * Regular int is not a bit field and it must be either * u8/u16/u32/u64 or __int128. */ static bool btf_type_int_is_regular(const struct btf_type *t) { u8 nr_bits, nr_bytes; u32 int_data; int_data = btf_type_int(t); nr_bits = BTF_INT_BITS(int_data); nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); if (BITS_PER_BYTE_MASKED(nr_bits) || BTF_INT_OFFSET(int_data) || (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) && nr_bytes != (2 * sizeof(u64)))) { return false; } return true; } /* * Check that given struct member is a regular int with expected * offset and size. */ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size) { const struct btf_type *t; u32 id, int_data; u8 nr_bits; id = m->type; t = btf_type_id_size(btf, &id, NULL); if (!t || !btf_type_is_int(t)) return false; int_data = btf_type_int(t); nr_bits = BTF_INT_BITS(int_data); if (btf_type_kflag(s)) { u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset); u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset); /* if kflag set, int should be a regular int and * bit offset should be at byte boundary. */ return !bitfield_size && BITS_ROUNDUP_BYTES(bit_offset) == expected_offset && BITS_ROUNDUP_BYTES(nr_bits) == expected_size; } if (BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(m->offset) || BITS_ROUNDUP_BYTES(m->offset) != expected_offset || BITS_PER_BYTE_MASKED(nr_bits) || BITS_ROUNDUP_BYTES(nr_bits) != expected_size) return false; return true; } /* Similar to btf_type_skip_modifiers() but does not skip typedefs. */ static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf, u32 id) { const struct btf_type *t = btf_type_by_id(btf, id); while (btf_type_is_modifier(t) && BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) { t = btf_type_by_id(btf, t->type); } return t; } #define BTF_SHOW_MAX_ITER 10 #define BTF_KIND_BIT(kind) (1ULL << kind) /* * Populate show->state.name with type name information. * Format of type name is * * [.member_name = ] (type_name) */ static const char *btf_show_name(struct btf_show *show) { /* BTF_MAX_ITER array suffixes "[]" */ const char *array_suffixes = "[][][][][][][][][][]"; const char *array_suffix = &array_suffixes[strlen(array_suffixes)]; /* BTF_MAX_ITER pointer suffixes "*" */ const char *ptr_suffixes = "**********"; const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)]; const char *name = NULL, *prefix = "", *parens = ""; const struct btf_member *m = show->state.member; const struct btf_type *t; const struct btf_array *array; u32 id = show->state.type_id; const char *member = NULL; bool show_member = false; u64 kinds = 0; int i; show->state.name[0] = '\0'; /* * Don't show type name if we're showing an array member; * in that case we show the array type so don't need to repeat * ourselves for each member. */ if (show->state.array_member) return ""; /* Retrieve member name, if any. */ if (m) { member = btf_name_by_offset(show->btf, m->name_off); show_member = strlen(member) > 0; id = m->type; } /* * Start with type_id, as we have resolved the struct btf_type * * via btf_modifier_show() past the parent typedef to the child * struct, int etc it is defined as. In such cases, the type_id * still represents the starting type while the struct btf_type * * in our show->state points at the resolved type of the typedef. */ t = btf_type_by_id(show->btf, id); if (!t) return ""; /* * The goal here is to build up the right number of pointer and * array suffixes while ensuring the type name for a typedef * is represented. Along the way we accumulate a list of * BTF kinds we have encountered, since these will inform later * display; for example, pointer types will not require an * opening "{" for struct, we will just display the pointer value. * * We also want to accumulate the right number of pointer or array * indices in the format string while iterating until we get to * the typedef/pointee/array member target type. * * We start by pointing at the end of pointer and array suffix * strings; as we accumulate pointers and arrays we move the pointer * or array string backwards so it will show the expected number of * '*' or '[]' for the type. BTF_SHOW_MAX_ITER of nesting of pointers * and/or arrays and typedefs are supported as a precaution. * * We also want to get typedef name while proceeding to resolve * type it points to so that we can add parentheses if it is a * "typedef struct" etc. */ for (i = 0; i < BTF_SHOW_MAX_ITER; i++) { switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_TYPEDEF: if (!name) name = btf_name_by_offset(show->btf, t->name_off); kinds |= BTF_KIND_BIT(BTF_KIND_TYPEDEF); id = t->type; break; case BTF_KIND_ARRAY: kinds |= BTF_KIND_BIT(BTF_KIND_ARRAY); parens = "["; if (!t) return ""; array = btf_type_array(t); if (array_suffix > array_suffixes) array_suffix -= 2; id = array->type; break; case BTF_KIND_PTR: kinds |= BTF_KIND_BIT(BTF_KIND_PTR); if (ptr_suffix > ptr_suffixes) ptr_suffix -= 1; id = t->type; break; default: id = 0; break; } if (!id) break; t = btf_type_skip_qualifiers(show->btf, id); } /* We may not be able to represent this type; bail to be safe */ if (i == BTF_SHOW_MAX_ITER) return ""; if (!name) name = btf_name_by_offset(show->btf, t->name_off); switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_STRUCT: case BTF_KIND_UNION: prefix = BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT ? "struct" : "union"; /* if it's an array of struct/union, parens is already set */ if (!(kinds & (BTF_KIND_BIT(BTF_KIND_ARRAY)))) parens = "{"; break; case BTF_KIND_ENUM: case BTF_KIND_ENUM64: prefix = "enum"; break; default: break; } /* pointer does not require parens */ if (kinds & BTF_KIND_BIT(BTF_KIND_PTR)) parens = ""; /* typedef does not require struct/union/enum prefix */ if (kinds & BTF_KIND_BIT(BTF_KIND_TYPEDEF)) prefix = ""; if (!name) name = ""; /* Even if we don't want type name info, we want parentheses etc */ if (show->flags & BTF_SHOW_NONAME) snprintf(show->state.name, sizeof(show->state.name), "%s", parens); else snprintf(show->state.name, sizeof(show->state.name), "%s%s%s(%s%s%s%s%s%s)%s", /* first 3 strings comprise ".member = " */ show_member ? "." : "", show_member ? member : "", show_member ? " = " : "", /* ...next is our prefix (struct, enum, etc) */ prefix, strlen(prefix) > 0 && strlen(name) > 0 ? " " : "", /* ...this is the type name itself */ name, /* ...suffixed by the appropriate '*', '[]' suffixes */ strlen(ptr_suffix) > 0 ? " " : "", ptr_suffix, array_suffix, parens); return show->state.name; } static const char *__btf_show_indent(struct btf_show *show) { const char *indents = " "; const char *indent = &indents[strlen(indents)]; if ((indent - show->state.depth) >= indents) return indent - show->state.depth; return indents; } static const char *btf_show_indent(struct btf_show *show) { return show->flags & BTF_SHOW_COMPACT ? "" : __btf_show_indent(show); } static const char *btf_show_newline(struct btf_show *show) { return show->flags & BTF_SHOW_COMPACT ? "" : "\n"; } static const char *btf_show_delim(struct btf_show *show) { if (show->state.depth == 0) return ""; if ((show->flags & BTF_SHOW_COMPACT) && show->state.type && BTF_INFO_KIND(show->state.type->info) == BTF_KIND_UNION) return "|"; return ","; } __printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...) { va_list args; if (!show->state.depth_check) { va_start(args, fmt); show->showfn(show, fmt, args); va_end(args); } } /* Macros are used here as btf_show_type_value[s]() prepends and appends * format specifiers to the format specifier passed in; these do the work of * adding indentation, delimiters etc while the caller simply has to specify * the type value(s) in the format specifier + value(s). */ #define btf_show_type_value(show, fmt, value) \ do { \ if ((value) != (__typeof__(value))0 || \ (show->flags & BTF_SHOW_ZERO) || \ show->state.depth == 0) { \ btf_show(show, "%s%s" fmt "%s%s", \ btf_show_indent(show), \ btf_show_name(show), \ value, btf_show_delim(show), \ btf_show_newline(show)); \ if (show->state.depth > show->state.depth_to_show) \ show->state.depth_to_show = show->state.depth; \ } \ } while (0) #define btf_show_type_values(show, fmt, ...) \ do { \ btf_show(show, "%s%s" fmt "%s%s", btf_show_indent(show), \ btf_show_name(show), \ __VA_ARGS__, btf_show_delim(show), \ btf_show_newline(show)); \ if (show->state.depth > show->state.depth_to_show) \ show->state.depth_to_show = show->state.depth; \ } while (0) /* How much is left to copy to safe buffer after @data? */ static int btf_show_obj_size_left(struct btf_show *show, void *data) { return show->obj.head + show->obj.size - data; } /* Is object pointed to by @data of @size already copied to our safe buffer? */ static bool btf_show_obj_is_safe(struct btf_show *show, void *data, int size) { return data >= show->obj.data && (data + size) < (show->obj.data + BTF_SHOW_OBJ_SAFE_SIZE); } /* * If object pointed to by @data of @size falls within our safe buffer, return * the equivalent pointer to the same safe data. Assumes * copy_from_kernel_nofault() has already happened and our safe buffer is * populated. */ static void *__btf_show_obj_safe(struct btf_show *show, void *data, int size) { if (btf_show_obj_is_safe(show, data, size)) return show->obj.safe + (data - show->obj.data); return NULL; } /* * Return a safe-to-access version of data pointed to by @data. * We do this by copying the relevant amount of information * to the struct btf_show obj.safe buffer using copy_from_kernel_nofault(). * * If BTF_SHOW_UNSAFE is specified, just return data as-is; no * safe copy is needed. * * Otherwise we need to determine if we have the required amount * of data (determined by the @data pointer and the size of the * largest base type we can encounter (represented by * BTF_SHOW_OBJ_BASE_TYPE_SIZE). Having that much data ensures * that we will be able to print some of the current object, * and if more is needed a copy will be triggered. * Some objects such as structs will not fit into the buffer; * in such cases additional copies when we iterate over their * members may be needed. * * btf_show_obj_safe() is used to return a safe buffer for * btf_show_start_type(); this ensures that as we recurse into * nested types we always have safe data for the given type. * This approach is somewhat wasteful; it's possible for example * that when iterating over a large union we'll end up copying the * same data repeatedly, but the goal is safety not performance. * We use stack data as opposed to per-CPU buffers because the * iteration over a type can take some time, and preemption handling * would greatly complicate use of the safe buffer. */ static void *btf_show_obj_safe(struct btf_show *show, const struct btf_type *t, void *data) { const struct btf_type *rt; int size_left, size; void *safe = NULL; if (show->flags & BTF_SHOW_UNSAFE) return data; rt = btf_resolve_size(show->btf, t, &size); if (IS_ERR(rt)) { show->state.status = PTR_ERR(rt); return NULL; } /* * Is this toplevel object? If so, set total object size and * initialize pointers. Otherwise check if we still fall within * our safe object data. */ if (show->state.depth == 0) { show->obj.size = size; show->obj.head = data; } else { /* * If the size of the current object is > our remaining * safe buffer we _may_ need to do a new copy. However * consider the case of a nested struct; it's size pushes * us over the safe buffer limit, but showing any individual * struct members does not. In such cases, we don't need * to initiate a fresh copy yet; however we definitely need * at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes left * in our buffer, regardless of the current object size. * The logic here is that as we resolve types we will * hit a base type at some point, and we need to be sure * the next chunk of data is safely available to display * that type info safely. We cannot rely on the size of * the current object here because it may be much larger * than our current buffer (e.g. task_struct is 8k). * All we want to do here is ensure that we can print the * next basic type, which we can if either * - the current type size is within the safe buffer; or * - at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes are left in * the safe buffer. */ safe = __btf_show_obj_safe(show, data, min(size, BTF_SHOW_OBJ_BASE_TYPE_SIZE)); } /* * We need a new copy to our safe object, either because we haven't * yet copied and are initializing safe data, or because the data * we want falls outside the boundaries of the safe object. */ if (!safe) { size_left = btf_show_obj_size_left(show, data); if (size_left > BTF_SHOW_OBJ_SAFE_SIZE) size_left = BTF_SHOW_OBJ_SAFE_SIZE; show->state.status = copy_from_kernel_nofault(show->obj.safe, data, size_left); if (!show->state.status) { show->obj.data = data; safe = show->obj.safe; } } return safe; } /* * Set the type we are starting to show and return a safe data pointer * to be used for showing the associated data. */ static void *btf_show_start_type(struct btf_show *show, const struct btf_type *t, u32 type_id, void *data) { show->state.type = t; show->state.type_id = type_id; show->state.name[0] = '\0'; return btf_show_obj_safe(show, t, data); } static void btf_show_end_type(struct btf_show *show) { show->state.type = NULL; show->state.type_id = 0; show->state.name[0] = '\0'; } static void *btf_show_start_aggr_type(struct btf_show *show, const struct btf_type *t, u32 type_id, void *data) { void *safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return safe_data; btf_show(show, "%s%s%s", btf_show_indent(show), btf_show_name(show), btf_show_newline(show)); show->state.depth++; return safe_data; } static void btf_show_end_aggr_type(struct btf_show *show, const char *suffix) { show->state.depth--; btf_show(show, "%s%s%s%s", btf_show_indent(show), suffix, btf_show_delim(show), btf_show_newline(show)); btf_show_end_type(show); } static void btf_show_start_member(struct btf_show *show, const struct btf_member *m) { show->state.member = m; } static void btf_show_start_array_member(struct btf_show *show) { show->state.array_member = 1; btf_show_start_member(show, NULL); } static void btf_show_end_member(struct btf_show *show) { show->state.member = NULL; } static void btf_show_end_array_member(struct btf_show *show) { show->state.array_member = 0; btf_show_end_member(show); } static void *btf_show_start_array_type(struct btf_show *show, const struct btf_type *t, u32 type_id, u16 array_encoding, void *data) { show->state.array_encoding = array_encoding; show->state.array_terminated = 0; return btf_show_start_aggr_type(show, t, type_id, data); } static void btf_show_end_array_type(struct btf_show *show) { show->state.array_encoding = 0; show->state.array_terminated = 0; btf_show_end_aggr_type(show, "]"); } static void *btf_show_start_struct_type(struct btf_show *show, const struct btf_type *t, u32 type_id, void *data) { return btf_show_start_aggr_type(show, t, type_id, data); } static void btf_show_end_struct_type(struct btf_show *show) { btf_show_end_aggr_type(show, "}"); } __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { va_list args; va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; va_list args; if (!bpf_verifier_log_needed(log)) return; va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, const struct btf_type *t, bool log_details, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; struct btf *btf = env->btf; va_list args; if (!bpf_verifier_log_needed(log)) return; if (log->level == BPF_LOG_KERNEL) { /* btf verifier prints all types it is processing via * btf_verifier_log_type(..., fmt = NULL). * Skip those prints for in-kernel BTF verification. */ if (!fmt) return; /* Skip logging when loading module BTF with mismatches permitted */ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) return; } __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, btf_type_str(t), __btf_name_by_offset(btf, t->name_off), log_details ? " " : ""); if (log_details) btf_type_ops(t)->log_details(env, t); if (fmt && *fmt) { __btf_verifier_log(log, " "); va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __btf_verifier_log(log, "\n"); } #define btf_verifier_log_type(env, t, ...) \ __btf_verifier_log_type((env), (t), true, __VA_ARGS__) #define btf_verifier_log_basic(env, t, ...) \ __btf_verifier_log_type((env), (t), false, __VA_ARGS__) __printf(4, 5) static void btf_verifier_log_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; struct btf *btf = env->btf; va_list args; if (!bpf_verifier_log_needed(log)) return; if (log->level == BPF_LOG_KERNEL) { if (!fmt) return; /* Skip logging when loading module BTF with mismatches permitted */ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) return; } /* The CHECK_META phase already did a btf dump. * * If member is logged again, it must hit an error in * parsing this member. It is useful to print out which * struct this member belongs to. */ if (env->phase != CHECK_META) btf_verifier_log_type(env, struct_type, NULL); if (btf_type_kflag(struct_type)) __btf_verifier_log(log, "\t%s type_id=%u bitfield_size=%u bits_offset=%u", __btf_name_by_offset(btf, member->name_off), member->type, BTF_MEMBER_BITFIELD_SIZE(member->offset), BTF_MEMBER_BIT_OFFSET(member->offset)); else __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", __btf_name_by_offset(btf, member->name_off), member->type, member->offset); if (fmt && *fmt) { __btf_verifier_log(log, " "); va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __btf_verifier_log(log, "\n"); } __printf(4, 5) static void btf_verifier_log_vsi(struct btf_verifier_env *env, const struct btf_type *datasec_type, const struct btf_var_secinfo *vsi, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; va_list args; if (!bpf_verifier_log_needed(log)) return; if (log->level == BPF_LOG_KERNEL && !fmt) return; if (env->phase != CHECK_META) btf_verifier_log_type(env, datasec_type, NULL); __btf_verifier_log(log, "\t type_id=%u offset=%u size=%u", vsi->type, vsi->offset, vsi->size); if (fmt && *fmt) { __btf_verifier_log(log, " "); va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __btf_verifier_log(log, "\n"); } static void btf_verifier_log_hdr(struct btf_verifier_env *env, u32 btf_data_size) { struct bpf_verifier_log *log = &env->log; const struct btf *btf = env->btf; const struct btf_header *hdr; if (!bpf_verifier_log_needed(log)) return; if (log->level == BPF_LOG_KERNEL) return; hdr = &btf->hdr; __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); __btf_verifier_log(log, "version: %u\n", hdr->version); __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags); __btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len); __btf_verifier_log(log, "type_off: %u\n", hdr->type_off); __btf_verifier_log(log, "type_len: %u\n", hdr->type_len); __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size); } static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) { struct btf *btf = env->btf; if (btf->types_size == btf->nr_types) { /* Expand 'types' array */ struct btf_type **new_types; u32 expand_by, new_size; if (btf->start_id + btf->types_size == BTF_MAX_TYPE) { btf_verifier_log(env, "Exceeded max num of types"); return -E2BIG; } expand_by = max_t(u32, btf->types_size >> 2, 16); new_size = min_t(u32, BTF_MAX_TYPE, btf->types_size + expand_by); new_types = kvcalloc(new_size, sizeof(*new_types), GFP_KERNEL | __GFP_NOWARN); if (!new_types) return -ENOMEM; if (btf->nr_types == 0) { if (!btf->base_btf) { /* lazily init VOID type */ new_types[0] = &btf_void; btf->nr_types++; } } else { memcpy(new_types, btf->types, sizeof(*btf->types) * btf->nr_types); } kvfree(btf->types); btf->types = new_types; btf->types_size = new_size; } btf->types[btf->nr_types++] = t; return 0; } static int btf_alloc_id(struct btf *btf) { int id; idr_preload(GFP_KERNEL); spin_lock_bh(&btf_idr_lock); id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC); if (id > 0) btf->id = id; spin_unlock_bh(&btf_idr_lock); idr_preload_end(); if (WARN_ON_ONCE(!id)) return -ENOSPC; return id > 0 ? 0 : id; } static void btf_free_id(struct btf *btf) { unsigned long flags; /* * In map-in-map, calling map_delete_elem() on outer * map will call bpf_map_put on the inner map. * It will then eventually call btf_free_id() * on the inner map. Some of the map_delete_elem() * implementation may have irq disabled, so * we need to use the _irqsave() version instead * of the _bh() version. */ spin_lock_irqsave(&btf_idr_lock, flags); idr_remove(&btf_idr, btf->id); spin_unlock_irqrestore(&btf_idr_lock, flags); } static void btf_free_kfunc_set_tab(struct btf *btf) { struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab; int hook; if (!tab) return; for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) kfree(tab->sets[hook]); kfree(tab); btf->kfunc_set_tab = NULL; } static void btf_free_dtor_kfunc_tab(struct btf *btf) { struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; if (!tab) return; kfree(tab); btf->dtor_kfunc_tab = NULL; } static void btf_struct_metas_free(struct btf_struct_metas *tab) { int i; if (!tab) return; for (i = 0; i < tab->cnt; i++) btf_record_free(tab->types[i].record); kfree(tab); } static void btf_free_struct_meta_tab(struct btf *btf) { struct btf_struct_metas *tab = btf->struct_meta_tab; btf_struct_metas_free(tab); btf->struct_meta_tab = NULL; } static void btf_free_struct_ops_tab(struct btf *btf) { struct btf_struct_ops_tab *tab = btf->struct_ops_tab; u32 i; if (!tab) return; for (i = 0; i < tab->cnt; i++) bpf_struct_ops_desc_release(&tab->ops[i]); kfree(tab); btf->struct_ops_tab = NULL; } static void btf_free(struct btf *btf) { btf_free_struct_meta_tab(btf); btf_free_dtor_kfunc_tab(btf); btf_free_kfunc_set_tab(btf); btf_free_struct_ops_tab(btf); kvfree(btf->types); kvfree(btf->resolved_sizes); kvfree(btf->resolved_ids); /* vmlinux does not allocate btf->data, it simply points it at * __start_BTF. */ if (!btf_is_vmlinux(btf)) kvfree(btf->data); kvfree(btf->base_id_map); kfree(btf); } static void btf_free_rcu(struct rcu_head *rcu) { struct btf *btf = container_of(rcu, struct btf, rcu); btf_free(btf); } const char *btf_get_name(const struct btf *btf) { return btf->name; } void btf_get(struct btf *btf) { refcount_inc(&btf->refcnt); } void btf_put(struct btf *btf) { if (btf && refcount_dec_and_test(&btf->refcnt)) { btf_free_id(btf); call_rcu(&btf->rcu, btf_free_rcu); } } struct btf *btf_base_btf(const struct btf *btf) { return btf->base_btf; } const struct btf_header *btf_header(const struct btf *btf) { return &btf->hdr; } void btf_set_base_btf(struct btf *btf, const struct btf *base_btf) { btf->base_btf = (struct btf *)base_btf; btf->start_id = btf_nr_types(base_btf); btf->start_str_off = base_btf->hdr.str_len; } static int env_resolve_init(struct btf_verifier_env *env) { struct btf *btf = env->btf; u32 nr_types = btf->nr_types; u32 *resolved_sizes = NULL; u32 *resolved_ids = NULL; u8 *visit_states = NULL; resolved_sizes = kvcalloc(nr_types, sizeof(*resolved_sizes), GFP_KERNEL | __GFP_NOWARN); if (!resolved_sizes) goto nomem; resolved_ids = kvcalloc(nr_types, sizeof(*resolved_ids), GFP_KERNEL | __GFP_NOWARN); if (!resolved_ids) goto nomem; visit_states = kvcalloc(nr_types, sizeof(*visit_states), GFP_KERNEL | __GFP_NOWARN); if (!visit_states) goto nomem; btf->resolved_sizes = resolved_sizes; btf->resolved_ids = resolved_ids; env->visit_states = visit_states; return 0; nomem: kvfree(resolved_sizes); kvfree(resolved_ids); kvfree(visit_states); return -ENOMEM; } static void btf_verifier_env_free(struct btf_verifier_env *env) { kvfree(env->visit_states); kfree(env); } static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, const struct btf_type *next_type) { switch (env->resolve_mode) { case RESOLVE_TBD: /* int, enum or void is a sink */ return !btf_type_needs_resolve(next_type); case RESOLVE_PTR: /* int, enum, void, struct, array, func or func_proto is a sink * for ptr */ return !btf_type_is_modifier(next_type) && !btf_type_is_ptr(next_type); case RESOLVE_STRUCT_OR_ARRAY: /* int, enum, void, ptr, func or func_proto is a sink * for struct and array */ return !btf_type_is_modifier(next_type) && !btf_type_is_array(next_type) && !btf_type_is_struct(next_type); default: BUG(); } } static bool env_type_is_resolved(const struct btf_verifier_env *env, u32 type_id) { /* base BTF types should be resolved by now */ if (type_id < env->btf->start_id) return true; return env->visit_states[type_id - env->btf->start_id] == RESOLVED; } static int env_stack_push(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) { const struct btf *btf = env->btf; struct resolve_vertex *v; if (env->top_stack == MAX_RESOLVE_DEPTH) return -E2BIG; if (type_id < btf->start_id || env->visit_states[type_id - btf->start_id] != NOT_VISITED) return -EEXIST; env->visit_states[type_id - btf->start_id] = VISITED; v = &env->stack[env->top_stack++]; v->t = t; v->type_id = type_id; v->next_member = 0; if (env->resolve_mode == RESOLVE_TBD) { if (btf_type_is_ptr(t)) env->resolve_mode = RESOLVE_PTR; else if (btf_type_is_struct(t) || btf_type_is_array(t)) env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY; } return 0; } static void env_stack_set_next_member(struct btf_verifier_env *env, u16 next_member) { env->stack[env->top_stack - 1].next_member = next_member; } static void env_stack_pop_resolved(struct btf_verifier_env *env, u32 resolved_type_id, u32 resolved_size) { u32 type_id = env->stack[--(env->top_stack)].type_id; struct btf *btf = env->btf; type_id -= btf->start_id; /* adjust to local type id */ btf->resolved_sizes[type_id] = resolved_size; btf->resolved_ids[type_id] = resolved_type_id; env->visit_states[type_id] = RESOLVED; } static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) { return env->top_stack ? &env->stack[env->top_stack - 1] : NULL; } /* Resolve the size of a passed-in "type" * * type: is an array (e.g. u32 array[x][y]) * return type: type "u32[x][y]", i.e. BTF_KIND_ARRAY, * *type_size: (x * y * sizeof(u32)). Hence, *type_size always * corresponds to the return type. * *elem_type: u32 * *elem_id: id of u32 * *total_nelems: (x * y). Hence, individual elem size is * (*type_size / *total_nelems) * *type_id: id of type if it's changed within the function, 0 if not * * type: is not an array (e.g. const struct X) * return type: type "struct X" * *type_size: sizeof(struct X) * *elem_type: same as return type ("struct X") * *elem_id: 0 * *total_nelems: 1 * *type_id: id of type if it's changed within the function, 0 if not */ static const struct btf_type * __btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size, const struct btf_type **elem_type, u32 *elem_id, u32 *total_nelems, u32 *type_id) { const struct btf_type *array_type = NULL; const struct btf_array *array = NULL; u32 i, size, nelems = 1, id = 0; for (i = 0; i < MAX_RESOLVE_DEPTH; i++) { switch (BTF_INFO_KIND(type->info)) { /* type->size can be used */ case BTF_KIND_INT: case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FLOAT: case BTF_KIND_ENUM64: size = type->size; goto resolved; case BTF_KIND_PTR: size = sizeof(void *); goto resolved; /* Modifiers */ case BTF_KIND_TYPEDEF: case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: case BTF_KIND_TYPE_TAG: id = type->type; type = btf_type_by_id(btf, type->type); break; case BTF_KIND_ARRAY: if (!array_type) array_type = type; array = btf_type_array(type); if (nelems && array->nelems > U32_MAX / nelems) return ERR_PTR(-EINVAL); nelems *= array->nelems; type = btf_type_by_id(btf, array->type); break; /* type without size */ default: return ERR_PTR(-EINVAL); } } return ERR_PTR(-EINVAL); resolved: if (nelems && size > U32_MAX / nelems) return ERR_PTR(-EINVAL); *type_size = nelems * size; if (total_nelems) *total_nelems = nelems; if (elem_type) *elem_type = type; if (elem_id) *elem_id = array ? array->type : 0; if (type_id && id) *type_id = id; return array_type ? : type; } const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size) { return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL); } static u32 btf_resolved_type_id(const struct btf *btf, u32 type_id) { while (type_id < btf->start_id) btf = btf->base_btf; return btf->resolved_ids[type_id - btf->start_id]; } /* The input param "type_id" must point to a needs_resolve type */ static const struct btf_type *btf_type_id_resolve(const struct btf *btf, u32 *type_id) { *type_id = btf_resolved_type_id(btf, *type_id); return btf_type_by_id(btf, *type_id); } static u32 btf_resolved_type_size(const struct btf *btf, u32 type_id) { while (type_id < btf->start_id) btf = btf->base_btf; return btf->resolved_sizes[type_id - btf->start_id]; } const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size) { const struct btf_type *size_type; u32 size_type_id = *type_id; u32 size = 0; size_type = btf_type_by_id(btf, size_type_id); if (btf_type_nosize_or_null(size_type)) return NULL; if (btf_type_has_size(size_type)) { size = size_type->size; } else if (btf_type_is_array(size_type)) { size = btf_resolved_type_size(btf, size_type_id); } else if (btf_type_is_ptr(size_type)) { size = sizeof(void *); } else { if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) && !btf_type_is_var(size_type))) return NULL; size_type_id = btf_resolved_type_id(btf, size_type_id); size_type = btf_type_by_id(btf, size_type_id); if (btf_type_nosize_or_null(size_type)) return NULL; else if (btf_type_has_size(size_type)) size = size_type->size; else if (btf_type_is_array(size_type)) size = btf_resolved_type_size(btf, size_type_id); else if (btf_type_is_ptr(size_type)) size = sizeof(void *); else return NULL; } *type_id = size_type_id; if (ret_size) *ret_size = size; return size_type; } static int btf_df_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { btf_verifier_log_basic(env, struct_type, "Unsupported check_member"); return -EINVAL; } static int btf_df_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { btf_verifier_log_basic(env, struct_type, "Unsupported check_kflag_member"); return -EINVAL; } /* Used for ptr, array struct/union and float type members. * int, enum and modifier types have their specific callback functions. */ static int btf_generic_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) { btf_verifier_log_member(env, struct_type, member, "Invalid member bitfield_size"); return -EINVAL; } /* bitfield size is 0, so member->offset represents bit offset only. * It is safe to call non kflag check_member variants. */ return btf_type_ops(member_type)->check_member(env, struct_type, member, member_type); } static int btf_df_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { btf_verifier_log_basic(env, v->t, "Unsupported resolve"); return -EINVAL; } static void btf_df_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offsets, struct btf_show *show) { btf_show(show, "<unsupported kind:%u>", BTF_INFO_KIND(t->info)); } static int btf_int_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 int_data = btf_type_int(member_type); u32 struct_bits_off = member->offset; u32 struct_size = struct_type->size; u32 nr_copy_bits; u32 bytes_offset; if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) { btf_verifier_log_member(env, struct_type, member, "bits_offset exceeds U32_MAX"); return -EINVAL; } struct_bits_off += BTF_INT_OFFSET(int_data); bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); nr_copy_bits = BTF_INT_BITS(int_data) + BITS_PER_BYTE_MASKED(struct_bits_off); if (nr_copy_bits > BITS_PER_U128) { btf_verifier_log_member(env, struct_type, member, "nr_copy_bits exceeds 128"); return -EINVAL; } if (struct_size < bytes_offset || struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static int btf_int_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset; u32 int_data = btf_type_int(member_type); u32 struct_size = struct_type->size; u32 nr_copy_bits; /* a regular int type is required for the kflag int member */ if (!btf_type_int_is_regular(member_type)) { btf_verifier_log_member(env, struct_type, member, "Invalid member base type"); return -EINVAL; } /* check sanity of bitfield size */ nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); nr_int_data_bits = BTF_INT_BITS(int_data); if (!nr_bits) { /* Not a bitfield member, member offset must be at byte * boundary. */ if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Invalid member offset"); return -EINVAL; } nr_bits = nr_int_data_bits; } else if (nr_bits > nr_int_data_bits) { btf_verifier_log_member(env, struct_type, member, "Invalid member bitfield_size"); return -EINVAL; } bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); if (nr_copy_bits > BITS_PER_U128) { btf_verifier_log_member(env, struct_type, member, "nr_copy_bits exceeds 128"); return -EINVAL; } if (struct_size < bytes_offset || struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static s32 btf_int_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { u32 int_data, nr_bits, meta_needed = sizeof(int_data); u16 encoding; if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } int_data = btf_type_int(t); if (int_data & ~BTF_INT_MASK) { btf_verifier_log_basic(env, t, "Invalid int_data:%x", int_data); return -EINVAL; } nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); if (nr_bits > BITS_PER_U128) { btf_verifier_log_type(env, t, "nr_bits exceeds %zu", BITS_PER_U128); return -EINVAL; } if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) { btf_verifier_log_type(env, t, "nr_bits exceeds type_size"); return -EINVAL; } /* * Only one of the encoding bits is allowed and it * should be sufficient for the pretty print purpose (i.e. decoding). * Multiple bits can be allowed later if it is found * to be insufficient. */ encoding = BTF_INT_ENCODING(int_data); if (encoding && encoding != BTF_INT_SIGNED && encoding != BTF_INT_CHAR && encoding != BTF_INT_BOOL) { btf_verifier_log_type(env, t, "Unsupported encoding"); return -ENOTSUPP; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static void btf_int_log(struct btf_verifier_env *env, const struct btf_type *t) { int int_data = btf_type_int(t); btf_verifier_log(env, "size=%u bits_offset=%u nr_bits=%u encoding=%s", t->size, BTF_INT_OFFSET(int_data), BTF_INT_BITS(int_data), btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } static void btf_int128_print(struct btf_show *show, void *data) { /* data points to a __int128 number. * Suppose * int128_num = *(__int128 *)data; * The below formulas shows what upper_num and lower_num represents: * upper_num = int128_num >> 64; * lower_num = int128_num & 0xffffffffFFFFFFFFULL; */ u64 upper_num, lower_num; #ifdef __BIG_ENDIAN_BITFIELD upper_num = *(u64 *)data; lower_num = *(u64 *)(data + 8); #else upper_num = *(u64 *)(data + 8); lower_num = *(u64 *)data; #endif if (upper_num == 0) btf_show_type_value(show, "0x%llx", lower_num); else btf_show_type_values(show, "0x%llx%016llx", upper_num, lower_num); } static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, u16 right_shift_bits) { u64 upper_num, lower_num; #ifdef __BIG_ENDIAN_BITFIELD upper_num = print_num[0]; lower_num = print_num[1]; #else upper_num = print_num[1]; lower_num = print_num[0]; #endif /* shake out un-needed bits by shift/or operations */ if (left_shift_bits >= 64) { upper_num = lower_num << (left_shift_bits - 64); lower_num = 0; } else { upper_num = (upper_num << left_shift_bits) | (lower_num >> (64 - left_shift_bits)); lower_num = lower_num << left_shift_bits; } if (right_shift_bits >= 64) { lower_num = upper_num >> (right_shift_bits - 64); upper_num = 0; } else { lower_num = (lower_num >> right_shift_bits) | (upper_num << (64 - right_shift_bits)); upper_num = upper_num >> right_shift_bits; } #ifdef __BIG_ENDIAN_BITFIELD print_num[0] = upper_num; print_num[1] = lower_num; #else print_num[0] = lower_num; print_num[1] = upper_num; #endif } static void btf_bitfield_show(void *data, u8 bits_offset, u8 nr_bits, struct btf_show *show) { u16 left_shift_bits, right_shift_bits; u8 nr_copy_bytes; u8 nr_copy_bits; u64 print_num[2] = {}; nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); memcpy(print_num, data, nr_copy_bytes); #ifdef __BIG_ENDIAN_BITFIELD left_shift_bits = bits_offset; #else left_shift_bits = BITS_PER_U128 - nr_copy_bits; #endif right_shift_bits = BITS_PER_U128 - nr_bits; btf_int128_shift(print_num, left_shift_bits, right_shift_bits); btf_int128_print(show, print_num); } static void btf_int_bits_show(const struct btf *btf, const struct btf_type *t, void *data, u8 bits_offset, struct btf_show *show) { u32 int_data = btf_type_int(t); u8 nr_bits = BTF_INT_BITS(int_data); u8 total_bits_offset; /* * bits_offset is at most 7. * BTF_INT_OFFSET() cannot exceed 128 bits. */ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); btf_bitfield_show(data, bits_offset, nr_bits, show); } static void btf_int_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { u32 int_data = btf_type_int(t); u8 encoding = BTF_INT_ENCODING(int_data); bool sign = encoding & BTF_INT_SIGNED; u8 nr_bits = BTF_INT_BITS(int_data); void *safe_data; safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return; if (bits_offset || BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(nr_bits)) { btf_int_bits_show(btf, t, safe_data, bits_offset, show); goto out; } switch (nr_bits) { case 128: btf_int128_print(show, safe_data); break; case 64: if (sign) btf_show_type_value(show, "%lld", *(s64 *)safe_data); else btf_show_type_value(show, "%llu", *(u64 *)safe_data); break; case 32: if (sign) btf_show_type_value(show, "%d", *(s32 *)safe_data); else btf_show_type_value(show, "%u", *(u32 *)safe_data); break; case 16: if (sign) btf_show_type_value(show, "%d", *(s16 *)safe_data); else btf_show_type_value(show, "%u", *(u16 *)safe_data); break; case 8: if (show->state.array_encoding == BTF_INT_CHAR) { /* check for null terminator */ if (show->state.array_terminated) break; if (*(char *)data == '\0') { show->state.array_terminated = 1; break; } if (isprint(*(char *)data)) { btf_show_type_value(show, "'%c'", *(char *)safe_data); break; } } if (sign) btf_show_type_value(show, "%d", *(s8 *)safe_data); else btf_show_type_value(show, "%u", *(u8 *)safe_data); break; default: btf_int_bits_show(btf, t, safe_data, bits_offset, show); break; } out: btf_show_end_type(show); } static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, .resolve = btf_df_resolve, .check_member = btf_int_check_member, .check_kflag_member = btf_int_check_kflag_member, .log_details = btf_int_log, .show = btf_int_show, }; static int btf_modifier_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { const struct btf_type *resolved_type; u32 resolved_type_id = member->type; struct btf_member resolved_member; struct btf *btf = env->btf; resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); if (!resolved_type) { btf_verifier_log_member(env, struct_type, member, "Invalid member"); return -EINVAL; } resolved_member = *member; resolved_member.type = resolved_type_id; return btf_type_ops(resolved_type)->check_member(env, struct_type, &resolved_member, resolved_type); } static int btf_modifier_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { const struct btf_type *resolved_type; u32 resolved_type_id = member->type; struct btf_member resolved_member; struct btf *btf = env->btf; resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); if (!resolved_type) { btf_verifier_log_member(env, struct_type, member, "Invalid member"); return -EINVAL; } resolved_member = *member; resolved_member.type = resolved_type_id; return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type, &resolved_member, resolved_type); } static int btf_ptr_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_size, struct_bits_off, bytes_offset; struct_size = struct_type->size; struct_bits_off = member->offset; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } if (struct_size - bytes_offset < sizeof(void *)) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static int btf_ref_type_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const char *value; if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (!BTF_TYPE_ID_VALID(t->type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; } /* typedef/type_tag type must have a valid name, and other ref types, * volatile, const, restrict, should have a null name. */ if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) { if (!t->name_off || !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } } else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) { value = btf_name_by_offset(env->btf, t->name_off); if (!value || !value[0]) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } } else { if (t->name_off) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } } btf_verifier_log_type(env, t, NULL); return 0; } static int btf_modifier_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *t = v->t; const struct btf_type *next_type; u32 next_type_id = t->type; struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); /* Figure out the resolved next_type_id with size. * They will be stored in the current modifier's * resolved_ids and resolved_sizes such that it can * save us a few type-following when we use it later (e.g. in * pretty print). */ if (!btf_type_id_size(btf, &next_type_id, NULL)) { if (env_type_is_resolved(env, next_type_id)) next_type = btf_type_id_resolve(btf, &next_type_id); /* "typedef void new_void", "const void"...etc */ if (!btf_type_is_void(next_type) && !btf_type_is_fwd(next_type) && !btf_type_is_func_proto(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } } env_stack_pop_resolved(env, next_type_id, 0); return 0; } static int btf_var_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *next_type; const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); if (btf_type_is_modifier(next_type)) { const struct btf_type *resolved_type; u32 resolved_type_id; resolved_type_id = next_type_id; resolved_type = btf_type_id_resolve(btf, &resolved_type_id); if (btf_type_is_ptr(resolved_type) && !env_type_is_resolve_sink(env, resolved_type) && !env_type_is_resolved(env, resolved_type_id)) return env_stack_push(env, resolved_type, resolved_type_id); } /* We must resolve to something concrete at this point, no * forward types or similar that would resolve to size of * zero is allowed. */ if (!btf_type_id_size(btf, &next_type_id, NULL)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } env_stack_pop_resolved(env, next_type_id, 0); return 0; } static int btf_ptr_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *next_type; const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); /* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY, * the modifier may have stopped resolving when it was resolved * to a ptr (last-resolved-ptr). * * We now need to continue from the last-resolved-ptr to * ensure the last-resolved-ptr will not referring back to * the current ptr (t). */ if (btf_type_is_modifier(next_type)) { const struct btf_type *resolved_type; u32 resolved_type_id; resolved_type_id = next_type_id; resolved_type = btf_type_id_resolve(btf, &resolved_type_id); if (btf_type_is_ptr(resolved_type) && !env_type_is_resolve_sink(env, resolved_type) && !env_type_is_resolved(env, resolved_type_id)) return env_stack_push(env, resolved_type, resolved_type_id); } if (!btf_type_id_size(btf, &next_type_id, NULL)) { if (env_type_is_resolved(env, next_type_id)) next_type = btf_type_id_resolve(btf, &next_type_id); if (!btf_type_is_void(next_type) && !btf_type_is_fwd(next_type) && !btf_type_is_func_proto(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } } env_stack_pop_resolved(env, next_type_id, 0); return 0; } static void btf_modifier_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { if (btf->resolved_ids) t = btf_type_id_resolve(btf, &type_id); else t = btf_type_skip_modifiers(btf, type_id, NULL); btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show); } static void btf_var_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { t = btf_type_id_resolve(btf, &type_id); btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show); } static void btf_ptr_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { void *safe_data; safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return; /* It is a hashed value unless BTF_SHOW_PTR_RAW is specified */ if (show->flags & BTF_SHOW_PTR_RAW) btf_show_type_value(show, "0x%px", *(void **)safe_data); else btf_show_type_value(show, "0x%p", *(void **)safe_data); btf_show_end_type(show); } static void btf_ref_type_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "type_id=%u", t->type); } static const struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, .check_kflag_member = btf_modifier_check_kflag_member, .log_details = btf_ref_type_log, .show = btf_modifier_show, }; static const struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_ref_type_log, .show = btf_ptr_show, }; static s32 btf_fwd_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (t->type) { btf_verifier_log_type(env, t, "type != 0"); return -EINVAL; } /* fwd type must have a valid name */ if (!t->name_off || !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return 0; } static void btf_fwd_type_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct"); } static const struct btf_kind_operations fwd_ops = { .check_meta = btf_fwd_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_fwd_type_log, .show = btf_df_show, }; static int btf_array_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off = member->offset; u32 struct_size, bytes_offset; u32 array_type_id, array_size; struct btf *btf = env->btf; if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } array_type_id = member->type; btf_type_id_size(btf, &array_type_id, &array_size); struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); if (struct_size - bytes_offset < array_size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static s32 btf_array_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_array *array = btf_type_array(t); u32 meta_needed = sizeof(*array); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } /* array type should not have a name */ if (t->name_off) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (t->size) { btf_verifier_log_type(env, t, "size != 0"); return -EINVAL; } /* Array elem type and index type cannot be in type void, * so !array->type and !array->index_type are not allowed. */ if (!array->type || !BTF_TYPE_ID_VALID(array->type)) { btf_verifier_log_type(env, t, "Invalid elem"); return -EINVAL; } if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) { btf_verifier_log_type(env, t, "Invalid index"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static int btf_array_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_array *array = btf_type_array(v->t); const struct btf_type *elem_type, *index_type; u32 elem_type_id, index_type_id; struct btf *btf = env->btf; u32 elem_size; /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); if (btf_type_nosize_or_null(index_type) || btf_type_is_resolve_source_only(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } if (!env_type_is_resolve_sink(env, index_type) && !env_type_is_resolved(env, index_type_id)) return env_stack_push(env, index_type, index_type_id); index_type = btf_type_id_size(btf, &index_type_id, NULL); if (!index_type || !btf_type_is_int(index_type) || !btf_type_int_is_regular(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); if (btf_type_nosize_or_null(elem_type) || btf_type_is_resolve_source_only(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; } if (!env_type_is_resolve_sink(env, elem_type) && !env_type_is_resolved(env, elem_type_id)) return env_stack_push(env, elem_type, elem_type_id); elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); if (!elem_type) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; } if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid array of int"); return -EINVAL; } if (array->nelems && elem_size > U32_MAX / array->nelems) { btf_verifier_log_type(env, v->t, "Array size overflows U32_MAX"); return -EINVAL; } env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems); return 0; } static void btf_array_log(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_array *array = btf_type_array(t); btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u", array->type, array->index_type, array->nelems); } static void __btf_array_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_array *array = btf_type_array(t); const struct btf_kind_operations *elem_ops; const struct btf_type *elem_type; u32 i, elem_size = 0, elem_type_id; u16 encoding = 0; elem_type_id = array->type; elem_type = btf_type_skip_modifiers(btf, elem_type_id, NULL); if (elem_type && btf_type_has_size(elem_type)) elem_size = elem_type->size; if (elem_type && btf_type_is_int(elem_type)) { u32 int_type = btf_type_int(elem_type); encoding = BTF_INT_ENCODING(int_type); /* * BTF_INT_CHAR encoding never seems to be set for * char arrays, so if size is 1 and element is * printable as a char, we'll do that. */ if (elem_size == 1) encoding = BTF_INT_CHAR; } if (!btf_show_start_array_type(show, t, type_id, encoding, data)) return; if (!elem_type) goto out; elem_ops = btf_type_ops(elem_type); for (i = 0; i < array->nelems; i++) { btf_show_start_array_member(show); elem_ops->show(btf, elem_type, elem_type_id, data, bits_offset, show); data += elem_size; btf_show_end_array_member(show); if (show->state.array_terminated) break; } out: btf_show_end_array_type(show); } static void btf_array_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_member *m = show->state.member; /* * First check if any members would be shown (are non-zero). * See comments above "struct btf_show" definition for more * details on how this works at a high-level. */ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) { if (!show->state.depth_check) { show->state.depth_check = show->state.depth + 1; show->state.depth_to_show = 0; } __btf_array_show(btf, t, type_id, data, bits_offset, show); show->state.member = m; if (show->state.depth_check != show->state.depth + 1) return; show->state.depth_check = 0; if (show->state.depth_to_show <= show->state.depth) return; /* * Reaching here indicates we have recursed and found * non-zero array member(s). */ } __btf_array_show(btf, t, type_id, data, bits_offset, show); } static const struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_array_log, .show = btf_array_show, }; static int btf_struct_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off = member->offset; u32 struct_size, bytes_offset; if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); if (struct_size - bytes_offset < member_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static s32 btf_struct_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; const struct btf_member *member; u32 meta_needed, last_offset; struct btf *btf = env->btf; u32 struct_size = t->size; u32 offset; u16 i; meta_needed = btf_type_vlen(t) * sizeof(*member); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } /* struct type either no name or a valid one */ if (t->name_off && !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); last_offset = 0; for_each_member(i, t, member) { if (!btf_name_offset_valid(btf, member->name_off)) { btf_verifier_log_member(env, t, member, "Invalid member name_offset:%u", member->name_off); return -EINVAL; } /* struct member either no name or a valid one */ if (member->name_off && !btf_name_valid_identifier(btf, member->name_off)) { btf_verifier_log_member(env, t, member, "Invalid name"); return -EINVAL; } /* A member cannot be in type void */ if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { btf_verifier_log_member(env, t, member, "Invalid type_id"); return -EINVAL; } offset = __btf_member_bit_offset(t, member); if (is_union && offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; } /* * ">" instead of ">=" because the last member could be * "char a[0];" */ if (last_offset > offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; } if (BITS_ROUNDUP_BYTES(offset) > struct_size) { btf_verifier_log_member(env, t, member, "Member bits_offset exceeds its struct size"); return -EINVAL; } btf_verifier_log_member(env, t, member, NULL); last_offset = offset; } return meta_needed; } static int btf_struct_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_member *member; int err; u16 i; /* Before continue resolving the next_member, * ensure the last member is indeed resolved to a * type with size info. */ if (v->next_member) { const struct btf_type *last_member_type; const struct btf_member *last_member; u32 last_member_type_id; last_member = btf_type_member(v->t) + v->next_member - 1; last_member_type_id = last_member->type; if (WARN_ON_ONCE(!env_type_is_resolved(env, last_member_type_id))) return -EINVAL; last_member_type = btf_type_by_id(env->btf, last_member_type_id); if (btf_type_kflag(v->t)) err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t, last_member, last_member_type); else err = btf_type_ops(last_member_type)->check_member(env, v->t, last_member, last_member_type); if (err) return err; } for_each_member_from(i, v->next_member, v->t, member) { u32 member_type_id = member->type; const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); if (btf_type_nosize_or_null(member_type) || btf_type_is_resolve_source_only(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; } if (!env_type_is_resolve_sink(env, member_type) && !env_type_is_resolved(env, member_type_id)) { env_stack_set_next_member(env, i + 1); return env_stack_push(env, member_type, member_type_id); } if (btf_type_kflag(v->t)) err = btf_type_ops(member_type)->check_kflag_member(env, v->t, member, member_type); else err = btf_type_ops(member_type)->check_member(env, v->t, member, member_type); if (err) return err; } env_stack_pop_resolved(env, 0, 0); return 0; } static void btf_struct_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } enum { BTF_FIELD_IGNORE = 0, BTF_FIELD_FOUND = 1, }; struct btf_field_info { enum btf_field_type type; u32 off; union { struct { u32 type_id; } kptr; struct { const char *node_name; u32 value_btf_id; } graph_root; }; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, u32 off, int sz, enum btf_field_type field_type, struct btf_field_info *info) { if (!__btf_type_is_struct(t)) return BTF_FIELD_IGNORE; if (t->size != sz) return BTF_FIELD_IGNORE; info->type = field_type; info->off = off; return BTF_FIELD_FOUND; } static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info, u32 field_mask) { enum btf_field_type type; u32 res_id; /* Permit modifiers on the pointer itself */ if (btf_type_is_volatile(t)) t = btf_type_by_id(btf, t->type); /* For PTR, sz is always == 8 */ if (!btf_type_is_ptr(t)) return BTF_FIELD_IGNORE; t = btf_type_by_id(btf, t->type); if (!btf_type_is_type_tag(t)) return BTF_FIELD_IGNORE; /* Reject extra tags */ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) return -EINVAL; if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_UNREF; else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_REF; else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_PERCPU; else if (!strcmp("uptr", __btf_name_by_offset(btf, t->name_off))) type = BPF_UPTR; else return -EINVAL; if (!(type & field_mask)) return BTF_FIELD_IGNORE; /* Get the base type */ t = btf_type_skip_modifiers(btf, t->type, &res_id); /* Only pointer to struct is allowed */ if (!__btf_type_is_struct(t)) return -EINVAL; info->type = type; info->off = off; info->kptr.type_id = res_id; return BTF_FIELD_FOUND; } int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt, int comp_idx, const char *tag_key, int last_id) { int len = strlen(tag_key); int i, n; for (i = last_id + 1, n = btf_nr_types(btf); i < n; i++) { const struct btf_type *t = btf_type_by_id(btf, i); if (!btf_type_is_decl_tag(t)) continue; if (pt != btf_type_by_id(btf, t->type)) continue; if (btf_type_decl_tag(t)->component_idx != comp_idx) continue; if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len)) continue; return i; } return -ENOENT; } const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, int comp_idx, const char *tag_key) { const char *value = NULL; const struct btf_type *t; int len, id; id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0); if (id < 0) return ERR_PTR(id); t = btf_type_by_id(btf, id); len = strlen(tag_key); value = __btf_name_by_offset(btf, t->name_off) + len; /* Prevent duplicate entries for same type */ id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, id); if (id >= 0) return ERR_PTR(-EEXIST); return value; } static int btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, const struct btf_type *t, int comp_idx, u32 off, int sz, struct btf_field_info *info, enum btf_field_type head_type) { const char *node_field_name; const char *value_type; s32 id; if (!__btf_type_is_struct(t)) return BTF_FIELD_IGNORE; if (t->size != sz) return BTF_FIELD_IGNORE; value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:"); if (IS_ERR(value_type)) return -EINVAL; node_field_name = strstr(value_type, ":"); if (!node_field_name) return -EINVAL; value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN); if (!value_type) return -ENOMEM; id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT); kfree(value_type); if (id < 0) return id; node_field_name++; if (str_is_empty(node_field_name)) return -EINVAL; info->type = head_type; info->off = off; info->graph_root.value_btf_id = id; info->graph_root.node_name = node_field_name; return BTF_FIELD_FOUND; } #define field_mask_test_name(field_type, field_type_str) \ if (field_mask & field_type && !strcmp(name, field_type_str)) { \ type = field_type; \ goto end; \ } static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type, u32 field_mask, u32 *seen_mask, int *align, int *sz) { int type = 0; const char *name = __btf_name_by_offset(btf, var_type->name_off); if (field_mask & BPF_SPIN_LOCK) { if (!strcmp(name, "bpf_spin_lock")) { if (*seen_mask & BPF_SPIN_LOCK) return -E2BIG; *seen_mask |= BPF_SPIN_LOCK; type = BPF_SPIN_LOCK; goto end; } } if (field_mask & BPF_TIMER) { if (!strcmp(name, "bpf_timer")) { if (*seen_mask & BPF_TIMER) return -E2BIG; *seen_mask |= BPF_TIMER; type = BPF_TIMER; goto end; } } if (field_mask & BPF_WORKQUEUE) { if (!strcmp(name, "bpf_wq")) { if (*seen_mask & BPF_WORKQUEUE) return -E2BIG; *seen_mask |= BPF_WORKQUEUE; type = BPF_WORKQUEUE; goto end; } } field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); field_mask_test_name(BPF_RB_NODE, "bpf_rb_node"); field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); /* Only return BPF_KPTR when all other types with matchable names fail */ if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) { type = BPF_KPTR_REF; goto end; } return 0; end: *sz = btf_field_type_size(type); *align = btf_field_type_align(type); return type; } #undef field_mask_test_name /* Repeat a number of fields for a specified number of times. * * Copy the fields starting from the first field and repeat them for * repeat_cnt times. The fields are repeated by adding the offset of each * field with * (i + 1) * elem_size * where i is the repeat index and elem_size is the size of an element. */ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, u32 field_cnt, u32 repeat_cnt, u32 elem_size) { u32 i, j; u32 cur; /* Ensure not repeating fields that should not be repeated. */ for (i = 0; i < field_cnt; i++) { switch (info[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: case BPF_LIST_HEAD: case BPF_RB_ROOT: break; default: return -EINVAL; } } /* The type of struct size or variable size is u32, * so the multiplication will not overflow. */ if (field_cnt * (repeat_cnt + 1) > info_cnt) return -E2BIG; cur = field_cnt; for (i = 0; i < repeat_cnt; i++) { memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0])); for (j = 0; j < field_cnt; j++) info[cur++].off += (i + 1) * elem_size; } return 0; } static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt, u32 level); /* Find special fields in the struct type of a field. * * This function is used to find fields of special types that is not a * global variable or a direct field of a struct type. It also handles the * repetition if it is the element type of an array. */ static int btf_find_nested_struct(const struct btf *btf, const struct btf_type *t, u32 off, u32 nelems, u32 field_mask, struct btf_field_info *info, int info_cnt, u32 level) { int ret, err, i; level++; if (level >= MAX_RESOLVE_DEPTH) return -E2BIG; ret = btf_find_struct_field(btf, t, field_mask, info, info_cnt, level); if (ret <= 0) return ret; /* Shift the offsets of the nested struct fields to the offsets * related to the container. */ for (i = 0; i < ret; i++) info[i].off += off; if (nelems > 1) { err = btf_repeat_fields(info, info_cnt, ret, nelems - 1, t->size); if (err == 0) ret *= nelems; else ret = err; } return ret; } static int btf_find_field_one(const struct btf *btf, const struct btf_type *var, const struct btf_type *var_type, int var_idx, u32 off, u32 expected_size, u32 field_mask, u32 *seen_mask, struct btf_field_info *info, int info_cnt, u32 level) { int ret, align, sz, field_type; struct btf_field_info tmp; const struct btf_array *array; u32 i, nelems = 1; /* Walk into array types to find the element type and the number of * elements in the (flattened) array. */ for (i = 0; i < MAX_RESOLVE_DEPTH && btf_type_is_array(var_type); i++) { array = btf_array(var_type); nelems *= array->nelems; var_type = btf_type_by_id(btf, array->type); } if (i == MAX_RESOLVE_DEPTH) return -E2BIG; if (nelems == 0) return 0; field_type = btf_get_field_type(btf, var_type, field_mask, seen_mask, &align, &sz); /* Look into variables of struct types */ if (!field_type && __btf_type_is_struct(var_type)) { sz = var_type->size; if (expected_size && expected_size != sz * nelems) return 0; ret = btf_find_nested_struct(btf, var_type, off, nelems, field_mask, &info[0], info_cnt, level); return ret; } if (field_type == 0) return 0; if (field_type < 0) return field_type; if (expected_size && expected_size != sz * nelems) return 0; if (off % align) return 0; switch (field_type) { case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_WORKQUEUE: case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: ret = btf_find_struct(btf, var_type, off, sz, field_type, info_cnt ? &info[0] : &tmp); if (ret < 0) return ret; break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: ret = btf_find_kptr(btf, var_type, off, sz, info_cnt ? &info[0] : &tmp, field_mask); if (ret < 0) return ret; break; case BPF_LIST_HEAD: case BPF_RB_ROOT: ret = btf_find_graph_root(btf, var, var_type, var_idx, off, sz, info_cnt ? &info[0] : &tmp, field_type); if (ret < 0) return ret; break; default: return -EFAULT; } if (ret == BTF_FIELD_IGNORE) return 0; if (!info_cnt) return -E2BIG; if (nelems > 1) { ret = btf_repeat_fields(info, info_cnt, 1, nelems - 1, sz); if (ret < 0) return ret; } return nelems; } static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt, u32 level) { int ret, idx = 0; const struct btf_member *member; u32 i, off, seen_mask = 0; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); off = __btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ return -EINVAL; off /= 8; ret = btf_find_field_one(btf, t, member_type, i, off, 0, field_mask, &seen_mask, &info[idx], info_cnt - idx, level); if (ret < 0) return ret; idx += ret; } return idx; } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt, u32 level) { int ret, idx = 0; const struct btf_var_secinfo *vsi; u32 i, off, seen_mask = 0; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); off = vsi->offset; ret = btf_find_field_one(btf, var, var_type, -1, off, vsi->size, field_mask, &seen_mask, &info[idx], info_cnt - idx, level); if (ret < 0) return ret; idx += ret; } return idx; } static int btf_find_field(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt) { if (__btf_type_is_struct(t)) return btf_find_struct_field(btf, t, field_mask, info, info_cnt, 0); else if (btf_type_is_datasec(t)) return btf_find_datasec_var(btf, t, field_mask, info, info_cnt, 0); return -EINVAL; } /* Callers have to ensure the life cycle of btf if it is program BTF */ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { struct module *mod = NULL; const struct btf_type *t; /* If a matching btf type is found in kernel or module BTFs, kptr_ref * is that BTF, otherwise it's program BTF */ struct btf *kptr_btf; int ret; s32 id; /* Find type in map BTF, and use it to look up the matching type * in vmlinux or module BTFs, by name and kind. */ t = btf_type_by_id(btf, info->kptr.type_id); id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), &kptr_btf); if (id == -ENOENT) { /* btf_parse_kptr should only be called w/ btf = program BTF */ WARN_ON_ONCE(btf_is_kernel(btf)); /* Type exists only in program BTF. Assume that it's a MEM_ALLOC * kptr allocated via bpf_obj_new */ field->kptr.dtor = NULL; id = info->kptr.type_id; kptr_btf = (struct btf *)btf; goto found_dtor; } if (id < 0) return id; /* Find and stash the function pointer for the destruction function that * needs to be eventually invoked from the map free path. */ if (info->type == BPF_KPTR_REF) { const struct btf_type *dtor_func; const char *dtor_func_name; unsigned long addr; s32 dtor_btf_id; /* This call also serves as a whitelist of allowed objects that * can be used as a referenced pointer and be stored in a map at * the same time. */ dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id); if (dtor_btf_id < 0) { ret = dtor_btf_id; goto end_btf; } dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id); if (!dtor_func) { ret = -ENOENT; goto end_btf; } if (btf_is_module(kptr_btf)) { mod = btf_try_get_module(kptr_btf); if (!mod) { ret = -ENXIO; goto end_btf; } } /* We already verified dtor_func to be btf_type_is_func * in register_btf_id_dtor_kfuncs. */ dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off); addr = kallsyms_lookup_name(dtor_func_name); if (!addr) { ret = -EINVAL; goto end_mod; } field->kptr.dtor = (void *)addr; } found_dtor: field->kptr.btf_id = id; field->kptr.btf = kptr_btf; field->kptr.module = mod; return 0; end_mod: module_put(mod); end_btf: btf_put(kptr_btf); return ret; } static int btf_parse_graph_root(const struct btf *btf, struct btf_field *field, struct btf_field_info *info, const char *node_type_name, size_t node_type_align) { const struct btf_type *t, *n = NULL; const struct btf_member *member; u32 offset; int i; t = btf_type_by_id(btf, info->graph_root.value_btf_id); /* We've already checked that value_btf_id is a struct type. We * just need to figure out the offset of the list_node, and * verify its type. */ for_each_member(i, t, member) { if (strcmp(info->graph_root.node_name, __btf_name_by_offset(btf, member->name_off))) continue; /* Invalid BTF, two members with same name */ if (n) return -EINVAL; n = btf_type_by_id(btf, member->type); if (!__btf_type_is_struct(n)) return -EINVAL; if (strcmp(node_type_name, __btf_name_by_offset(btf, n->name_off))) return -EINVAL; offset = __btf_member_bit_offset(n, member); if (offset % 8) return -EINVAL; offset /= 8; if (offset % node_type_align) return -EINVAL; field->graph_root.btf = (struct btf *)btf; field->graph_root.value_btf_id = info->graph_root.value_btf_id; field->graph_root.node_offset = offset; } if (!n) return -ENOENT; return 0; } static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { return btf_parse_graph_root(btf, field, info, "bpf_list_node", __alignof__(struct bpf_list_node)); } static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { return btf_parse_graph_root(btf, field, info, "bpf_rb_node", __alignof__(struct bpf_rb_node)); } static int btf_field_cmp(const void *_a, const void *_b, const void *priv) { const struct btf_field *a = (const struct btf_field *)_a; const struct btf_field *b = (const struct btf_field *)_b; if (a->offset < b->offset) return -1; else if (a->offset > b->offset) return 1; return 0; } struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size) { struct btf_field_info info_arr[BTF_FIELDS_MAX]; u32 next_off = 0, field_type_size; struct btf_record *rec; int ret, i, cnt; ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); if (ret < 0) return ERR_PTR(ret); if (!ret) return NULL; cnt = ret; /* This needs to be kzalloc to zero out padding and unused fields, see * comment in btf_record_equal. */ rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); rec->spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); if (info_arr[i].off + field_type_size > value_size) { WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size); ret = -EFAULT; goto end; } if (info_arr[i].off < next_off) { ret = -EEXIST; goto end; } next_off = info_arr[i].off + field_type_size; rec->field_mask |= info_arr[i].type; rec->fields[i].offset = info_arr[i].off; rec->fields[i].type = info_arr[i].type; rec->fields[i].size = field_type_size; switch (info_arr[i].type) { case BPF_SPIN_LOCK: WARN_ON_ONCE(rec->spin_lock_off >= 0); /* Cache offset for faster lookup at runtime */ rec->spin_lock_off = rec->fields[i].offset; break; case BPF_TIMER: WARN_ON_ONCE(rec->timer_off >= 0); /* Cache offset for faster lookup at runtime */ rec->timer_off = rec->fields[i].offset; break; case BPF_WORKQUEUE: WARN_ON_ONCE(rec->wq_off >= 0); /* Cache offset for faster lookup at runtime */ rec->wq_off = rec->fields[i].offset; break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ rec->refcount_off = rec->fields[i].offset; break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; break; case BPF_LIST_HEAD: ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; break; case BPF_RB_ROOT: ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; break; case BPF_LIST_NODE: case BPF_RB_NODE: break; default: ret = -EFAULT; goto end; } rec->cnt++; } /* bpf_{list_head, rb_node} require bpf_spin_lock */ if ((btf_record_has_field(rec, BPF_LIST_HEAD) || btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) { ret = -EINVAL; goto end; } if (rec->refcount_off < 0 && btf_record_has_field(rec, BPF_LIST_NODE) && btf_record_has_field(rec, BPF_RB_NODE)) { ret = -EINVAL; goto end; } sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp, NULL, rec); return rec; end: btf_record_free(rec); return ERR_PTR(ret); } int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) { int i; /* There are three types that signify ownership of some other type: * kptr_ref, bpf_list_head, bpf_rb_root. * kptr_ref only supports storing kernel types, which can't store * references to program allocated local types. * * Hence we only need to ensure that bpf_{list_head,rb_root} ownership * does not form cycles. */ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR))) return 0; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *meta; const struct btf_type *t; u32 btf_id; if (rec->fields[i].type == BPF_UPTR) { /* The uptr only supports pinning one page and cannot * point to a kernel struct */ if (btf_is_kernel(rec->fields[i].kptr.btf)) return -EINVAL; t = btf_type_by_id(rec->fields[i].kptr.btf, rec->fields[i].kptr.btf_id); if (!t->size) return -EINVAL; if (t->size > PAGE_SIZE) return -E2BIG; continue; } if (!(rec->fields[i].type & BPF_GRAPH_ROOT)) continue; btf_id = rec->fields[i].graph_root.value_btf_id; meta = btf_find_struct_meta(btf, btf_id); if (!meta) return -EFAULT; rec->fields[i].graph_root.value_rec = meta->record; /* We need to set value_rec for all root types, but no need * to check ownership cycle for a type unless it's also a * node type. */ if (!(rec->field_mask & BPF_GRAPH_NODE)) continue; /* We need to ensure ownership acyclicity among all types. The * proper way to do it would be to topologically sort all BTF * IDs based on the ownership edges, since there can be multiple * bpf_{list_head,rb_node} in a type. Instead, we use the * following resaoning: * * - A type can only be owned by another type in user BTF if it * has a bpf_{list,rb}_node. Let's call these node types. * - A type can only _own_ another type in user BTF if it has a * bpf_{list_head,rb_root}. Let's call these root types. * * We ensure that if a type is both a root and node, its * element types cannot be root types. * * To ensure acyclicity: * * When A is an root type but not a node, its ownership * chain can be: * A -> B -> C * Where: * - A is an root, e.g. has bpf_rb_root. * - B is both a root and node, e.g. has bpf_rb_node and * bpf_list_head. * - C is only an root, e.g. has bpf_list_node * * When A is both a root and node, some other type already * owns it in the BTF domain, hence it can not own * another root type through any of the ownership edges. * A -> B * Where: * - A is both an root and node. * - B is only an node. */ if (meta->record->field_mask & BPF_GRAPH_ROOT) return -ELOOP; } return 0; } static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_member *member; void *safe_data; u32 i; safe_data = btf_show_start_struct_type(show, t, type_id, data); if (!safe_data) return; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); const struct btf_kind_operations *ops; u32 member_offset, bitfield_size; u32 bytes_offset; u8 bits8_offset; btf_show_start_member(show, member); member_offset = __btf_member_bit_offset(t, member); bitfield_size = __btf_member_bitfield_size(t, member); bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { safe_data = btf_show_start_type(show, member_type, member->type, data + bytes_offset); if (safe_data) btf_bitfield_show(safe_data, bits8_offset, bitfield_size, show); btf_show_end_type(show); } else { ops = btf_type_ops(member_type); ops->show(btf, member_type, member->type, data + bytes_offset, bits8_offset, show); } btf_show_end_member(show); } btf_show_end_struct_type(show); } static void btf_struct_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_member *m = show->state.member; /* * First check if any members would be shown (are non-zero). * See comments above "struct btf_show" definition for more * details on how this works at a high-level. */ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) { if (!show->state.depth_check) { show->state.depth_check = show->state.depth + 1; show->state.depth_to_show = 0; } __btf_struct_show(btf, t, type_id, data, bits_offset, show); /* Restore saved member data here */ show->state.member = m; if (show->state.depth_check != show->state.depth + 1) return; show->state.depth_check = 0; if (show->state.depth_to_show <= show->state.depth) return; /* * Reaching here indicates we have recursed and found * non-zero child values. */ } __btf_struct_show(btf, t, type_id, data, bits_offset, show); } static const struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_struct_log, .show = btf_struct_show, }; static int btf_enum_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off = member->offset; u32 struct_size, bytes_offset; if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); if (struct_size - bytes_offset < member_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static int btf_enum_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off, nr_bits, bytes_end, struct_size; u32 int_bitsize = sizeof(int) * BITS_PER_BYTE; struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); if (!nr_bits) { if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } nr_bits = int_bitsize; } else if (nr_bits > int_bitsize) { btf_verifier_log_member(env, struct_type, member, "Invalid member bitfield_size"); return -EINVAL; } struct_size = struct_type->size; bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits); if (struct_size < bytes_end) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static s32 btf_enum_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_enum *enums = btf_type_enum(t); struct btf *btf = env->btf; const char *fmt_str; u16 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); meta_needed = nr_enums * sizeof(*enums); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (t->size > 8 || !is_power_of_2(t->size)) { btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; } /* enum type either no name or a valid one */ if (t->name_off && !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { if (!btf_name_offset_valid(btf, enums[i].name_off)) { btf_verifier_log(env, "\tInvalid name_offset:%u", enums[i].name_off); return -EINVAL; } /* enum member must have a valid name */ if (!enums[i].name_off || !btf_name_valid_identifier(btf, enums[i].name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (env->log.level == BPF_LOG_KERNEL) continue; fmt_str = btf_type_kflag(t) ? "\t%s val=%d\n" : "\t%s val=%u\n"; btf_verifier_log(env, fmt_str, __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } return meta_needed; } static void btf_enum_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } static void btf_enum_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_enum *enums = btf_type_enum(t); u32 i, nr_enums = btf_type_vlen(t); void *safe_data; int v; safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return; v = *(int *)safe_data; for (i = 0; i < nr_enums; i++) { if (v != enums[i].val) continue; btf_show_type_value(show, "%s", __btf_name_by_offset(btf, enums[i].name_off)); btf_show_end_type(show); return; } if (btf_type_kflag(t)) btf_show_type_value(show, "%d", v); else btf_show_type_value(show, "%u", v); btf_show_end_type(show); } static const struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, .check_kflag_member = btf_enum_check_kflag_member, .log_details = btf_enum_log, .show = btf_enum_show, }; static s32 btf_enum64_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_enum64 *enums = btf_type_enum64(t); struct btf *btf = env->btf; const char *fmt_str; u16 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); meta_needed = nr_enums * sizeof(*enums); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (t->size > 8 || !is_power_of_2(t->size)) { btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; } /* enum type either no name or a valid one */ if (t->name_off && !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { if (!btf_name_offset_valid(btf, enums[i].name_off)) { btf_verifier_log(env, "\tInvalid name_offset:%u", enums[i].name_off); return -EINVAL; } /* enum member must have a valid name */ if (!enums[i].name_off || !btf_name_valid_identifier(btf, enums[i].name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (env->log.level == BPF_LOG_KERNEL) continue; fmt_str = btf_type_kflag(t) ? "\t%s val=%lld\n" : "\t%s val=%llu\n"; btf_verifier_log(env, fmt_str, __btf_name_by_offset(btf, enums[i].name_off), btf_enum64_value(enums + i)); } return meta_needed; } static void btf_enum64_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_enum64 *enums = btf_type_enum64(t); u32 i, nr_enums = btf_type_vlen(t); void *safe_data; s64 v; safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return; v = *(u64 *)safe_data; for (i = 0; i < nr_enums; i++) { if (v != btf_enum64_value(enums + i)) continue; btf_show_type_value(show, "%s", __btf_name_by_offset(btf, enums[i].name_off)); btf_show_end_type(show); return; } if (btf_type_kflag(t)) btf_show_type_value(show, "%lld", v); else btf_show_type_value(show, "%llu", v); btf_show_end_type(show); } static const struct btf_kind_operations enum64_ops = { .check_meta = btf_enum64_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, .check_kflag_member = btf_enum_check_kflag_member, .log_details = btf_enum_log, .show = btf_enum64_show, }; static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (t->name_off) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static void btf_func_proto_log(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_param *args = (const struct btf_param *)(t + 1); u16 nr_args = btf_type_vlen(t), i; btf_verifier_log(env, "return=%u args=(", t->type); if (!nr_args) { btf_verifier_log(env, "void"); goto done; } if (nr_args == 1 && !args[0].type) { /* Only one vararg */ btf_verifier_log(env, "vararg"); goto done; } btf_verifier_log(env, "%u %s", args[0].type, __btf_name_by_offset(env->btf, args[0].name_off)); for (i = 1; i < nr_args - 1; i++) btf_verifier_log(env, ", %u %s", args[i].type, __btf_name_by_offset(env->btf, args[i].name_off)); if (nr_args > 1) { const struct btf_param *last_arg = &args[nr_args - 1]; if (last_arg->type) btf_verifier_log(env, ", %u %s", last_arg->type, __btf_name_by_offset(env->btf, last_arg->name_off)); else btf_verifier_log(env, ", vararg"); } done: btf_verifier_log(env, ")"); } static const struct btf_kind_operations func_proto_ops = { .check_meta = btf_func_proto_check_meta, .resolve = btf_df_resolve, /* * BTF_KIND_FUNC_PROTO cannot be directly referred by * a struct's member. * * It should be a function pointer instead. * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO) * * Hence, there is no btf_func_check_member(). */ .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_func_proto_log, .show = btf_df_show, }; static s32 btf_func_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { if (!t->name_off || !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) { btf_verifier_log_type(env, t, "Invalid func linkage"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return 0; } static int btf_func_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *t = v->t; u32 next_type_id = t->type; int err; err = btf_func_check(env, t); if (err) return err; env_stack_pop_resolved(env, next_type_id, 0); return 0; } static const struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, .resolve = btf_func_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, .show = btf_df_show, }; static s32 btf_var_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_var *var; u32 meta_needed = sizeof(*var); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (!t->name_off || !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } /* A var cannot be in type void */ if (!t->type || !BTF_TYPE_ID_VALID(t->type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; } var = btf_type_var(t); if (var->linkage != BTF_VAR_STATIC && var->linkage != BTF_VAR_GLOBAL_ALLOCATED) { btf_verifier_log_type(env, t, "Linkage not supported"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_var *var = btf_type_var(t); btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage); } static const struct btf_kind_operations var_ops = { .check_meta = btf_var_check_meta, .resolve = btf_var_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_var_log, .show = btf_var_show, }; static s32 btf_datasec_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_var_secinfo *vsi; u64 last_vsi_end_off = 0, sum = 0; u32 i, meta_needed; meta_needed = btf_type_vlen(t) * sizeof(*vsi); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (!t->size) { btf_verifier_log_type(env, t, "size == 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (!t->name_off || !btf_name_valid_section(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); for_each_vsi(i, t, vsi) { /* A var cannot be in type void */ if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) { btf_verifier_log_vsi(env, t, vsi, "Invalid type_id"); return -EINVAL; } if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) { btf_verifier_log_vsi(env, t, vsi, "Invalid offset"); return -EINVAL; } if (!vsi->size || vsi->size > t->size) { btf_verifier_log_vsi(env, t, vsi, "Invalid size"); return -EINVAL; } last_vsi_end_off = vsi->offset + vsi->size; if (last_vsi_end_off > t->size) { btf_verifier_log_vsi(env, t, vsi, "Invalid offset+size"); return -EINVAL; } btf_verifier_log_vsi(env, t, vsi, NULL); sum += vsi->size; } if (t->size < sum) { btf_verifier_log_type(env, t, "Invalid btf_info size"); return -EINVAL; } return meta_needed; } static int btf_datasec_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_var_secinfo *vsi; struct btf *btf = env->btf; u16 i; env->resolve_mode = RESOLVE_TBD; for_each_vsi_from(i, v->next_member, v->t, vsi) { u32 var_type_id = vsi->type, type_id, type_size = 0; const struct btf_type *var_type = btf_type_by_id(env->btf, var_type_id); if (!var_type || !btf_type_is_var(var_type)) { btf_verifier_log_vsi(env, v->t, vsi, "Not a VAR kind member"); return -EINVAL; } if (!env_type_is_resolve_sink(env, var_type) && !env_type_is_resolved(env, var_type_id)) { env_stack_set_next_member(env, i + 1); return env_stack_push(env, var_type, var_type_id); } type_id = var_type->type; if (!btf_type_id_size(btf, &type_id, &type_size)) { btf_verifier_log_vsi(env, v->t, vsi, "Invalid type"); return -EINVAL; } if (vsi->size < type_size) { btf_verifier_log_vsi(env, v->t, vsi, "Invalid size"); return -EINVAL; } } env_stack_pop_resolved(env, 0, 0); return 0; } static void btf_datasec_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } static void btf_datasec_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_var_secinfo *vsi; const struct btf_type *var; u32 i; if (!btf_show_start_type(show, t, type_id, data)) return; btf_show_type_value(show, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); for_each_vsi(i, t, vsi) { var = btf_type_by_id(btf, vsi->type); if (i) btf_show(show, ","); btf_type_ops(var)->show(btf, var, vsi->type, data + vsi->offset, bits_offset, show); } btf_show_end_type(show); } static const struct btf_kind_operations datasec_ops = { .check_meta = btf_datasec_check_meta, .resolve = btf_datasec_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_datasec_log, .show = btf_datasec_show, }; static s32 btf_float_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 && t->size != 16) { btf_verifier_log_type(env, t, "Invalid type_size"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return 0; } static int btf_float_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u64 start_offset_bytes; u64 end_offset_bytes; u64 misalign_bits; u64 align_bytes; u64 align_bits; /* Different architectures have different alignment requirements, so * here we check only for the reasonable minimum. This way we ensure * that types after CO-RE can pass the kernel BTF verifier. */ align_bytes = min_t(u64, sizeof(void *), member_type->size); align_bits = align_bytes * BITS_PER_BYTE; div64_u64_rem(member->offset, align_bits, &misalign_bits); if (misalign_bits) { btf_verifier_log_member(env, struct_type, member, "Member is not properly aligned"); return -EINVAL; } start_offset_bytes = member->offset / BITS_PER_BYTE; end_offset_bytes = start_offset_bytes + member_type->size; if (end_offset_bytes > struct_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static void btf_float_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "size=%u", t->size); } static const struct btf_kind_operations float_ops = { .check_meta = btf_float_check_meta, .resolve = btf_df_resolve, .check_member = btf_float_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_float_log, .show = btf_df_show, }; static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_decl_tag *tag; u32 meta_needed = sizeof(*tag); s32 component_idx; const char *value; if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } value = btf_name_by_offset(env->btf, t->name_off); if (!value || !value[0]) { btf_verifier_log_type(env, t, "Invalid value"); return -EINVAL; } if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } component_idx = btf_type_decl_tag(t)->component_idx; if (component_idx < -1) { btf_verifier_log_type(env, t, "Invalid component_idx"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static int btf_decl_tag_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *next_type; const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; s32 component_idx; u32 vlen; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || !btf_type_is_decl_tag_target(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); component_idx = btf_type_decl_tag(t)->component_idx; if (component_idx != -1) { if (btf_type_is_var(next_type) || btf_type_is_typedef(next_type)) { btf_verifier_log_type(env, v->t, "Invalid component_idx"); return -EINVAL; } if (btf_type_is_struct(next_type)) { vlen = btf_type_vlen(next_type); } else { /* next_type should be a function */ next_type = btf_type_by_id(btf, next_type->type); vlen = btf_type_vlen(next_type); } if ((u32)component_idx >= vlen) { btf_verifier_log_type(env, v->t, "Invalid component_idx"); return -EINVAL; } } env_stack_pop_resolved(env, next_type_id, 0); return 0; } static void btf_decl_tag_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "type=%u component_idx=%d", t->type, btf_type_decl_tag(t)->component_idx); } static const struct btf_kind_operations decl_tag_ops = { .check_meta = btf_decl_tag_check_meta, .resolve = btf_decl_tag_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_decl_tag_log, .show = btf_df_show, }; static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_type *ret_type; const struct btf_param *args; const struct btf *btf; u16 nr_args, i; int err; btf = env->btf; args = (const struct btf_param *)(t + 1); nr_args = btf_type_vlen(t); /* Check func return type which could be "void" (t->type == 0) */ if (t->type) { u32 ret_type_id = t->type; ret_type = btf_type_by_id(btf, ret_type_id); if (!ret_type) { btf_verifier_log_type(env, t, "Invalid return type"); return -EINVAL; } if (btf_type_is_resolve_source_only(ret_type)) { btf_verifier_log_type(env, t, "Invalid return type"); return -EINVAL; } if (btf_type_needs_resolve(ret_type) && !env_type_is_resolved(env, ret_type_id)) { err = btf_resolve(env, ret_type, ret_type_id); if (err) return err; } /* Ensure the return type is a type that has a size */ if (!btf_type_id_size(btf, &ret_type_id, NULL)) { btf_verifier_log_type(env, t, "Invalid return type"); return -EINVAL; } } if (!nr_args) return 0; /* Last func arg type_id could be 0 if it is a vararg */ if (!args[nr_args - 1].type) { if (args[nr_args - 1].name_off) { btf_verifier_log_type(env, t, "Invalid arg#%u", nr_args); return -EINVAL; } nr_args--; } for (i = 0; i < nr_args; i++) { const struct btf_type *arg_type; u32 arg_type_id; arg_type_id = args[i].type; arg_type = btf_type_by_id(btf, arg_type_id); if (!arg_type) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } if (btf_type_is_resolve_source_only(arg_type)) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } if (args[i].name_off && (!btf_name_offset_valid(btf, args[i].name_off) || !btf_name_valid_identifier(btf, args[i].name_off))) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } if (btf_type_needs_resolve(arg_type) && !env_type_is_resolved(env, arg_type_id)) { err = btf_resolve(env, arg_type, arg_type_id); if (err) return err; } if (!btf_type_id_size(btf, &arg_type_id, NULL)) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } } return 0; } static int btf_func_check(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_type *proto_type; const struct btf_param *args; const struct btf *btf; u16 nr_args, i; btf = env->btf; proto_type = btf_type_by_id(btf, t->type); if (!proto_type || !btf_type_is_func_proto(proto_type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; } args = (const struct btf_param *)(proto_type + 1); nr_args = btf_type_vlen(proto_type); for (i = 0; i < nr_args; i++) { if (!args[i].name_off && args[i].type) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } } return 0; } static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_INT] = &int_ops, [BTF_KIND_PTR] = &ptr_ops, [BTF_KIND_ARRAY] = &array_ops, [BTF_KIND_STRUCT] = &struct_ops, [BTF_KIND_UNION] = &struct_ops, [BTF_KIND_ENUM] = &enum_ops, [BTF_KIND_FWD] = &fwd_ops, [BTF_KIND_TYPEDEF] = &modifier_ops, [BTF_KIND_VOLATILE] = &modifier_ops, [BTF_KIND_CONST] = &modifier_ops, [BTF_KIND_RESTRICT] = &modifier_ops, [BTF_KIND_FUNC] = &func_ops, [BTF_KIND_FUNC_PROTO] = &func_proto_ops, [BTF_KIND_VAR] = &var_ops, [BTF_KIND_DATASEC] = &datasec_ops, [BTF_KIND_FLOAT] = &float_ops, [BTF_KIND_DECL_TAG] = &decl_tag_ops, [BTF_KIND_TYPE_TAG] = &modifier_ops, [BTF_KIND_ENUM64] = &enum64_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { u32 saved_meta_left = meta_left; s32 var_meta_size; if (meta_left < sizeof(*t)) { btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu", env->log_type_id, meta_left, sizeof(*t)); return -EINVAL; } meta_left -= sizeof(*t); if (t->info & ~BTF_INFO_MASK) { btf_verifier_log(env, "[%u] Invalid btf_info:%x", env->log_type_id, t->info); return -EINVAL; } if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { btf_verifier_log(env, "[%u] Invalid kind:%u", env->log_type_id, BTF_INFO_KIND(t->info)); return -EINVAL; } if (!btf_name_offset_valid(env->btf, t->name_off)) { btf_verifier_log(env, "[%u] Invalid name_offset:%u", env->log_type_id, t->name_off); return -EINVAL; } var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left); if (var_meta_size < 0) return var_meta_size; meta_left -= var_meta_size; return saved_meta_left - meta_left; } static int btf_check_all_metas(struct btf_verifier_env *env) { struct btf *btf = env->btf; struct btf_header *hdr; void *cur, *end; hdr = &btf->hdr; cur = btf->nohdr_data + hdr->type_off; end = cur + hdr->type_len; env->log_type_id = btf->base_btf ? btf->start_id : 1; while (cur < end) { struct btf_type *t = cur; s32 meta_size; meta_size = btf_check_meta(env, t, end - cur); if (meta_size < 0) return meta_size; btf_add_type(env, t); cur += meta_size; env->log_type_id++; } return 0; } static bool btf_resolve_valid(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) { struct btf *btf = env->btf; if (!env_type_is_resolved(env, type_id)) return false; if (btf_type_is_struct(t) || btf_type_is_datasec(t)) return !btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); if (btf_type_is_decl_tag(t) || btf_type_is_func(t)) return btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || btf_type_is_var(t)) { t = btf_type_id_resolve(btf, &type_id); return t && !btf_type_is_modifier(t) && !btf_type_is_var(t) && !btf_type_is_datasec(t); } if (btf_type_is_array(t)) { const struct btf_array *array = btf_type_array(t); const struct btf_type *elem_type; u32 elem_type_id = array->type; u32 elem_size; elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); return elem_type && !btf_type_is_modifier(elem_type) && (array->nelems * elem_size == btf_resolved_type_size(btf, type_id)); } return false; } static int btf_resolve(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) { u32 save_log_type_id = env->log_type_id; const struct resolve_vertex *v; int err = 0; env->resolve_mode = RESOLVE_TBD; env_stack_push(env, t, type_id); while (!err && (v = env_stack_peak(env))) { env->log_type_id = v->type_id; err = btf_type_ops(v->t)->resolve(env, v); } env->log_type_id = type_id; if (err == -E2BIG) { btf_verifier_log_type(env, t, "Exceeded max resolving depth:%u", MAX_RESOLVE_DEPTH); } else if (err == -EEXIST) { btf_verifier_log_type(env, t, "Loop detected"); } /* Final sanity check */ if (!err && !btf_resolve_valid(env, t, type_id)) { btf_verifier_log_type(env, t, "Invalid resolve state"); err = -EINVAL; } env->log_type_id = save_log_type_id; return err; } static int btf_check_all_types(struct btf_verifier_env *env) { struct btf *btf = env->btf; const struct btf_type *t; u32 type_id, i; int err; err = env_resolve_init(env); if (err) return err; env->phase++; for (i = btf->base_btf ? 0 : 1; i < btf->nr_types; i++) { type_id = btf->start_id + i; t = btf_type_by_id(btf, type_id); env->log_type_id = type_id; if (btf_type_needs_resolve(t) && !env_type_is_resolved(env, type_id)) { err = btf_resolve(env, t, type_id); if (err) return err; } if (btf_type_is_func_proto(t)) { err = btf_func_proto_check(env, t); if (err) return err; } } return 0; } static int btf_parse_type_sec(struct btf_verifier_env *env) { const struct btf_header *hdr = &env->btf->hdr; int err; /* Type section must align to 4 bytes */ if (hdr->type_off & (sizeof(u32) - 1)) { btf_verifier_log(env, "Unaligned type_off"); return -EINVAL; } if (!env->btf->base_btf && !hdr->type_len) { btf_verifier_log(env, "No type found"); return -EINVAL; } err = btf_check_all_metas(env); if (err) return err; return btf_check_all_types(env); } static int btf_parse_str_sec(struct btf_verifier_env *env) { const struct btf_header *hdr; struct btf *btf = env->btf; const char *start, *end; hdr = &btf->hdr; start = btf->nohdr_data + hdr->str_off; end = start + hdr->str_len; if (end != btf->data + btf->data_size) { btf_verifier_log(env, "String section is not at the end"); return -EINVAL; } btf->strings = start; if (btf->base_btf && !hdr->str_len) return 0; if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || end[-1]) { btf_verifier_log(env, "Invalid string section"); return -EINVAL; } if (!btf->base_btf && start[0]) { btf_verifier_log(env, "Invalid string section"); return -EINVAL; } return 0; } static const size_t btf_sec_info_offset[] = { offsetof(struct btf_header, type_off), offsetof(struct btf_header, str_off), }; static int btf_sec_info_cmp(const void *a, const void *b) { const struct btf_sec_info *x = a; const struct btf_sec_info *y = b; return (int)(x->off - y->off) ? : (int)(x->len - y->len); } static int btf_check_sec_info(struct btf_verifier_env *env, u32 btf_data_size) { struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)]; u32 total, expected_total, i; const struct btf_header *hdr; const struct btf *btf; btf = env->btf; hdr = &btf->hdr; /* Populate the secs from hdr */ for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) secs[i] = *(struct btf_sec_info *)((void *)hdr + btf_sec_info_offset[i]); sort(secs, ARRAY_SIZE(btf_sec_info_offset), sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL); /* Check for gaps and overlap among sections */ total = 0; expected_total = btf_data_size - hdr->hdr_len; for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) { if (expected_total < secs[i].off) { btf_verifier_log(env, "Invalid section offset"); return -EINVAL; } if (total < secs[i].off) { /* gap */ btf_verifier_log(env, "Unsupported section found"); return -EINVAL; } if (total > secs[i].off) { btf_verifier_log(env, "Section overlap found"); return -EINVAL; } if (expected_total - total < secs[i].len) { btf_verifier_log(env, "Total section length too long"); return -EINVAL; } total += secs[i].len; } /* There is data other than hdr and known sections */ if (expected_total != total) { btf_verifier_log(env, "Unsupported section found"); return -EINVAL; } return 0; } static int btf_parse_hdr(struct btf_verifier_env *env) { u32 hdr_len, hdr_copy, btf_data_size; const struct btf_header *hdr; struct btf *btf; btf = env->btf; btf_data_size = btf->data_size; if (btf_data_size < offsetofend(struct btf_header, hdr_len)) { btf_verifier_log(env, "hdr_len not found"); return -EINVAL; } hdr = btf->data; hdr_len = hdr->hdr_len; if (btf_data_size < hdr_len) { btf_verifier_log(env, "btf_header not found"); return -EINVAL; } /* Ensure the unsupported header fields are zero */ if (hdr_len > sizeof(btf->hdr)) { u8 *expected_zero = btf->data + sizeof(btf->hdr); u8 *end = btf->data + hdr_len; for (; expected_zero < end; expected_zero++) { if (*expected_zero) { btf_verifier_log(env, "Unsupported btf_header"); return -E2BIG; } } } hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); memcpy(&btf->hdr, btf->data, hdr_copy); hdr = &btf->hdr; btf_verifier_log_hdr(env, btf_data_size); if (hdr->magic != BTF_MAGIC) { btf_verifier_log(env, "Invalid magic"); return -EINVAL; } if (hdr->version != BTF_VERSION) { btf_verifier_log(env, "Unsupported version"); return -ENOTSUPP; } if (hdr->flags) { btf_verifier_log(env, "Unsupported flags"); return -ENOTSUPP; } if (!btf->base_btf && btf_data_size == hdr->hdr_len) { btf_verifier_log(env, "No data"); return -EINVAL; } return btf_check_sec_info(env, btf_data_size); } static const char *alloc_obj_fields[] = { "bpf_spin_lock", "bpf_list_head", "bpf_list_node", "bpf_rb_root", "bpf_rb_node", "bpf_refcount", }; static struct btf_struct_metas * btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) { struct btf_struct_metas *tab = NULL; struct btf_id_set *aof; int i, n, id, ret; BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0); BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32)); aof = kmalloc(sizeof(*aof), GFP_KERNEL | __GFP_NOWARN); if (!aof) return ERR_PTR(-ENOMEM); aof->cnt = 0; for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) { /* Try to find whether this special type exists in user BTF, and * if so remember its ID so we can easily find it among members * of structs that we iterate in the next loop. */ struct btf_id_set *new_aof; id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT); if (id < 0) continue; new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; goto free_aof; } aof = new_aof; aof->ids[aof->cnt++] = id; } n = btf_nr_types(btf); for (i = 1; i < n; i++) { /* Try to find if there are kptrs in user BTF and remember their ID */ struct btf_id_set *new_aof; struct btf_field_info tmp; const struct btf_type *t; t = btf_type_by_id(btf, i); if (!t) { ret = -EINVAL; goto free_aof; } ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR); if (ret != BTF_FIELD_FOUND) continue; new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; goto free_aof; } aof = new_aof; aof->ids[aof->cnt++] = i; } if (!aof->cnt) { kfree(aof); return NULL; } sort(&aof->ids, aof->cnt, sizeof(aof->ids[0]), btf_id_cmp_func, NULL); for (i = 1; i < n; i++) { struct btf_struct_metas *new_tab; const struct btf_member *member; struct btf_struct_meta *type; struct btf_record *record; const struct btf_type *t; int j, tab_cnt; t = btf_type_by_id(btf, i); if (!__btf_type_is_struct(t)) continue; cond_resched(); for_each_member(j, t, member) { if (btf_id_set_contains(aof, member->type)) goto parse; } continue; parse: tab_cnt = tab ? tab->cnt : 0; new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]), GFP_KERNEL | __GFP_NOWARN); if (!new_tab) { ret = -ENOMEM; goto free; } if (!tab) new_tab->cnt = 0; tab = new_tab; type = &tab->types[tab->cnt]; type->btf_id = i; record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT | BPF_KPTR, t->size); /* The record cannot be unset, treat it as an error if so */ if (IS_ERR_OR_NULL(record)) { ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT; goto free; } type->record = record; tab->cnt++; } kfree(aof); return tab; free: btf_struct_metas_free(tab); free_aof: kfree(aof); return ERR_PTR(ret); } struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id) { struct btf_struct_metas *tab; BUILD_BUG_ON(offsetof(struct btf_struct_meta, btf_id) != 0); tab = btf->struct_meta_tab; if (!tab) return NULL; return bsearch(&btf_id, tab->types, tab->cnt, sizeof(tab->types[0]), btf_id_cmp_func); } static int btf_check_type_tags(struct btf_verifier_env *env, struct btf *btf, int start_id) { int i, n, good_id = start_id - 1; bool in_tags; n = btf_nr_types(btf); for (i = start_id; i < n; i++) { const struct btf_type *t; int chain_limit = 32; u32 cur_id = i; t = btf_type_by_id(btf, i); if (!t) return -EINVAL; if (!btf_type_is_modifier(t)) continue; cond_resched(); in_tags = btf_type_is_type_tag(t); while (btf_type_is_modifier(t)) { if (!chain_limit--) { btf_verifier_log(env, "Max chain length or cycle detected"); return -ELOOP; } if (btf_type_is_type_tag(t)) { if (!in_tags) { btf_verifier_log(env, "Type tags don't precede modifiers"); return -EINVAL; } } else if (in_tags) { in_tags = false; } if (cur_id <= good_id) break; /* Move to next type */ cur_id = t->type; t = btf_type_by_id(btf, cur_id); if (!t) return -EINVAL; } good_id = i; } return 0; } static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size) { u32 log_true_size; int err; err = bpf_vlog_finalize(log, &log_true_size); if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) && copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size), &log_true_size, sizeof(log_true_size))) err = -EFAULT; return err; } static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel); char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf); struct btf_struct_metas *struct_meta_tab; struct btf_verifier_env *env = NULL; struct btf *btf = NULL; u8 *data; int err, ret; if (attr->btf_size > BTF_MAX_SIZE) return ERR_PTR(-E2BIG); env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); /* user could have requested verbose verifier output * and supplied buffer to store the verification trace */ err = bpf_vlog_init(&env->log, attr->btf_log_level, log_ubuf, attr->btf_log_size); if (err) goto errout_free; btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); if (!btf) { err = -ENOMEM; goto errout; } env->btf = btf; data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { err = -ENOMEM; goto errout; } btf->data = data; btf->data_size = attr->btf_size; if (copy_from_bpfptr(data, btf_data, attr->btf_size)) { err = -EFAULT; goto errout; } err = btf_parse_hdr(env); if (err) goto errout; btf->nohdr_data = btf->data + btf->hdr.hdr_len; err = btf_parse_str_sec(env); if (err) goto errout; err = btf_parse_type_sec(env); if (err) goto errout; err = btf_check_type_tags(env, btf, 1); if (err) goto errout; struct_meta_tab = btf_parse_struct_metas(&env->log, btf); if (IS_ERR(struct_meta_tab)) { err = PTR_ERR(struct_meta_tab); goto errout; } btf->struct_meta_tab = struct_meta_tab; if (struct_meta_tab) { int i; for (i = 0; i < struct_meta_tab->cnt; i++) { err = btf_check_and_fixup_fields(btf, struct_meta_tab->types[i].record); if (err < 0) goto errout_meta; } } err = finalize_log(&env->log, uattr, uattr_size); if (err) goto errout_free; btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; errout_meta: btf_free_struct_meta_tab(btf); errout: /* overwrite err with -ENOSPC or -EFAULT */ ret = finalize_log(&env->log, uattr, uattr_size); if (ret) err = ret; errout_free: btf_verifier_env_free(env); if (btf) btf_free(btf); return ERR_PTR(err); } extern char __start_BTF[]; extern char __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) #define BPF_LINK_TYPE(_id, _name) static union { struct bpf_ctx_convert { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ prog_ctx_type _id##_prog; \ kern_ctx_type _id##_kern; #include <linux/bpf_types.h> #undef BPF_PROG_TYPE } *__t; /* 't' is written once under lock. Read many times. */ const struct btf_type *t; } bpf_ctx_convert; enum { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ __ctx_convert##_id, #include <linux/bpf_types.h> #undef BPF_PROG_TYPE __ctx_convert_unused, /* to avoid empty enum in extreme .config */ }; static u8 bpf_ctx_convert_map[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = __ctx_convert##_id, #include <linux/bpf_types.h> #undef BPF_PROG_TYPE 0, /* avoid empty array */ }; #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type) { const struct btf_type *conv_struct; const struct btf_member *ctx_type; conv_struct = bpf_ctx_convert.t; if (!conv_struct) return NULL; /* prog_type is valid bpf program type. No need for bounds check. */ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; /* ctx_type is a pointer to prog_ctx_type in vmlinux. * Like 'struct __sk_buff' */ return btf_type_by_id(btf_vmlinux, ctx_type->type); } static int find_kern_ctx_type_id(enum bpf_prog_type prog_type) { const struct btf_type *conv_struct; const struct btf_member *ctx_type; conv_struct = bpf_ctx_convert.t; if (!conv_struct) return -EFAULT; /* prog_type is valid bpf program type. No need for bounds check. */ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1; /* ctx_type is a pointer to prog_ctx_type in vmlinux. * Like 'struct sk_buff' */ return ctx_type->type; } bool btf_is_projection_of(const char *pname, const char *tname) { if (strcmp(pname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0) return true; if (strcmp(pname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0) return true; return false; } bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) { const struct btf_type *ctx_type; const char *tname, *ctx_tname; t = btf_type_by_id(btf, t->type); /* KPROBE programs allow bpf_user_pt_regs_t typedef, which we need to * check before we skip all the typedef below. */ if (prog_type == BPF_PROG_TYPE_KPROBE) { while (btf_type_is_modifier(t) && !btf_type_is_typedef(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_typedef(t)) { tname = btf_name_by_offset(btf, t->name_off); if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0) return true; } } while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!btf_type_is_struct(t)) { /* Only pointer to struct is supported for now. * That means that BPF_PROG_TYPE_TRACEPOINT with BTF * is not supported yet. * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. */ return false; } tname = btf_name_by_offset(btf, t->name_off); if (!tname) { bpf_log(log, "arg#%d struct doesn't have a name\n", arg); return false; } ctx_type = find_canonical_prog_ctx_type(prog_type); if (!ctx_type) { bpf_log(log, "btf_vmlinux is malformed\n"); /* should not happen */ return false; } again: ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); if (!ctx_tname) { /* should not happen */ bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n"); return false; } /* program types without named context types work only with arg:ctx tag */ if (ctx_tname[0] == '\0') return false; /* only compare that prog's ctx type name is the same as * kernel expects. No need to compare field by field. * It's ok for bpf prog to do: * struct __sk_buff {}; * int socket_filter_bpf_prog(struct __sk_buff *skb) * { // no fields of skb are ever used } */ if (btf_is_projection_of(ctx_tname, tname)) return true; if (strcmp(ctx_tname, tname)) { /* bpf_user_pt_regs_t is a typedef, so resolve it to * underlying struct and check name again */ if (!btf_type_is_modifier(ctx_type)) return false; while (btf_type_is_modifier(ctx_type)) ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); goto again; } return true; } /* forward declarations for arch-specific underlying types of * bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef * compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still * works correctly with __builtin_types_compatible_p() on respective * architectures */ struct user_regs_struct; struct user_pt_regs; static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int arg, enum bpf_prog_type prog_type, enum bpf_attach_type attach_type) { const struct btf_type *ctx_type; const char *tname, *ctx_tname; if (!btf_is_ptr(t)) { bpf_log(log, "arg#%d type isn't a pointer\n", arg); return -EINVAL; } t = btf_type_by_id(btf, t->type); /* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */ if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) { while (btf_type_is_modifier(t) && !btf_type_is_typedef(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_typedef(t)) { tname = btf_name_by_offset(btf, t->name_off); if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0) return 0; } } /* all other program types don't use typedefs for context type */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); /* `void *ctx __arg_ctx` is always valid */ if (btf_type_is_void(t)) return 0; tname = btf_name_by_offset(btf, t->name_off); if (str_is_empty(tname)) { bpf_log(log, "arg#%d type doesn't have a name\n", arg); return -EINVAL; } /* special cases */ switch (prog_type) { case BPF_PROG_TYPE_KPROBE: if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) return 0; break; case BPF_PROG_TYPE_PERF_EVENT: if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) && __btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) return 0; if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) && __btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0) return 0; if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) && __btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0) return 0; break; case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; break; case BPF_PROG_TYPE_TRACING: switch (attach_type) { case BPF_TRACE_RAW_TP: /* tp_btf program is TRACING, so need special case here */ if (__btf_type_is_struct(t) && strcmp(tname, "bpf_raw_tracepoint_args") == 0) return 0; /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; break; case BPF_TRACE_ITER: /* allow struct bpf_iter__xxx types only */ if (__btf_type_is_struct(t) && strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0) return 0; break; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; break; default: break; } break; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; break; case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: return 0; /* anything goes */ default: break; } ctx_type = find_canonical_prog_ctx_type(prog_type); if (!ctx_type) { /* should not happen */ bpf_log(log, "btf_vmlinux is malformed\n"); return -EINVAL; } /* resolve typedefs and check that underlying structs are matching as well */ while (btf_type_is_modifier(ctx_type)) ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); /* if program type doesn't have distinctly named struct type for * context, then __arg_ctx argument can only be `void *`, which we * already checked above */ if (!__btf_type_is_struct(ctx_type)) { bpf_log(log, "arg#%d should be void pointer\n", arg); return -EINVAL; } ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) { bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname); return -EINVAL; } return 0; } static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) { if (!btf_is_prog_ctx_type(log, btf, t, prog_type, arg)) return -ENOENT; return find_kern_ctx_type_id(prog_type); } int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type) { const struct btf_member *kctx_member; const struct btf_type *conv_struct; const struct btf_type *kctx_type; u32 kctx_type_id; conv_struct = bpf_ctx_convert.t; /* get member for kernel ctx type */ kctx_member = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1; kctx_type_id = kctx_member->type; kctx_type = btf_type_by_id(btf_vmlinux, kctx_type_id); if (!btf_type_is_struct(kctx_type)) { bpf_log(log, "kern ctx type id %u is not a struct\n", kctx_type_id); return -EINVAL; } return kctx_type_id; } BTF_ID_LIST(bpf_ctx_convert_btf_id) BTF_ID(struct, bpf_ctx_convert) static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name, void *data, unsigned int data_size) { struct btf *btf = NULL; int err; if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) return ERR_PTR(-ENOENT); btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); if (!btf) { err = -ENOMEM; goto errout; } env->btf = btf; btf->data = data; btf->data_size = data_size; btf->kernel_btf = true; snprintf(btf->name, sizeof(btf->name), "%s", name); err = btf_parse_hdr(env); if (err) goto errout; btf->nohdr_data = btf->data + btf->hdr.hdr_len; err = btf_parse_str_sec(env); if (err) goto errout; err = btf_check_all_metas(env); if (err) goto errout; err = btf_check_type_tags(env, btf, 1); if (err) goto errout; refcount_set(&btf->refcnt, 1); return btf; errout: if (btf) { kvfree(btf->types); kfree(btf); } return ERR_PTR(err); } struct btf *btf_parse_vmlinux(void) { struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf; int err; env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); log = &env->log; log->level = BPF_LOG_KERNEL; btf = btf_parse_base(env, "vmlinux", __start_BTF, __stop_BTF - __start_BTF); if (IS_ERR(btf)) goto err_out; /* btf_parse_vmlinux() runs under bpf_verifier_lock */ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); err = btf_alloc_id(btf); if (err) { btf_free(btf); btf = ERR_PTR(err); } err_out: btf_verifier_env_free(env); return btf; } /* If .BTF_ids section was created with distilled base BTF, both base and * split BTF ids will need to be mapped to actual base/split ids for * BTF now that it has been relocated. */ static __u32 btf_relocate_id(const struct btf *btf, __u32 id) { if (!btf->base_btf || !btf->base_id_map) return id; return btf->base_id_map[id]; } #ifdef CONFIG_DEBUG_INFO_BTF_MODULES static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size, void *base_data, unsigned int base_data_size) { struct btf *btf = NULL, *vmlinux_btf, *base_btf = NULL; struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; int err = 0; vmlinux_btf = bpf_get_btf_vmlinux(); if (IS_ERR(vmlinux_btf)) return vmlinux_btf; if (!vmlinux_btf) return ERR_PTR(-EINVAL); env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); log = &env->log; log->level = BPF_LOG_KERNEL; if (base_data) { base_btf = btf_parse_base(env, ".BTF.base", base_data, base_data_size); if (IS_ERR(base_btf)) { err = PTR_ERR(base_btf); goto errout; } } else { base_btf = vmlinux_btf; } btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); if (!btf) { err = -ENOMEM; goto errout; } env->btf = btf; btf->base_btf = base_btf; btf->start_id = base_btf->nr_types; btf->start_str_off = base_btf->hdr.str_len; btf->kernel_btf = true; snprintf(btf->name, sizeof(btf->name), "%s", module_name); btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN); if (!btf->data) { err = -ENOMEM; goto errout; } btf->data_size = data_size; err = btf_parse_hdr(env); if (err) goto errout; btf->nohdr_data = btf->data + btf->hdr.hdr_len; err = btf_parse_str_sec(env); if (err) goto errout; err = btf_check_all_metas(env); if (err) goto errout; err = btf_check_type_tags(env, btf, btf_nr_types(base_btf)); if (err) goto errout; if (base_btf != vmlinux_btf) { err = btf_relocate(btf, vmlinux_btf, &btf->base_id_map); if (err) goto errout; btf_free(base_btf); base_btf = vmlinux_btf; } btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; errout: btf_verifier_env_free(env); if (!IS_ERR(base_btf) && base_btf != vmlinux_btf) btf_free(base_btf); if (btf) { kvfree(btf->data); kvfree(btf->types); kfree(btf); } return ERR_PTR(err); } #endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; if (tgt_prog) return tgt_prog->aux->btf; else return prog->aux->attach_btf; } static bool is_int_ptr(struct btf *btf, const struct btf_type *t) { /* skip modifiers */ t = btf_type_skip_modifiers(btf, t->type, NULL); return btf_type_is_int(t); } static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off) { const struct btf_param *args; const struct btf_type *t; u32 offset = 0, nr_args; int i; if (!func_proto) return off / 8; nr_args = btf_type_vlen(func_proto); args = (const struct btf_param *)(func_proto + 1); for (i = 0; i < nr_args; i++) { t = btf_type_skip_modifiers(btf, args[i].type, NULL); offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8); if (off < offset) return i; } t = btf_type_skip_modifiers(btf, func_proto->type, NULL); offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8); if (off < offset) return nr_args; return nr_args + 1; } static bool prog_args_trusted(const struct bpf_prog *prog) { enum bpf_attach_type atype = prog->expected_attach_type; switch (prog->type) { case BPF_PROG_TYPE_TRACING: return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER; case BPF_PROG_TYPE_LSM: return bpf_lsm_is_trusted(prog); case BPF_PROG_TYPE_STRUCT_OPS: return true; default: return false; } } int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, u32 arg_no) { const struct btf_param *args; const struct btf_type *t; int off = 0, i; u32 sz; args = btf_params(func_proto); for (i = 0; i < arg_no; i++) { t = btf_type_by_id(btf, args[i].type); t = btf_resolve_size(btf, t, &sz); if (IS_ERR(t)) return PTR_ERR(t); off += roundup(sz, 8); } return off; } struct bpf_raw_tp_null_args { const char *func; u64 mask; }; static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { /* sched */ { "sched_pi_setprio", 0x10 }, /* ... from sched_numa_pair_template event class */ { "sched_stick_numa", 0x100 }, { "sched_swap_numa", 0x100 }, /* afs */ { "afs_make_fs_call", 0x10 }, { "afs_make_fs_calli", 0x10 }, { "afs_make_fs_call1", 0x10 }, { "afs_make_fs_call2", 0x10 }, { "afs_protocol_error", 0x1 }, { "afs_flock_ev", 0x10 }, /* cachefiles */ { "cachefiles_lookup", 0x1 | 0x200 }, { "cachefiles_unlink", 0x1 }, { "cachefiles_rename", 0x1 }, { "cachefiles_prep_read", 0x1 }, { "cachefiles_mark_active", 0x1 }, { "cachefiles_mark_failed", 0x1 }, { "cachefiles_mark_inactive", 0x1 }, { "cachefiles_vfs_error", 0x1 }, { "cachefiles_io_error", 0x1 }, { "cachefiles_ondemand_open", 0x1 }, { "cachefiles_ondemand_copen", 0x1 }, { "cachefiles_ondemand_close", 0x1 }, { "cachefiles_ondemand_read", 0x1 }, { "cachefiles_ondemand_cread", 0x1 }, { "cachefiles_ondemand_fd_write", 0x1 }, { "cachefiles_ondemand_fd_release", 0x1 }, /* ext4, from ext4__mballoc event class */ { "ext4_mballoc_discard", 0x10 }, { "ext4_mballoc_free", 0x10 }, /* fib */ { "fib_table_lookup", 0x100 }, /* filelock */ /* ... from filelock_lock event class */ { "posix_lock_inode", 0x10 }, { "fcntl_setlk", 0x10 }, { "locks_remove_posix", 0x10 }, { "flock_lock_inode", 0x10 }, /* ... from filelock_lease event class */ { "break_lease_noblock", 0x10 }, { "break_lease_block", 0x10 }, { "break_lease_unblock", 0x10 }, { "generic_delete_lease", 0x10 }, { "time_out_leases", 0x10 }, /* host1x */ { "host1x_cdma_push_gather", 0x10000 }, /* huge_memory */ { "mm_khugepaged_scan_pmd", 0x10 }, { "mm_collapse_huge_page_isolate", 0x1 }, { "mm_khugepaged_scan_file", 0x10 }, { "mm_khugepaged_collapse_file", 0x10 }, /* kmem */ { "mm_page_alloc", 0x1 }, { "mm_page_pcpu_drain", 0x1 }, /* .. from mm_page event class */ { "mm_page_alloc_zone_locked", 0x1 }, /* netfs */ { "netfs_failure", 0x10 }, /* power */ { "device_pm_callback_start", 0x10 }, /* qdisc */ { "qdisc_dequeue", 0x1000 }, /* rxrpc */ { "rxrpc_recvdata", 0x1 }, { "rxrpc_resend", 0x10 }, /* sunrpc */ { "xs_stream_read_data", 0x1 }, /* ... from xprt_cong_event event class */ { "xprt_reserve_cong", 0x10 }, { "xprt_release_cong", 0x10 }, { "xprt_get_cong", 0x10 }, { "xprt_put_cong", 0x10 }, /* tcp */ { "tcp_send_reset", 0x11 }, /* tegra_apb_dma */ { "tegra_dma_tx_status", 0x100 }, /* timer_migration */ { "tmigr_update_events", 0x1 }, /* writeback, from writeback_folio_template event class */ { "writeback_dirty_folio", 0x10 }, { "folio_wait_writeback", 0x10 }, /* rdma */ { "mr_integ_alloc", 0x2000 }, /* bpf_testmod */ { "bpf_testmod_test_read", 0x0 }, }; bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const struct btf_type *t = prog->aux->attach_func_proto; struct bpf_prog *tgt_prog = prog->aux->dst_prog; struct btf *btf = bpf_prog_get_target_btf(prog); const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; bool ptr_err_raw_tp = false; const char *tag_value; u32 nr_args, arg; int i, ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", tname, off); return false; } arg = get_ctx_arg_idx(btf, t, off); args = (const struct btf_param *)(t + 1); /* if (t == NULL) Fall back to default BPF prog with * MAX_BPF_FUNC_REG_ARGS u64 arguments. */ nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS; if (prog->aux->attach_btf_trace) { /* skip first 'void *__data' argument in btf_trace_##name typedef */ args++; nr_args--; } if (arg > nr_args) { bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg + 1); return false; } if (arg == nr_args) { switch (prog->expected_attach_type) { case BPF_LSM_MAC: /* mark we are accessing the return value */ info->is_retval = true; fallthrough; case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to * int LSM hooks, they use MODIFY_RETURN trampolines. * * While the LSM programs are BPF_MODIFY_RETURN-like * the check: * * if (ret_type != 'int') * return -EINVAL; * * is _not_ done here. This is still safe as LSM hooks * have only void and int return types. */ if (!t) return true; t = btf_type_by_id(btf, t->type); break; case BPF_MODIFY_RETURN: /* For now the BPF_MODIFY_RETURN can only be attached to * functions that return an int. */ if (!t) return false; t = btf_type_skip_modifiers(btf, t->type, NULL); if (!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", btf_type_str(t)); return false; } break; default: bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg + 1); return false; } } else { if (!t) /* Default prog with MAX_BPF_FUNC_REG_ARGS args */ return true; t = btf_type_by_id(btf, args[arg].type); } /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { bpf_log(log, "func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", tname, arg, __btf_name_by_offset(btf, t->name_off), btf_type_str(t)); return false; } if (size != sizeof(u64)) { bpf_log(log, "func '%s' size %d must be 8\n", tname, size); return false; } /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; u32 type, flag; type = base_type(ctx_arg_info->reg_type); flag = type_flag(ctx_arg_info->reg_type); if (ctx_arg_info->offset == off && type == PTR_TO_BUF && (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; } } if (t->type == 0) /* This is a pointer to void. * It is the same as scalar from the verifier safety pov. * No further pointer walking is allowed. */ return true; if (is_int_ptr(btf, t)) return true; /* this is a pointer to another type */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; if (ctx_arg_info->offset == off) { if (!ctx_arg_info->btf_id) { bpf_log(log,"invalid btf_id for context argument offset %u\n", off); return false; } info->reg_type = ctx_arg_info->reg_type; info->btf = ctx_arg_info->btf ? : btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; return true; } } info->reg_type = PTR_TO_BTF_ID; if (prog_args_trusted(prog)) info->reg_type |= PTR_TRUSTED; if (btf_param_match_suffix(btf, &args[arg], "__nullable")) info->reg_type |= PTR_MAYBE_NULL; if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { struct btf *btf = prog->aux->attach_btf; const struct btf_type *t; const char *tname; /* BTF lookups cannot fail, return false on error */ t = btf_type_by_id(btf, prog->aux->attach_btf_id); if (!t) return false; tname = btf_name_by_offset(btf, t->name_off); if (!tname) return false; /* Checked by bpf_check_attach_target */ tname += sizeof("btf_trace_") - 1; for (i = 0; i < ARRAY_SIZE(raw_tp_null_args); i++) { /* Is this a func with potential NULL args? */ if (strcmp(tname, raw_tp_null_args[i].func)) continue; if (raw_tp_null_args[i].mask & (0x1 << (arg * 4))) info->reg_type |= PTR_MAYBE_NULL; /* Is the current arg IS_ERR? */ if (raw_tp_null_args[i].mask & (0x2 << (arg * 4))) ptr_err_raw_tp = true; break; } /* If we don't know NULL-ness specification and the tracepoint * is coming from a loadable module, be conservative and mark * argument as PTR_MAYBE_NULL. */ if (i == ARRAY_SIZE(raw_tp_null_args) && btf_is_module(btf)) info->reg_type |= PTR_MAYBE_NULL; } if (tgt_prog) { enum bpf_prog_type tgt_type; if (tgt_prog->type == BPF_PROG_TYPE_EXT) tgt_type = tgt_prog->aux->saved_dst_prog_type; else tgt_type = tgt_prog->type; ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg); if (ret > 0) { info->btf = btf_vmlinux; info->btf_id = ret; return true; } else { return false; } } info->btf = btf; info->btf_id = t->type; t = btf_type_by_id(btf, t->type); if (btf_type_is_type_tag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); if (strcmp(tag_value, "user") == 0) info->reg_type |= MEM_USER; if (strcmp(tag_value, "percpu") == 0) info->reg_type |= MEM_PERCPU; } /* skip modifiers */ while (btf_type_is_modifier(t)) { info->btf_id = t->type; t = btf_type_by_id(btf, t->type); } if (!btf_type_is_struct(t)) { bpf_log(log, "func '%s' arg%d type %s is not a struct\n", tname, arg, btf_type_str(t)); return false; } bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", tname, arg, info->btf_id, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); /* Perform all checks on the validity of type for this argument, but if * we know it can be IS_ERR at runtime, scrub pointer type and mark as * scalar. */ if (ptr_err_raw_tp) { bpf_log(log, "marking pointer arg%d as scalar as it may encode error", arg); info->reg_type = SCALAR_VALUE; } return true; } EXPORT_SYMBOL_GPL(btf_ctx_access); enum bpf_struct_walk_result { /* < 0 error */ WALK_SCALAR = 0, WALK_PTR, WALK_STRUCT, }; static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name) { u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; const char *tname, *mname, *tag_value; u32 vlen, elem_id, mid; again: if (btf_type_is_modifier(t)) t = btf_type_skip_modifiers(btf, t->type, NULL); tname = __btf_name_by_offset(btf, t->name_off); if (!btf_type_is_struct(t)) { bpf_log(log, "Type '%s' is not a struct\n", tname); return -EINVAL; } vlen = btf_type_vlen(t); if (BTF_INFO_KIND(t->info) == BTF_KIND_UNION && vlen != 1 && !(*flag & PTR_UNTRUSTED)) /* * walking unions yields untrusted pointers * with exception of __bpf_md_ptr and other * unions with a single member */ *flag |= PTR_UNTRUSTED; if (off + size > t->size) { /* If the last element is a variable size array, we may * need to relax the rule. */ struct btf_array *array_elem; if (vlen == 0) goto error; member = btf_type_member(t) + vlen - 1; mtype = btf_type_skip_modifiers(btf, member->type, NULL); if (!btf_type_is_array(mtype)) goto error; array_elem = (struct btf_array *)(mtype + 1); if (array_elem->nelems != 0) goto error; moff = __btf_member_bit_offset(t, member) / 8; if (off < moff) goto error; /* allow structure and integer */ t = btf_type_skip_modifiers(btf, array_elem->type, NULL); if (btf_type_is_int(t)) return WALK_SCALAR; if (!btf_type_is_struct(t)) goto error; off = (off - moff) % t->size; goto again; error: bpf_log(log, "access beyond struct %s at off %u size %u\n", tname, off, size); return -EACCES; } for_each_member(i, t, member) { /* offset of the field in bytes */ moff = __btf_member_bit_offset(t, member) / 8; if (off + size <= moff) /* won't find anything, field is already too far */ break; if (__btf_member_bitfield_size(t, member)) { u32 end_bit = __btf_member_bit_offset(t, member) + __btf_member_bitfield_size(t, member); /* off <= moff instead of off == moff because clang * does not generate a BTF member for anonymous * bitfield like the ":16" here: * struct { * int :16; * int x:8; * }; */ if (off <= moff && BITS_ROUNDUP_BYTES(end_bit) <= off + size) return WALK_SCALAR; /* off may be accessing a following member * * or * * Doing partial access at either end of this * bitfield. Continue on this case also to * treat it as not accessing this bitfield * and eventually error out as field not * found to keep it simple. * It could be relaxed if there was a legit * partial access case later. */ continue; } /* In case of "off" is pointing to holes of a struct */ if (off < moff) break; /* type of the field */ mid = member->type; mtype = btf_type_by_id(btf, member->type); mname = __btf_name_by_offset(btf, member->name_off); mtype = __btf_resolve_size(btf, mtype, &msize, &elem_type, &elem_id, &total_nelems, &mid); if (IS_ERR(mtype)) { bpf_log(log, "field %s doesn't have size\n", mname); return -EFAULT; } mtrue_end = moff + msize; if (off >= mtrue_end) /* no overlap with member, keep iterating */ continue; if (btf_type_is_array(mtype)) { u32 elem_idx; /* __btf_resolve_size() above helps to * linearize a multi-dimensional array. * * The logic here is treating an array * in a struct as the following way: * * struct outer { * struct inner array[2][2]; * }; * * looks like: * * struct outer { * struct inner array_elem0; * struct inner array_elem1; * struct inner array_elem2; * struct inner array_elem3; * }; * * When accessing outer->array[1][0], it moves * moff to "array_elem2", set mtype to * "struct inner", and msize also becomes * sizeof(struct inner). Then most of the * remaining logic will fall through without * caring the current member is an array or * not. * * Unlike mtype/msize/moff, mtrue_end does not * change. The naming difference ("_true") tells * that it is not always corresponding to * the current mtype/msize/moff. * It is the true end of the current * member (i.e. array in this case). That * will allow an int array to be accessed like * a scratch space, * i.e. allow access beyond the size of * the array's element as long as it is * within the mtrue_end boundary. */ /* skip empty array */ if (moff == mtrue_end) continue; msize /= total_nelems; elem_idx = (off - moff) / msize; moff += elem_idx * msize; mtype = elem_type; mid = elem_id; } /* the 'off' we're looking for is either equal to start * of this field or inside of this struct */ if (btf_type_is_struct(mtype)) { /* our field must be inside that union or struct */ t = mtype; /* return if the offset matches the member offset */ if (off == moff) { *next_btf_id = mid; return WALK_STRUCT; } /* adjust offset we're looking for */ off -= moff; goto again; } if (btf_type_is_ptr(mtype)) { const struct btf_type *stype, *t; enum bpf_type_flag tmp_flag = 0; u32 id; if (msize != size || off != moff) { bpf_log(log, "cannot access ptr member %s with moff %u in struct %s with off %u size %u\n", mname, moff, tname, off, size); return -EACCES; } /* check type tag */ t = btf_type_by_id(btf, mtype->type); if (btf_type_is_type_tag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); /* check __user tag */ if (strcmp(tag_value, "user") == 0) tmp_flag = MEM_USER; /* check __percpu tag */ if (strcmp(tag_value, "percpu") == 0) tmp_flag = MEM_PERCPU; /* check __rcu tag */ if (strcmp(tag_value, "rcu") == 0) tmp_flag = MEM_RCU; } stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; *flag |= tmp_flag; if (field_name) *field_name = mname; return WALK_PTR; } } /* Allow more flexible access within an int as long as * it is within mtrue_end. * Since mtrue_end could be the end of an array, * that also allows using an array of int as a scratch * space. e.g. skb->cb[]. */ if (off + size > mtrue_end && !(*flag & PTR_UNTRUSTED)) { bpf_log(log, "access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n", mname, mtrue_end, tname, off, size); return -EACCES; } return WALK_SCALAR; } bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off); return -EINVAL; } int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype __maybe_unused, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name) { const struct btf *btf = reg->btf; enum bpf_type_flag tmp_flag = 0; const struct btf_type *t; u32 id = reg->btf_id; int err; while (type_is_alloc(reg->type)) { struct btf_struct_meta *meta; struct btf_record *rec; int i; meta = btf_find_struct_meta(btf, id); if (!meta) break; rec = meta->record; for (i = 0; i < rec->cnt; i++) { struct btf_field *field = &rec->fields[i]; u32 offset = field->offset; if (off < offset + field->size && offset < off + size) { bpf_log(log, "direct access to %s is disallowed\n", btf_field_type_name(field->type)); return -EACCES; } } break; } t = btf_type_by_id(btf, id); do { err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name); switch (err) { case WALK_PTR: /* For local types, the destination register cannot * become a pointer again. */ if (type_is_alloc(reg->type)) return SCALAR_VALUE; /* If we found the pointer or scalar on t+off, * we're done. */ *next_btf_id = id; *flag = tmp_flag; return PTR_TO_BTF_ID; case WALK_SCALAR: return SCALAR_VALUE; case WALK_STRUCT: /* We found nested struct, so continue the search * by diving in it. At this point the offset is * aligned with the new type, so set it to 0. */ t = btf_type_by_id(btf, id); off = 0; break; default: /* It's either error or unknown return value.. * scream and leave. */ if (WARN_ONCE(err > 0, "unknown btf_struct_walk return value")) return -EINVAL; return err; } } while (t); return -EINVAL; } /* Check that two BTF types, each specified as an BTF object + id, are exactly * the same. Trivial ID check is not enough due to module BTFs, because we can * end up with two different module BTFs, but IDs point to the common type in * vmlinux BTF. */ bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2) { if (id1 != id2) return false; if (btf1 == btf2) return true; return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2); } bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id, bool strict) { const struct btf_type *type; enum bpf_type_flag flag = 0; int err; /* Are we already done? */ if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id)) return true; /* In case of strict type match, we do not walk struct, the top level * type match must succeed. When strict is true, off should have already * been 0. */ if (strict) return false; again: type = btf_type_by_id(btf, id); if (!type) return false; err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL); if (err != WALK_STRUCT) return false; /* We found nested struct object. If it matches * the requested ID, we're done. Otherwise let's * continue the search with offset 0 in the new * type. */ if (!btf_types_are_same(btf, id, need_btf, need_type_id)) { off = 0; goto again; } return true; } static int __get_type_size(struct btf *btf, u32 btf_id, const struct btf_type **ret_type) { const struct btf_type *t; *ret_type = btf_type_by_id(btf, 0); if (!btf_id) /* void */ return 0; t = btf_type_by_id(btf, btf_id); while (t && btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!t) return -EINVAL; *ret_type = t; if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) return t->size; return -EINVAL; } static u8 __get_type_fmodel_flags(const struct btf_type *t) { u8 flags = 0; if (__btf_type_is_struct(t)) flags |= BTF_FMODEL_STRUCT_ARG; if (btf_type_is_signed_int(t)) flags |= BTF_FMODEL_SIGNED_ARG; return flags; } int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *func, const char *tname, struct btf_func_model *m) { const struct btf_param *args; const struct btf_type *t; u32 i, nargs; int ret; if (!func) { /* BTF function prototype doesn't match the verifier types. * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args. */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { m->arg_size[i] = 8; m->arg_flags[i] = 0; } m->ret_size = 8; m->ret_flags = 0; m->nr_args = MAX_BPF_FUNC_REG_ARGS; return 0; } args = (const struct btf_param *)(func + 1); nargs = btf_type_vlen(func); if (nargs > MAX_BPF_FUNC_ARGS) { bpf_log(log, "The function %s has %d arguments. Too many.\n", tname, nargs); return -EINVAL; } ret = __get_type_size(btf, func->type, &t); if (ret < 0 || __btf_type_is_struct(t)) { bpf_log(log, "The function %s return type %s is unsupported.\n", tname, btf_type_str(t)); return -EINVAL; } m->ret_size = ret; m->ret_flags = __get_type_fmodel_flags(t); for (i = 0; i < nargs; i++) { if (i == nargs - 1 && args[i].type == 0) { bpf_log(log, "The function %s with variable args is unsupported.\n", tname); return -EINVAL; } ret = __get_type_size(btf, args[i].type, &t); /* No support of struct argument size greater than 16 bytes */ if (ret < 0 || ret > 16) { bpf_log(log, "The function %s arg%d type %s is unsupported.\n", tname, i, btf_type_str(t)); return -EINVAL; } if (ret == 0) { bpf_log(log, "The function %s has malformed void argument.\n", tname); return -EINVAL; } m->arg_size[i] = ret; m->arg_flags[i] = __get_type_fmodel_flags(t); } m->nr_args = nargs; return 0; } /* Compare BTFs of two functions assuming only scalars and pointers to context. * t1 points to BTF_KIND_FUNC in btf1 * t2 points to BTF_KIND_FUNC in btf2 * Returns: * EINVAL - function prototype mismatch * EFAULT - verifier bug * 0 - 99% match. The last 1% is validated by the verifier. */ static int btf_check_func_type_match(struct bpf_verifier_log *log, struct btf *btf1, const struct btf_type *t1, struct btf *btf2, const struct btf_type *t2) { const struct btf_param *args1, *args2; const char *fn1, *fn2, *s1, *s2; u32 nargs1, nargs2, i; fn1 = btf_name_by_offset(btf1, t1->name_off); fn2 = btf_name_by_offset(btf2, t2->name_off); if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) { bpf_log(log, "%s() is not a global function\n", fn1); return -EINVAL; } if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) { bpf_log(log, "%s() is not a global function\n", fn2); return -EINVAL; } t1 = btf_type_by_id(btf1, t1->type); if (!t1 || !btf_type_is_func_proto(t1)) return -EFAULT; t2 = btf_type_by_id(btf2, t2->type); if (!t2 || !btf_type_is_func_proto(t2)) return -EFAULT; args1 = (const struct btf_param *)(t1 + 1); nargs1 = btf_type_vlen(t1); args2 = (const struct btf_param *)(t2 + 1); nargs2 = btf_type_vlen(t2); if (nargs1 != nargs2) { bpf_log(log, "%s() has %d args while %s() has %d args\n", fn1, nargs1, fn2, nargs2); return -EINVAL; } t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); if (t1->info != t2->info) { bpf_log(log, "Return type %s of %s() doesn't match type %s of %s()\n", btf_type_str(t1), fn1, btf_type_str(t2), fn2); return -EINVAL; } for (i = 0; i < nargs1; i++) { t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL); t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL); if (t1->info != t2->info) { bpf_log(log, "arg%d in %s() is %s while %s() has %s\n", i, fn1, btf_type_str(t1), fn2, btf_type_str(t2)); return -EINVAL; } if (btf_type_has_size(t1) && t1->size != t2->size) { bpf_log(log, "arg%d in %s() has size %d while %s() has %d\n", i, fn1, t1->size, fn2, t2->size); return -EINVAL; } /* global functions are validated with scalars and pointers * to context only. And only global functions can be replaced. * Hence type check only those types. */ if (btf_type_is_int(t1) || btf_is_any_enum(t1)) continue; if (!btf_type_is_ptr(t1)) { bpf_log(log, "arg%d in %s() has unrecognized type\n", i, fn1); return -EINVAL; } t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); if (!btf_type_is_struct(t1)) { bpf_log(log, "arg%d in %s() is not a pointer to context\n", i, fn1); return -EINVAL; } if (!btf_type_is_struct(t2)) { bpf_log(log, "arg%d in %s() is not a pointer to context\n", i, fn2); return -EINVAL; } /* This is an optional check to make program writing easier. * Compare names of structs and report an error to the user. * btf_prepare_func_args() already checked that t2 struct * is a context type. btf_prepare_func_args() will check * later that t1 struct is a context type as well. */ s1 = btf_name_by_offset(btf1, t1->name_off); s2 = btf_name_by_offset(btf2, t2->name_off); if (strcmp(s1, s2)) { bpf_log(log, "arg%d %s(struct %s *) doesn't match %s(struct %s *)\n", i, fn1, s1, fn2, s2); return -EINVAL; } } return 0; } /* Compare BTFs of given program with BTF of target program */ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf2, const struct btf_type *t2) { struct btf *btf1 = prog->aux->btf; const struct btf_type *t1; u32 btf_id = 0; if (!prog->aux->func_info) { bpf_log(log, "Program extension requires BTF\n"); return -EINVAL; } btf_id = prog->aux->func_info[0].type_id; if (!btf_id) return -EFAULT; t1 = btf_type_by_id(btf1, btf_id); if (!t1 || !btf_type_is_func(t1)) return -EFAULT; return btf_check_func_type_match(log, btf1, t1, btf2, t2); } static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t) { const char *name; t = btf_type_by_id(btf, t->type); /* skip PTR */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); /* allow either struct or struct forward declaration */ if (btf_type_is_struct(t) || (btf_type_is_fwd(t) && btf_type_kflag(t) == 0)) { name = btf_str_by_offset(btf, t->name_off); return name && strcmp(name, "bpf_dynptr") == 0; } return false; } struct bpf_cand_cache { const char *name; u32 name_len; u16 kind; u16 cnt; struct { const struct btf *btf; u32 id; } cands[]; }; static DEFINE_MUTEX(cand_cache_mutex); static struct bpf_cand_cache * bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id); static int btf_get_ptr_to_btf_id(struct bpf_verifier_log *log, int arg_idx, const struct btf *btf, const struct btf_type *t) { struct bpf_cand_cache *cc; struct bpf_core_ctx ctx = { .btf = btf, .log = log, }; u32 kern_type_id, type_id; int err = 0; /* skip PTR and modifiers */ type_id = t->type; t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) { type_id = t->type; t = btf_type_by_id(btf, t->type); } mutex_lock(&cand_cache_mutex); cc = bpf_core_find_cands(&ctx, type_id); if (IS_ERR(cc)) { err = PTR_ERR(cc); bpf_log(log, "arg#%d reference type('%s %s') candidate matching error: %d\n", arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off), err); goto cand_cache_unlock; } if (cc->cnt != 1) { bpf_log(log, "arg#%d reference type('%s %s') %s\n", arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off), cc->cnt == 0 ? "has no matches" : "is ambiguous"); err = cc->cnt == 0 ? -ENOENT : -ESRCH; goto cand_cache_unlock; } if (btf_is_module(cc->cands[0].btf)) { bpf_log(log, "arg#%d reference type('%s %s') points to kernel module type (unsupported)\n", arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); err = -EOPNOTSUPP; goto cand_cache_unlock; } kern_type_id = cc->cands[0].id; cand_cache_unlock: mutex_unlock(&cand_cache_mutex); if (err) return err; return kern_type_id; } enum btf_arg_tag { ARG_TAG_CTX = BIT_ULL(0), ARG_TAG_NONNULL = BIT_ULL(1), ARG_TAG_TRUSTED = BIT_ULL(2), ARG_TAG_NULLABLE = BIT_ULL(3), ARG_TAG_ARENA = BIT_ULL(4), }; /* Process BTF of a function to produce high-level expectation of function * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information * is cached in subprog info for reuse. * Returns: * EFAULT - there is a verifier bug. Abort verification. * EINVAL - cannot convert BTF. * 0 - Successfully processed BTF and constructed argument expectations. */ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) { bool is_global = subprog_aux(env, subprog)->linkage == BTF_FUNC_GLOBAL; struct bpf_subprog_info *sub = subprog_info(env, subprog); struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; const struct btf_type *t, *ref_t, *fn_t; u32 i, nargs, btf_id; const char *tname; if (sub->args_cached) return 0; if (!prog->aux->func_info) { bpf_log(log, "Verifier bug\n"); return -EFAULT; } btf_id = prog->aux->func_info[subprog].type_id; if (!btf_id) { if (!is_global) /* not fatal for static funcs */ return -EINVAL; bpf_log(log, "Global functions need valid BTF\n"); return -EFAULT; } fn_t = btf_type_by_id(btf, btf_id); if (!fn_t || !btf_type_is_func(fn_t)) { /* These checks were already done by the verifier while loading * struct bpf_func_info */ bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", subprog); return -EFAULT; } tname = btf_name_by_offset(btf, fn_t->name_off); if (prog->aux->func_info_aux[subprog].unreliable) { bpf_log(log, "Verifier bug in function %s()\n", tname); return -EFAULT; } if (prog_type == BPF_PROG_TYPE_EXT) prog_type = prog->aux->dst_prog->type; t = btf_type_by_id(btf, fn_t->type); if (!t || !btf_type_is_func_proto(t)) { bpf_log(log, "Invalid type of function %s()\n", tname); return -EFAULT; } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); if (nargs > MAX_BPF_FUNC_REG_ARGS) { if (!is_global) return -EINVAL; bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n", tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } /* check that function returns int, exception cb also requires this */ t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!btf_type_is_int(t) && !btf_is_any_enum(t)) { if (!is_global) return -EINVAL; bpf_log(log, "Global function %s() doesn't return scalar. Only those are supported.\n", tname); return -EINVAL; } /* Convert BTF function arguments into verifier types. * Only PTR_TO_CTX and SCALAR are supported atm. */ for (i = 0; i < nargs; i++) { u32 tags = 0; int id = 0; /* 'arg:<tag>' decl_tag takes precedence over derivation of * register type from BTF type itself */ while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) { const struct btf_type *tag_t = btf_type_by_id(btf, id); const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4; /* disallow arg tags in static subprogs */ if (!is_global) { bpf_log(log, "arg#%d type tag is not supported in static functions\n", i); return -EOPNOTSUPP; } if (strcmp(tag, "ctx") == 0) { tags |= ARG_TAG_CTX; } else if (strcmp(tag, "trusted") == 0) { tags |= ARG_TAG_TRUSTED; } else if (strcmp(tag, "nonnull") == 0) { tags |= ARG_TAG_NONNULL; } else if (strcmp(tag, "nullable") == 0) { tags |= ARG_TAG_NULLABLE; } else if (strcmp(tag, "arena") == 0) { tags |= ARG_TAG_ARENA; } else { bpf_log(log, "arg#%d has unsupported set of tags\n", i); return -EOPNOTSUPP; } } if (id != -ENOENT) { bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id); return id; } t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!btf_type_is_ptr(t)) goto skip_pointer; if ((tags & ARG_TAG_CTX) || btf_is_prog_ctx_type(log, btf, t, prog_type, i)) { if (tags & ~ARG_TAG_CTX) { bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } if ((tags & ARG_TAG_CTX) && btf_validate_prog_ctx_type(log, btf, t, i, prog_type, prog->expected_attach_type)) return -EINVAL; sub->args[i].arg_type = ARG_PTR_TO_CTX; continue; } if (btf_is_dynptr_ptr(btf, t)) { if (tags) { bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY; continue; } if (tags & ARG_TAG_TRUSTED) { int kern_type_id; if (tags & ARG_TAG_NONNULL) { bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t); if (kern_type_id < 0) return kern_type_id; sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_TRUSTED; if (tags & ARG_TAG_NULLABLE) sub->args[i].arg_type |= PTR_MAYBE_NULL; sub->args[i].btf_id = kern_type_id; continue; } if (tags & ARG_TAG_ARENA) { if (tags & ~ARG_TAG_ARENA) { bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i); return -EINVAL; } sub->args[i].arg_type = ARG_PTR_TO_ARENA; continue; } if (is_global) { /* generic user data pointer */ u32 mem_size; if (tags & ARG_TAG_NULLABLE) { bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } t = btf_type_skip_modifiers(btf, t->type, NULL); ref_t = btf_resolve_size(btf, t, &mem_size); if (IS_ERR(ref_t)) { bpf_log(log, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), PTR_ERR(ref_t)); return -EINVAL; } sub->args[i].arg_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL; if (tags & ARG_TAG_NONNULL) sub->args[i].arg_type &= ~PTR_MAYBE_NULL; sub->args[i].mem_size = mem_size; continue; } skip_pointer: if (tags) { bpf_log(log, "arg#%d has pointer tag, but is not a pointer type\n", i); return -EINVAL; } if (btf_type_is_int(t) || btf_is_any_enum(t)) { sub->args[i].arg_type = ARG_ANYTHING; continue; } if (!is_global) return -EINVAL; bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", i, btf_type_str(t), tname); return -EINVAL; } sub->arg_cnt = nargs; sub->args_cached = true; return 0; } static void btf_type_show(const struct btf *btf, u32 type_id, void *obj, struct btf_show *show) { const struct btf_type *t = btf_type_by_id(btf, type_id); show->btf = btf; memset(&show->state, 0, sizeof(show->state)); memset(&show->obj, 0, sizeof(show->obj)); btf_type_ops(t)->show(btf, t, type_id, obj, 0, show); } __printf(2, 0) static void btf_seq_show(struct btf_show *show, const char *fmt, va_list args) { seq_vprintf((struct seq_file *)show->target, fmt, args); } int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m, u64 flags) { struct btf_show sseq; sseq.target = m; sseq.showfn = btf_seq_show; sseq.flags = flags; btf_type_show(btf, type_id, obj, &sseq); return sseq.state.status; } void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { (void) btf_type_seq_show_flags(btf, type_id, obj, m, BTF_SHOW_NONAME | BTF_SHOW_COMPACT | BTF_SHOW_ZERO | BTF_SHOW_UNSAFE); } struct btf_show_snprintf { struct btf_show show; int len_left; /* space left in string */ int len; /* length we would have written */ }; __printf(2, 0) static void btf_snprintf_show(struct btf_show *show, const char *fmt, va_list args) { struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show; int len; len = vsnprintf(show->target, ssnprintf->len_left, fmt, args); if (len < 0) { ssnprintf->len_left = 0; ssnprintf->len = len; } else if (len >= ssnprintf->len_left) { /* no space, drive on to get length we would have written */ ssnprintf->len_left = 0; ssnprintf->len += len; } else { ssnprintf->len_left -= len; ssnprintf->len += len; show->target += len; } } int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, char *buf, int len, u64 flags) { struct btf_show_snprintf ssnprintf; ssnprintf.show.target = buf; ssnprintf.show.flags = flags; ssnprintf.show.showfn = btf_snprintf_show; ssnprintf.len_left = len; ssnprintf.len = 0; btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf); /* If we encountered an error, return it. */ if (ssnprintf.show.state.status) return ssnprintf.show.state.status; /* Otherwise return length we would have written */ return ssnprintf.len; } #ifdef CONFIG_PROC_FS static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp) { const struct btf *btf = filp->private_data; seq_printf(m, "btf_id:\t%u\n", btf->id); } #endif static int btf_release(struct inode *inode, struct file *filp) { btf_put(filp->private_data); return 0; } const struct file_operations btf_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_btf_show_fdinfo, #endif .release = btf_release, }; static int __btf_new_fd(struct btf *btf) { return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); } int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { struct btf *btf; int ret; btf = btf_parse(attr, uattr, uattr_size); if (IS_ERR(btf)) return PTR_ERR(btf); ret = btf_alloc_id(btf); if (ret) { btf_free(btf); return ret; } /* * The BTF ID is published to the userspace. * All BTF free must go through call_rcu() from * now on (i.e. free by calling btf_put()). */ ret = __btf_new_fd(btf); if (ret < 0) btf_put(btf); return ret; } struct btf *btf_get_by_fd(int fd) { struct btf *btf; CLASS(fd, f)(fd); btf = __btf_get_by_fd(f); if (!IS_ERR(btf)) refcount_inc(&btf->refcnt); return btf; } int btf_get_info_by_fd(const struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_btf_info __user *uinfo; struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; char __user *uname; u32 uinfo_len, uname_len, name_len; int ret = 0; uinfo = u64_to_user_ptr(attr->info.info); uinfo_len = attr->info.info_len; info_copy = min_t(u32, uinfo_len, sizeof(info)); memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_copy)) return -EFAULT; info.id = btf->id; ubtf = u64_to_user_ptr(info.btf); btf_copy = min_t(u32, btf->data_size, info.btf_size); if (copy_to_user(ubtf, btf->data, btf_copy)) return -EFAULT; info.btf_size = btf->data_size; info.kernel_btf = btf->kernel_btf; uname = u64_to_user_ptr(info.name); uname_len = info.name_len; if (!uname ^ !uname_len) return -EINVAL; name_len = strlen(btf->name); info.name_len = name_len; if (uname) { if (uname_len >= name_len + 1) { if (copy_to_user(uname, btf->name, name_len + 1)) return -EFAULT; } else { char zero = '\0'; if (copy_to_user(uname, btf->name, uname_len - 1)) return -EFAULT; if (put_user(zero, uname + uname_len - 1)) return -EFAULT; /* let user-space know about too short buffer */ ret = -ENOSPC; } } if (copy_to_user(uinfo, &info, info_copy) || put_user(info_copy, &uattr->info.info_len)) return -EFAULT; return ret; } int btf_get_fd_by_id(u32 id) { struct btf *btf; int fd; rcu_read_lock(); btf = idr_find(&btf_idr, id); if (!btf || !refcount_inc_not_zero(&btf->refcnt)) btf = ERR_PTR(-ENOENT); rcu_read_unlock(); if (IS_ERR(btf)) return PTR_ERR(btf); fd = __btf_new_fd(btf); if (fd < 0) btf_put(btf); return fd; } u32 btf_obj_id(const struct btf *btf) { return btf->id; } bool btf_is_kernel(const struct btf *btf) { return btf->kernel_btf; } bool btf_is_module(const struct btf *btf) { return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0; } enum { BTF_MODULE_F_LIVE = (1 << 0), }; #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module { struct list_head list; struct module *module; struct btf *btf; struct bin_attribute *sysfs_attr; int flags; }; static LIST_HEAD(btf_modules); static DEFINE_MUTEX(btf_module_mutex); static ssize_t btf_module_read(struct file *file, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { const struct btf *btf = bin_attr->private; memcpy(buf, btf->data + off, len); return len; } static void purge_cand_cache(struct btf *btf); static int btf_module_notify(struct notifier_block *nb, unsigned long op, void *module) { struct btf_module *btf_mod, *tmp; struct module *mod = module; struct btf *btf; int err = 0; if (mod->btf_data_size == 0 || (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE && op != MODULE_STATE_GOING)) goto out; switch (op) { case MODULE_STATE_COMING: btf_mod = kzalloc(sizeof(*btf_mod), GFP_KERNEL); if (!btf_mod) { err = -ENOMEM; goto out; } btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size, mod->btf_base_data, mod->btf_base_data_size); if (IS_ERR(btf)) { kfree(btf_mod); if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) { pr_warn("failed to validate module [%s] BTF: %ld\n", mod->name, PTR_ERR(btf)); err = PTR_ERR(btf); } else { pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n"); } goto out; } err = btf_alloc_id(btf); if (err) { btf_free(btf); kfree(btf_mod); goto out; } purge_cand_cache(NULL); mutex_lock(&btf_module_mutex); btf_mod->module = module; btf_mod->btf = btf; list_add(&btf_mod->list, &btf_modules); mutex_unlock(&btf_module_mutex); if (IS_ENABLED(CONFIG_SYSFS)) { struct bin_attribute *attr; attr = kzalloc(sizeof(*attr), GFP_KERNEL); if (!attr) goto out; sysfs_bin_attr_init(attr); attr->attr.name = btf->name; attr->attr.mode = 0444; attr->size = btf->data_size; attr->private = btf; attr->read = btf_module_read; err = sysfs_create_bin_file(btf_kobj, attr); if (err) { pr_warn("failed to register module [%s] BTF in sysfs: %d\n", mod->name, err); kfree(attr); err = 0; goto out; } btf_mod->sysfs_attr = attr; } break; case MODULE_STATE_LIVE: mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { if (btf_mod->module != module) continue; btf_mod->flags |= BTF_MODULE_F_LIVE; break; } mutex_unlock(&btf_module_mutex); break; case MODULE_STATE_GOING: mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { if (btf_mod->module != module) continue; list_del(&btf_mod->list); if (btf_mod->sysfs_attr) sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); purge_cand_cache(btf_mod->btf); btf_put(btf_mod->btf); kfree(btf_mod->sysfs_attr); kfree(btf_mod); break; } mutex_unlock(&btf_module_mutex); break; } out: return notifier_from_errno(err); } static struct notifier_block btf_module_nb = { .notifier_call = btf_module_notify, }; static int __init btf_module_init(void) { register_module_notifier(&btf_module_nb); return 0; } fs_initcall(btf_module_init); #endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ struct module *btf_try_get_module(const struct btf *btf) { struct module *res = NULL; #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module *btf_mod, *tmp; mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { if (btf_mod->btf != btf) continue; /* We must only consider module whose __init routine has * finished, hence we must check for BTF_MODULE_F_LIVE flag, * which is set from the notifier callback for * MODULE_STATE_LIVE. */ if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module)) res = btf_mod->module; break; } mutex_unlock(&btf_module_mutex); #endif return res; } /* Returns struct btf corresponding to the struct module. * This function can return NULL or ERR_PTR. */ static struct btf *btf_get_module_btf(const struct module *module) { #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module *btf_mod, *tmp; #endif struct btf *btf = NULL; if (!module) { btf = bpf_get_btf_vmlinux(); if (!IS_ERR_OR_NULL(btf)) btf_get(btf); return btf; } #ifdef CONFIG_DEBUG_INFO_BTF_MODULES mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { if (btf_mod->module != module) continue; btf_get(btf_mod->btf); btf = btf_mod->btf; break; } mutex_unlock(&btf_module_mutex); #endif return btf; } static int check_btf_kconfigs(const struct module *module, const char *feature) { if (!module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { pr_err("missing vmlinux BTF, cannot register %s\n", feature); return -ENOENT; } if (module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) pr_warn("missing module BTF, cannot register %s\n", feature); return 0; } BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags) { struct btf *btf = NULL; int btf_obj_fd = 0; long ret; if (flags) return -EINVAL; if (name_sz <= 1 || name[name_sz - 1]) return -EINVAL; ret = bpf_find_btf_id(name, kind, &btf); if (ret > 0 && btf_is_module(btf)) { btf_obj_fd = __btf_new_fd(btf); if (btf_obj_fd < 0) { btf_put(btf); return btf_obj_fd; } return ret | (((u64)btf_obj_fd) << 32); } if (ret > 0) btf_put(btf); return ret; } const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { .func = bpf_btf_find_by_name_kind, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) #define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type) BTF_TRACING_TYPE_xxx #undef BTF_TRACING_TYPE /* Validate well-formedness of iter argument type. * On success, return positive BTF ID of iter state's STRUCT type. * On error, negative error is returned. */ int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx) { const struct btf_param *arg; const struct btf_type *t; const char *name; int btf_id; if (btf_type_vlen(func) <= arg_idx) return -EINVAL; arg = &btf_params(func)[arg_idx]; t = btf_type_skip_modifiers(btf, arg->type, NULL); if (!t || !btf_type_is_ptr(t)) return -EINVAL; t = btf_type_skip_modifiers(btf, t->type, &btf_id); if (!t || !__btf_type_is_struct(t)) return -EINVAL; name = btf_name_by_offset(btf, t->name_off); if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1)) return -EINVAL; return btf_id; } static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, const struct btf_type *func, u32 func_flags) { u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); const char *sfx, *iter_name; const struct btf_type *t; char exp_name[128]; u32 nr_args; int btf_id; /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */ if (!flags || (flags & (flags - 1))) return -EINVAL; /* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */ nr_args = btf_type_vlen(func); if (nr_args < 1) return -EINVAL; btf_id = btf_check_iter_arg(btf, func, 0); if (btf_id < 0) return btf_id; /* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to * fit nicely in stack slots */ t = btf_type_by_id(btf, btf_id); if (t->size == 0 || (t->size % 8)) return -EINVAL; /* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *) * naming pattern */ iter_name = btf_name_by_offset(btf, t->name_off) + sizeof(ITER_PREFIX) - 1; if (flags & KF_ITER_NEW) sfx = "new"; else if (flags & KF_ITER_NEXT) sfx = "next"; else /* (flags & KF_ITER_DESTROY) */ sfx = "destroy"; snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx); if (strcmp(func_name, exp_name)) return -EINVAL; /* only iter constructor should have extra arguments */ if (!(flags & KF_ITER_NEW) && nr_args != 1) return -EINVAL; if (flags & KF_ITER_NEXT) { /* bpf_iter_<type>_next() should return pointer */ t = btf_type_skip_modifiers(btf, func->type, NULL); if (!t || !btf_type_is_ptr(t)) return -EINVAL; } if (flags & KF_ITER_DESTROY) { /* bpf_iter_<type>_destroy() should return void */ t = btf_type_by_id(btf, func->type); if (!t || !btf_type_is_void(t)) return -EINVAL; } return 0; } static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) { const struct btf_type *func; const char *func_name; int err; /* any kfunc should be FUNC -> FUNC_PROTO */ func = btf_type_by_id(btf, func_id); if (!func || !btf_type_is_func(func)) return -EINVAL; /* sanity check kfunc name */ func_name = btf_name_by_offset(btf, func->name_off); if (!func_name || !func_name[0]) return -EINVAL; func = btf_type_by_id(btf, func->type); if (!func || !btf_type_is_func_proto(func)) return -EINVAL; if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) { err = btf_check_iter_kfuncs(btf, func_name, func, func_flags); if (err) return err; } return 0; } /* Kernel Function (kfunc) BTF ID set registration API */ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, const struct btf_kfunc_id_set *kset) { struct btf_kfunc_hook_filter *hook_filter; struct btf_id_set8 *add_set = kset->set; bool vmlinux_set = !btf_is_module(btf); bool add_filter = !!kset->filter; struct btf_kfunc_set_tab *tab; struct btf_id_set8 *set; u32 set_cnt, i; int ret; if (hook >= BTF_KFUNC_HOOK_MAX) { ret = -EINVAL; goto end; } if (!add_set->cnt) return 0; tab = btf->kfunc_set_tab; if (tab && add_filter) { u32 i; hook_filter = &tab->hook_filters[hook]; for (i = 0; i < hook_filter->nr_filters; i++) { if (hook_filter->filters[i] == kset->filter) { add_filter = false; break; } } if (add_filter && hook_filter->nr_filters == BTF_KFUNC_FILTER_MAX_CNT) { ret = -E2BIG; goto end; } } if (!tab) { tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN); if (!tab) return -ENOMEM; btf->kfunc_set_tab = tab; } set = tab->sets[hook]; /* Warn when register_btf_kfunc_id_set is called twice for the same hook * for module sets. */ if (WARN_ON_ONCE(set && !vmlinux_set)) { ret = -EINVAL; goto end; } /* In case of vmlinux sets, there may be more than one set being * registered per hook. To create a unified set, we allocate a new set * and concatenate all individual sets being registered. While each set * is individually sorted, they may become unsorted when concatenated, * hence re-sorting the final set again is required to make binary * searching the set using btf_id_set8_contains function work. * * For module sets, we need to allocate as we may need to relocate * BTF ids. */ set_cnt = set ? set->cnt : 0; if (set_cnt > U32_MAX - add_set->cnt) { ret = -EOVERFLOW; goto end; } if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) { ret = -E2BIG; goto end; } /* Grow set */ set = krealloc(tab->sets[hook], offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]), GFP_KERNEL | __GFP_NOWARN); if (!set) { ret = -ENOMEM; goto end; } /* For newly allocated set, initialize set->cnt to 0 */ if (!tab->sets[hook]) set->cnt = 0; tab->sets[hook] = set; /* Concatenate the two sets */ memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0])); /* Now that the set is copied, update with relocated BTF ids */ for (i = set->cnt; i < set->cnt + add_set->cnt; i++) set->pairs[i].id = btf_relocate_id(btf, set->pairs[i].id); set->cnt += add_set->cnt; sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL); if (add_filter) { hook_filter = &tab->hook_filters[hook]; hook_filter->filters[hook_filter->nr_filters++] = kset->filter; } return 0; end: btf_free_kfunc_set_tab(btf); return ret; } static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, enum btf_kfunc_hook hook, u32 kfunc_btf_id, const struct bpf_prog *prog) { struct btf_kfunc_hook_filter *hook_filter; struct btf_id_set8 *set; u32 *id, i; if (hook >= BTF_KFUNC_HOOK_MAX) return NULL; if (!btf->kfunc_set_tab) return NULL; hook_filter = &btf->kfunc_set_tab->hook_filters[hook]; for (i = 0; i < hook_filter->nr_filters; i++) { if (hook_filter->filters[i](prog, kfunc_btf_id)) return NULL; } set = btf->kfunc_set_tab->sets[hook]; if (!set) return NULL; id = btf_id_set8_contains(set, kfunc_btf_id); if (!id) return NULL; /* The flags for BTF ID are located next to it */ return id + 1; } static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) { switch (prog_type) { case BPF_PROG_TYPE_UNSPEC: return BTF_KFUNC_HOOK_COMMON; case BPF_PROG_TYPE_XDP: return BTF_KFUNC_HOOK_XDP; case BPF_PROG_TYPE_SCHED_CLS: return BTF_KFUNC_HOOK_TC; case BPF_PROG_TYPE_STRUCT_OPS: return BTF_KFUNC_HOOK_STRUCT_OPS; case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_LSM: return BTF_KFUNC_HOOK_TRACING; case BPF_PROG_TYPE_SYSCALL: return BTF_KFUNC_HOOK_SYSCALL; case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: return BTF_KFUNC_HOOK_CGROUP; case BPF_PROG_TYPE_SCHED_ACT: return BTF_KFUNC_HOOK_SCHED_ACT; case BPF_PROG_TYPE_SK_SKB: return BTF_KFUNC_HOOK_SK_SKB; case BPF_PROG_TYPE_SOCKET_FILTER: return BTF_KFUNC_HOOK_SOCKET_FILTER; case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: return BTF_KFUNC_HOOK_LWT; case BPF_PROG_TYPE_NETFILTER: return BTF_KFUNC_HOOK_NETFILTER; case BPF_PROG_TYPE_KPROBE: return BTF_KFUNC_HOOK_KPROBE; default: return BTF_KFUNC_HOOK_MAX; } } /* Caution: * Reference to the module (obtained using btf_try_get_module) corresponding to * the struct btf *MUST* be held when calling this function from verifier * context. This is usually true as we stash references in prog's kfunc_btf_tab; * keeping the reference for the duration of the call provides the necessary * protection for looking up a well-formed btf->kfunc_set_tab. */ u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { enum bpf_prog_type prog_type = resolve_prog_type(prog); enum btf_kfunc_hook hook; u32 *kfunc_flags; kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog); if (kfunc_flags) return kfunc_flags; hook = bpf_prog_type_to_kfunc_hook(prog_type); return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog); } u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog); } static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, const struct btf_kfunc_id_set *kset) { struct btf *btf; int ret, i; btf = btf_get_module_btf(kset->owner); if (!btf) return check_btf_kconfigs(kset->owner, "kfunc"); if (IS_ERR(btf)) return PTR_ERR(btf); for (i = 0; i < kset->set->cnt; i++) { ret = btf_check_kfunc_protos(btf, btf_relocate_id(btf, kset->set->pairs[i].id), kset->set->pairs[i].flags); if (ret) goto err_out; } ret = btf_populate_kfunc_set(btf, hook, kset); err_out: btf_put(btf); return ret; } /* This function must be invoked only from initcalls/module init functions */ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *kset) { enum btf_kfunc_hook hook; /* All kfuncs need to be tagged as such in BTF. * WARN() for initcall registrations that do not check errors. */ if (!(kset->set->flags & BTF_SET8_KFUNCS)) { WARN_ON(!kset->owner); return -EINVAL; } hook = bpf_prog_type_to_kfunc_hook(prog_type); return __register_btf_kfunc_id_set(hook, kset); } EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); /* This function must be invoked only from initcalls/module init functions */ int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset) { return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset); } EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set); s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) { struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; struct btf_id_dtor_kfunc *dtor; if (!tab) return -ENOENT; /* Even though the size of tab->dtors[0] is > sizeof(u32), we only need * to compare the first u32 with btf_id, so we can reuse btf_id_cmp_func. */ BUILD_BUG_ON(offsetof(struct btf_id_dtor_kfunc, btf_id) != 0); dtor = bsearch(&btf_id, tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func); if (!dtor) return -ENOENT; return dtor->kfunc_btf_id; } static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc *dtors, u32 cnt) { const struct btf_type *dtor_func, *dtor_func_proto, *t; const struct btf_param *args; s32 dtor_btf_id; u32 nr_args, i; for (i = 0; i < cnt; i++) { dtor_btf_id = btf_relocate_id(btf, dtors[i].kfunc_btf_id); dtor_func = btf_type_by_id(btf, dtor_btf_id); if (!dtor_func || !btf_type_is_func(dtor_func)) return -EINVAL; dtor_func_proto = btf_type_by_id(btf, dtor_func->type); if (!dtor_func_proto || !btf_type_is_func_proto(dtor_func_proto)) return -EINVAL; /* Make sure the prototype of the destructor kfunc is 'void func(type *)' */ t = btf_type_by_id(btf, dtor_func_proto->type); if (!t || !btf_type_is_void(t)) return -EINVAL; nr_args = btf_type_vlen(dtor_func_proto); if (nr_args != 1) return -EINVAL; args = btf_params(dtor_func_proto); t = btf_type_by_id(btf, args[0].type); /* Allow any pointer type, as width on targets Linux supports * will be same for all pointer types (i.e. sizeof(void *)) */ if (!t || !btf_type_is_ptr(t)) return -EINVAL; } return 0; } /* This function must be invoked only from initcalls/module init functions */ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner) { struct btf_id_dtor_kfunc_tab *tab; struct btf *btf; u32 tab_cnt, i; int ret; btf = btf_get_module_btf(owner); if (!btf) return check_btf_kconfigs(owner, "dtor kfuncs"); if (IS_ERR(btf)) return PTR_ERR(btf); if (add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) { pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT); ret = -E2BIG; goto end; } /* Ensure that the prototype of dtor kfuncs being registered is sane */ ret = btf_check_dtor_kfuncs(btf, dtors, add_cnt); if (ret < 0) goto end; tab = btf->dtor_kfunc_tab; /* Only one call allowed for modules */ if (WARN_ON_ONCE(tab && btf_is_module(btf))) { ret = -EINVAL; goto end; } tab_cnt = tab ? tab->cnt : 0; if (tab_cnt > U32_MAX - add_cnt) { ret = -EOVERFLOW; goto end; } if (tab_cnt + add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) { pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT); ret = -E2BIG; goto end; } tab = krealloc(btf->dtor_kfunc_tab, offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]), GFP_KERNEL | __GFP_NOWARN); if (!tab) { ret = -ENOMEM; goto end; } if (!btf->dtor_kfunc_tab) tab->cnt = 0; btf->dtor_kfunc_tab = tab; memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0])); /* remap BTF ids based on BTF relocation (if any) */ for (i = tab_cnt; i < tab_cnt + add_cnt; i++) { tab->dtors[i].btf_id = btf_relocate_id(btf, tab->dtors[i].btf_id); tab->dtors[i].kfunc_btf_id = btf_relocate_id(btf, tab->dtors[i].kfunc_btf_id); } tab->cnt += add_cnt; sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL); end: if (ret) btf_free_dtor_kfunc_tab(btf); btf_put(btf); return ret; } EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs); #define MAX_TYPES_ARE_COMPAT_DEPTH 2 /* Check local and target types for compatibility. This check is used for * type-based CO-RE relocations and follow slightly different rules than * field-based relocations. This function assumes that root types were already * checked for name match. Beyond that initial root-level name check, names * are completely ignored. Compatibility rules are as follows: * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but * kind should match for local and target types (i.e., STRUCT is not * compatible with UNION); * - for ENUMs/ENUM64s, the size is ignored; * - for INT, size and signedness are ignored; * - for ARRAY, dimensionality is ignored, element types are checked for * compatibility recursively; * - CONST/VOLATILE/RESTRICT modifiers are ignored; * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible; * - FUNC_PROTOs are compatible if they have compatible signature: same * number of input args and compatible return and argument types. * These rules are not set in stone and probably will be adjusted as we get * more experience with using BPF CO-RE relocations. */ int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, MAX_TYPES_ARE_COMPAT_DEPTH); } #define MAX_TYPES_MATCH_DEPTH 2 int bpf_core_types_match(const struct btf *local_btf, u32 local_id, const struct btf *targ_btf, u32 targ_id) { return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, MAX_TYPES_MATCH_DEPTH); } static bool bpf_core_is_flavor_sep(const char *s) { /* check X___Y name pattern, where X and Y are not underscores */ return s[0] != '_' && /* X */ s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */ s[4] != '_'; /* Y */ } size_t bpf_core_essential_name_len(const char *name) { size_t n = strlen(name); int i; for (i = n - 5; i >= 0; i--) { if (bpf_core_is_flavor_sep(name + i)) return i + 1; } return n; } static void bpf_free_cands(struct bpf_cand_cache *cands) { if (!cands->cnt) /* empty candidate array was allocated on stack */ return; kfree(cands); } static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands) { kfree(cands->name); kfree(cands); } #define VMLINUX_CAND_CACHE_SIZE 31 static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE]; #define MODULE_CAND_CACHE_SIZE 31 static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE]; static void __print_cand_cache(struct bpf_verifier_log *log, struct bpf_cand_cache **cache, int cache_size) { struct bpf_cand_cache *cc; int i, j; for (i = 0; i < cache_size; i++) { cc = cache[i]; if (!cc) continue; bpf_log(log, "[%d]%s(", i, cc->name); for (j = 0; j < cc->cnt; j++) { bpf_log(log, "%d", cc->cands[j].id); if (j < cc->cnt - 1) bpf_log(log, " "); } bpf_log(log, "), "); } } static void print_cand_cache(struct bpf_verifier_log *log) { mutex_lock(&cand_cache_mutex); bpf_log(log, "vmlinux_cand_cache:"); __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); bpf_log(log, "\nmodule_cand_cache:"); __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE); bpf_log(log, "\n"); mutex_unlock(&cand_cache_mutex); } static u32 hash_cands(struct bpf_cand_cache *cands) { return jhash(cands->name, cands->name_len, 0); } static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands, struct bpf_cand_cache **cache, int cache_size) { struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size]; if (cc && cc->name_len == cands->name_len && !strncmp(cc->name, cands->name, cands->name_len)) return cc; return NULL; } static size_t sizeof_cands(int cnt) { return offsetof(struct bpf_cand_cache, cands[cnt]); } static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, struct bpf_cand_cache **cache, int cache_size) { struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands; if (*cc) { bpf_free_cands_from_cache(*cc); *cc = NULL; } new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL); if (!new_cands) { bpf_free_cands(cands); return ERR_PTR(-ENOMEM); } /* strdup the name, since it will stay in cache. * the cands->name points to strings in prog's BTF and the prog can be unloaded. */ new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL); bpf_free_cands(cands); if (!new_cands->name) { kfree(new_cands); return ERR_PTR(-ENOMEM); } *cc = new_cands; return new_cands; } #ifdef CONFIG_DEBUG_INFO_BTF_MODULES static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache, int cache_size) { struct bpf_cand_cache *cc; int i, j; for (i = 0; i < cache_size; i++) { cc = cache[i]; if (!cc) continue; if (!btf) { /* when new module is loaded purge all of module_cand_cache, * since new module might have candidates with the name * that matches cached cands. */ bpf_free_cands_from_cache(cc); cache[i] = NULL; continue; } /* when module is unloaded purge cache entries * that match module's btf */ for (j = 0; j < cc->cnt; j++) if (cc->cands[j].btf == btf) { bpf_free_cands_from_cache(cc); cache[i] = NULL; break; } } } static void purge_cand_cache(struct btf *btf) { mutex_lock(&cand_cache_mutex); __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE); mutex_unlock(&cand_cache_mutex); } #endif static struct bpf_cand_cache * bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf, int targ_start_id) { struct bpf_cand_cache *new_cands; const struct btf_type *t; const char *targ_name; size_t targ_essent_len; int n, i; n = btf_nr_types(targ_btf); for (i = targ_start_id; i < n; i++) { t = btf_type_by_id(targ_btf, i); if (btf_kind(t) != cands->kind) continue; targ_name = btf_name_by_offset(targ_btf, t->name_off); if (!targ_name) continue; /* the resched point is before strncmp to make sure that search * for non-existing name will have a chance to schedule(). */ cond_resched(); if (strncmp(cands->name, targ_name, cands->name_len) != 0) continue; targ_essent_len = bpf_core_essential_name_len(targ_name); if (targ_essent_len != cands->name_len) continue; /* most of the time there is only one candidate for a given kind+name pair */ new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL); if (!new_cands) { bpf_free_cands(cands); return ERR_PTR(-ENOMEM); } memcpy(new_cands, cands, sizeof_cands(cands->cnt)); bpf_free_cands(cands); cands = new_cands; cands->cands[cands->cnt].btf = targ_btf; cands->cands[cands->cnt].id = i; cands->cnt++; } return cands; } static struct bpf_cand_cache * bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) { struct bpf_cand_cache *cands, *cc, local_cand = {}; const struct btf *local_btf = ctx->btf; const struct btf_type *local_type; const struct btf *main_btf; size_t local_essent_len; struct btf *mod_btf; const char *name; int id; main_btf = bpf_get_btf_vmlinux(); if (IS_ERR(main_btf)) return ERR_CAST(main_btf); if (!main_btf) return ERR_PTR(-EINVAL); local_type = btf_type_by_id(local_btf, local_type_id); if (!local_type) return ERR_PTR(-EINVAL); name = btf_name_by_offset(local_btf, local_type->name_off); if (str_is_empty(name)) return ERR_PTR(-EINVAL); local_essent_len = bpf_core_essential_name_len(name); cands = &local_cand; cands->name = name; cands->kind = btf_kind(local_type); cands->name_len = local_essent_len; cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); /* cands is a pointer to stack here */ if (cc) { if (cc->cnt) return cc; goto check_modules; } /* Attempt to find target candidates in vmlinux BTF first */ cands = bpf_core_add_cands(cands, main_btf, 1); if (IS_ERR(cands)) return ERR_CAST(cands); /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */ /* populate cache even when cands->cnt == 0 */ cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); if (IS_ERR(cc)) return ERR_CAST(cc); /* if vmlinux BTF has any candidate, don't go for module BTFs */ if (cc->cnt) return cc; check_modules: /* cands is a pointer to stack here and cands->cnt == 0 */ cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE); if (cc) /* if cache has it return it even if cc->cnt == 0 */ return cc; /* If candidate is not found in vmlinux's BTF then search in module's BTFs */ spin_lock_bh(&btf_idr_lock); idr_for_each_entry(&btf_idr, mod_btf, id) { if (!btf_is_module(mod_btf)) continue; /* linear search could be slow hence unlock/lock * the IDR to avoiding holding it for too long */ btf_get(mod_btf); spin_unlock_bh(&btf_idr_lock); cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf)); btf_put(mod_btf); if (IS_ERR(cands)) return ERR_CAST(cands); spin_lock_bh(&btf_idr_lock); } spin_unlock_bh(&btf_idr_lock); /* cands is a pointer to kmalloced memory here if cands->cnt > 0 * or pointer to stack if cands->cnd == 0. * Copy it into the cache even when cands->cnt == 0 and * return the result. */ return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE); } int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn) { bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL; struct bpf_core_cand_list cands = {}; struct bpf_core_relo_res targ_res; struct bpf_core_spec *specs; const struct btf_type *type; int err; /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5" * into arrays of btf_ids of struct fields and array indices. */ specs = kcalloc(3, sizeof(*specs), GFP_KERNEL); if (!specs) return -ENOMEM; type = btf_type_by_id(ctx->btf, relo->type_id); if (!type) { bpf_log(ctx->log, "relo #%u: bad type id %u\n", relo_idx, relo->type_id); kfree(specs); return -EINVAL; } if (need_cands) { struct bpf_cand_cache *cc; int i; mutex_lock(&cand_cache_mutex); cc = bpf_core_find_cands(ctx, relo->type_id); if (IS_ERR(cc)) { bpf_log(ctx->log, "target candidate search failed for %d\n", relo->type_id); err = PTR_ERR(cc); goto out; } if (cc->cnt) { cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL); if (!cands.cands) { err = -ENOMEM; goto out; } } for (i = 0; i < cc->cnt; i++) { bpf_log(ctx->log, "CO-RE relocating %s %s: found target candidate [%d]\n", btf_kind_str[cc->kind], cc->name, cc->cands[i].id); cands.cands[i].btf = cc->cands[i].btf; cands.cands[i].id = cc->cands[i].id; } cands.len = cc->cnt; /* cand_cache_mutex needs to span the cache lookup and * copy of btf pointer into bpf_core_cand_list, * since module can be unloaded while bpf_core_calc_relo_insn * is working with module's btf. */ } err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs, &targ_res); if (err) goto out; err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx, &targ_res); out: kfree(specs); if (need_cands) { kfree(cands.cands); mutex_unlock(&cand_cache_mutex); if (ctx->log->level & BPF_LOG_LEVEL2) print_cand_cache(ctx->log); } return err; } bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, const char *field_name, u32 btf_id, const char *suffix) { struct btf *btf = reg->btf; const struct btf_type *walk_type, *safe_type; const char *tname; char safe_tname[64]; long ret, safe_id; const struct btf_member *member; u32 i; walk_type = btf_type_by_id(btf, reg->btf_id); if (!walk_type) return false; tname = btf_name_by_offset(btf, walk_type->name_off); ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix); if (ret >= sizeof(safe_tname)) return false; safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info)); if (safe_id < 0) return false; safe_type = btf_type_by_id(btf, safe_id); if (!safe_type) return false; for_each_member(i, safe_type, member) { const char *m_name = __btf_name_by_offset(btf, member->name_off); const struct btf_type *mtype = btf_type_by_id(btf, member->type); u32 id; if (!btf_type_is_ptr(mtype)) continue; btf_type_skip_modifiers(btf, mtype->type, &id); /* If we match on both type and name, the field is considered trusted. */ if (btf_id == id && !strcmp(field_name, m_name)) return true; } return false; } bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, const struct btf *reg_btf, u32 reg_id, const struct btf *arg_btf, u32 arg_id) { const char *reg_name, *arg_name, *search_needle; const struct btf_type *reg_type, *arg_type; int reg_len, arg_len, cmp_len; size_t pattern_len = sizeof(NOCAST_ALIAS_SUFFIX) - sizeof(char); reg_type = btf_type_by_id(reg_btf, reg_id); if (!reg_type) return false; arg_type = btf_type_by_id(arg_btf, arg_id); if (!arg_type) return false; reg_name = btf_name_by_offset(reg_btf, reg_type->name_off); arg_name = btf_name_by_offset(arg_btf, arg_type->name_off); reg_len = strlen(reg_name); arg_len = strlen(arg_name); /* Exactly one of the two type names may be suffixed with ___init, so * if the strings are the same size, they can't possibly be no-cast * aliases of one another. If you have two of the same type names, e.g. * they're both nf_conn___init, it would be improper to return true * because they are _not_ no-cast aliases, they are the same type. */ if (reg_len == arg_len) return false; /* Either of the two names must be the other name, suffixed with ___init. */ if ((reg_len != arg_len + pattern_len) && (arg_len != reg_len + pattern_len)) return false; if (reg_len < arg_len) { search_needle = strstr(arg_name, NOCAST_ALIAS_SUFFIX); cmp_len = reg_len; } else { search_needle = strstr(reg_name, NOCAST_ALIAS_SUFFIX); cmp_len = arg_len; } if (!search_needle) return false; /* ___init suffix must come at the end of the name */ if (*(search_needle + pattern_len) != '\0') return false; return !strncmp(reg_name, arg_name, cmp_len); } #ifdef CONFIG_BPF_JIT static int btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops, struct bpf_verifier_log *log) { struct btf_struct_ops_tab *tab, *new_tab; int i, err; tab = btf->struct_ops_tab; if (!tab) { tab = kzalloc(offsetof(struct btf_struct_ops_tab, ops[4]), GFP_KERNEL); if (!tab) return -ENOMEM; tab->capacity = 4; btf->struct_ops_tab = tab; } for (i = 0; i < tab->cnt; i++) if (tab->ops[i].st_ops == st_ops) return -EEXIST; if (tab->cnt == tab->capacity) { new_tab = krealloc(tab, offsetof(struct btf_struct_ops_tab, ops[tab->capacity * 2]), GFP_KERNEL); if (!new_tab) return -ENOMEM; tab = new_tab; tab->capacity *= 2; btf->struct_ops_tab = tab; } tab->ops[btf->struct_ops_tab->cnt].st_ops = st_ops; err = bpf_struct_ops_desc_init(&tab->ops[btf->struct_ops_tab->cnt], btf, log); if (err) return err; btf->struct_ops_tab->cnt++; return 0; } const struct bpf_struct_ops_desc * bpf_struct_ops_find_value(struct btf *btf, u32 value_id) { const struct bpf_struct_ops_desc *st_ops_list; unsigned int i; u32 cnt; if (!value_id) return NULL; if (!btf->struct_ops_tab) return NULL; cnt = btf->struct_ops_tab->cnt; st_ops_list = btf->struct_ops_tab->ops; for (i = 0; i < cnt; i++) { if (st_ops_list[i].value_id == value_id) return &st_ops_list[i]; } return NULL; } const struct bpf_struct_ops_desc * bpf_struct_ops_find(struct btf *btf, u32 type_id) { const struct bpf_struct_ops_desc *st_ops_list; unsigned int i; u32 cnt; if (!type_id) return NULL; if (!btf->struct_ops_tab) return NULL; cnt = btf->struct_ops_tab->cnt; st_ops_list = btf->struct_ops_tab->ops; for (i = 0; i < cnt; i++) { if (st_ops_list[i].type_id == type_id) return &st_ops_list[i]; } return NULL; } int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops) { struct bpf_verifier_log *log; struct btf *btf; int err = 0; btf = btf_get_module_btf(st_ops->owner); if (!btf) return check_btf_kconfigs(st_ops->owner, "struct_ops"); if (IS_ERR(btf)) return PTR_ERR(btf); log = kzalloc(sizeof(*log), GFP_KERNEL | __GFP_NOWARN); if (!log) { err = -ENOMEM; goto errout; } log->level = BPF_LOG_KERNEL; err = btf_add_struct_ops(btf, st_ops, log); errout: kfree(log); btf_put(btf); return err; } EXPORT_SYMBOL_GPL(__register_bpf_struct_ops); #endif bool btf_param_match_suffix(const struct btf *btf, const struct btf_param *arg, const char *suffix) { int suffix_len = strlen(suffix), len; const char *param_name; /* In the future, this can be ported to use BTF tagging */ param_name = btf_name_by_offset(btf, arg->name_off); if (str_is_empty(param_name)) return false; len = strlen(param_name); if (len <= suffix_len) return false; param_name += len - suffix_len; return !strncmp(param_name, suffix, suffix_len); }
54 54 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGETLB_H #define _LINUX_HUGETLB_H #include <linux/mm.h> #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/fs.h> #include <linux/hugetlb_inline.h> #include <linux/cgroup.h> #include <linux/page_ref.h> #include <linux/list.h> #include <linux/kref.h> #include <linux/pgtable.h> #include <linux/gfp.h> #include <linux/userfaultfd_k.h> struct ctl_table; struct user_struct; struct mmu_gather; struct node; void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE #include <linux/pagemap.h> #include <linux/shm.h> #include <asm/tlbflush.h> /* * For HugeTLB page, there are more metadata to save in the struct page. But * the head struct page cannot meet our needs, so we have to abuse other tail * struct page to store the metadata. */ #define __NR_USED_SUBPAGE 3 struct hugepage_subpool { spinlock_t lock; long count; long max_hpages; /* Maximum huge pages or -1 if no maximum. */ long used_hpages; /* Used count against maximum, includes */ /* both allocated and reserved pages. */ struct hstate *hstate; long min_hpages; /* Minimum huge pages or -1 if no minimum. */ long rsv_hpages; /* Pages reserved against global pool to */ /* satisfy minimum size. */ }; struct resv_map { struct kref refs; spinlock_t lock; struct list_head regions; long adds_in_progress; struct list_head region_cache; long region_cache_count; struct rw_semaphore rw_sema; #ifdef CONFIG_CGROUP_HUGETLB /* * On private mappings, the counter to uncharge reservations is stored * here. If these fields are 0, then either the mapping is shared, or * cgroup accounting is disabled for this resv_map. */ struct page_counter *reservation_counter; unsigned long pages_per_hpage; struct cgroup_subsys_state *css; #endif }; /* * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. * * The region data structures are embedded into a resv_map and protected * by a resv_map's lock. The set of regions within the resv_map represent * reservations for huge pages, or huge pages that have already been * instantiated within the map. The from and to elements are huge page * indices into the associated mapping. from indicates the starting index * of the region. to represents the first index past the end of the region. * * For example, a file region structure with from == 0 and to == 4 represents * four huge pages in a mapping. It is important to note that the to element * represents the first element past the end of the region. This is used in * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. * * Interval notation of the form [from, to) will be used to indicate that * the endpoint from is inclusive and to is exclusive. */ struct file_region { struct list_head link; long from; long to; #ifdef CONFIG_CGROUP_HUGETLB /* * On shared mappings, each reserved region appears as a struct * file_region in resv_map. These fields hold the info needed to * uncharge each reservation. */ struct page_counter *reservation_counter; struct cgroup_subsys_state *css; #endif }; struct hugetlb_vma_lock { struct kref refs; struct rw_semaphore rw_sema; struct vm_area_struct *vma; }; extern struct resv_map *resv_map_alloc(void); void resv_map_release(struct kref *ref); extern spinlock_t hugetlb_lock; extern int hugetlb_max_hstate __read_mostly; #define for_each_hstate(h) \ for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages); void hugepage_put_subpool(struct hugepage_subpool *spool); void hugetlb_dup_vma_private(struct vm_area_struct *vma); void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *, zap_flags_t); void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); void hugetlb_show_meminfo_node(int nid); unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); #ifdef CONFIG_USERFAULTFD int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void folio_putback_active_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud); bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address); struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages[MAX_NUMNODES]; /* arch callbacks */ #ifndef CONFIG_HIGHPTE /* * pte_offset_huge() and pte_alloc_huge() are helpers for those architectures * which may go down to the lowest PTE level in their huge_pte_offset() and * huge_pte_alloc(): to avoid reliance on pte_offset_map() without pte_unmap(). */ static inline pte_t *pte_offset_huge(pmd_t *pmd, unsigned long address) { return pte_offset_kernel(pmd, address); } static inline pte_t *pte_alloc_huge(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { return pte_alloc(mm, pmd) ? NULL : pte_offset_huge(pmd, address); } #endif pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); /* * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE. * Returns the pte_t* if found, or NULL if the address is not mapped. * * IMPORTANT: we should normally not directly call this function, instead * this is only a common interface to implement arch-specific * walker. Please use hugetlb_walk() instead, because that will attempt to * verify the locking for you. * * Since this function will walk all the pgtable pages (including not only * high-level pgtable page, but also PUD entry that can be unshared * concurrently for VM_SHARED), the caller of this function should be * responsible of its thread safety. One can follow this rule: * * (1) For private mappings: pmd unsharing is not possible, so holding the * mmap_lock for either read or write is sufficient. Most callers * already hold the mmap_lock, so normally, no special action is * required. * * (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged * pgtable page can go away from under us! It can be done by a pmd * unshare with a follow up munmap() on the other process), then we * need either: * * (2.1) hugetlb vma lock read or write held, to make sure pmd unshare * won't happen upon the range (it also makes sure the pte_t we * read is the right and stable one), or, * * (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make * sure even if unshare happened the racy unmap() will wait until * i_mmap_rwsem is released. * * Option (2.1) is the safest, which guarantees pte stability from pmd * sharing pov, until the vma lock released. Option (2.2) doesn't protect * a concurrent pmd unshare, but it makes sure the pgtable page is safe to * access. */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); extern void __hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *begin, unsigned long *end); extern void __hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details); static inline void hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { if (is_vm_hugetlb_page(vma)) __hugetlb_zap_begin(vma, start, end); } static inline void hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details) { if (is_vm_hugetlb_page(vma)) __hugetlb_zap_end(vma, details); } void hugetlb_vma_lock_read(struct vm_area_struct *vma); void hugetlb_vma_unlock_read(struct vm_area_struct *vma); void hugetlb_vma_lock_write(struct vm_area_struct *vma); void hugetlb_vma_unlock_write(struct vm_area_struct *vma); int hugetlb_vma_trylock_write(struct vm_area_struct *vma); void hugetlb_vma_assert_locked(struct vm_area_struct *vma); void hugetlb_vma_lock_release(struct kref *kref); long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); bool is_hugetlb_entry_migration(pte_t pte); bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) { } static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma) { } static inline unsigned long hugetlb_total_pages(void) { return 0; } static inline struct address_space *hugetlb_folio_mapping_lock_write( struct folio *folio) { return NULL; } static inline int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } static inline void adjust_range_if_pmd_sharing_possible( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } static inline void hugetlb_zap_begin( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } static inline void hugetlb_zap_end( struct vm_area_struct *vma, struct zap_details *details) { } static inline int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { BUG(); return 0; } static inline int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len) { BUG(); return 0; } static inline void hugetlb_report_meminfo(struct seq_file *m) { } static inline int hugetlb_report_node_meminfo(char *buf, int len, int nid) { return 0; } static inline void hugetlb_show_meminfo_node(int nid) { } static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { return -EINVAL; } static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma) { } static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma) { } static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma) { } static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma) { } static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma) { return 1; } static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma) { } static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { return 0; } static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { BUG(); } #ifdef CONFIG_USERFAULTFD static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { BUG(); return 0; } #endif /* CONFIG_USERFAULTFD */ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { return NULL; } static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list) { return false; } static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { return 0; } static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } static inline void folio_putback_active_hugetlb(struct folio *folio) { } static inline void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { } static inline long hugetlb_change_protection( struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { return 0; } static inline void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { BUG(); } static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { BUG(); return 0; } static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write static inline int pgd_write(pgd_t pgd) { BUG(); return 0; } #endif #define HUGETLB_ANON_FILE "anon_hugepage" enum { /* * The file will be used as an shm file so shmfs accounting rules * apply */ HUGETLB_SHMFS_INODE = 1, /* * The file is being created on the internal vfs mount and shmfs * accounting rules do not apply */ HUGETLB_ANONHUGE_INODE = 2, }; #ifdef CONFIG_HUGETLBFS struct hugetlbfs_sb_info { long max_inodes; /* inodes allowed */ long free_inodes; /* inodes free */ spinlock_t stat_lock; struct hstate *hstate; struct hugepage_subpool *spool; kuid_t uid; kgid_t gid; umode_t mode; }; static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) { return sb->s_fs_info; } struct hugetlbfs_inode_info { struct inode vfs_inode; unsigned int seals; }; static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) { return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); } extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, int creat_flags, int page_size_log); static inline bool is_file_hugepages(const struct file *file) { return file->f_op->fop_flags & FOP_HUGE_PAGES; } static inline struct hstate *hstate_inode(struct inode *i) { return HUGETLBFS_SB(i->i_sb)->hstate; } #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) false static inline struct file * hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, int creat_flags, int page_size_log) { return ERR_PTR(-ENOSYS); } static inline struct hstate *hstate_inode(struct inode *i) { return NULL; } #endif /* !CONFIG_HUGETLBFS */ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); /* * huegtlb page specific state flags. These flags are located in page.private * of the hugetlb head page. Functions created via the below macros should be * used to manipulate these flags. * * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at * allocation time. Cleared when page is fully instantiated. Free * routine checks flag to restore a reservation on error paths. * Synchronization: Examined or modified by code that knows it has * the only reference to page. i.e. After allocation but before use * or when the page is being freed. * HPG_migratable - Set after a newly allocated page is added to the page * cache and/or page tables. Indicates the page is a candidate for * migration. * Synchronization: Initially set after new page allocation with no * locking. When examined and modified during migration processing * (isolate, migrate, putback) the hugetlb_lock is held. * HPG_temporary - Set on a page that is temporarily allocated from the buddy * allocator. Typically used for migration target pages when no pages * are available in the pool. The hugetlb free page path will * immediately free pages with this flag set to the buddy allocator. * Synchronization: Can be set after huge page allocation from buddy when * code knows it has only reference. All other examinations and * modifications require hugetlb_lock. * HPG_freed - Set when page is on the free lists. * Synchronization: hugetlb_lock held for examination and modification. * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed. * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page * that is not tracked by raw_hwp_page list. */ enum hugetlb_page_flags { HPG_restore_reserve = 0, HPG_migratable, HPG_temporary, HPG_freed, HPG_vmemmap_optimized, HPG_raw_hwp_unreliable, __NR_HPAGEFLAGS, }; /* * Macros to create test, set and clear function definitions for * hugetlb specific page flags. */ #ifdef CONFIG_HUGETLB_PAGE #define TESTHPAGEFLAG(uname, flname) \ static __always_inline \ bool folio_test_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ return test_bit(HPG_##flname, private); \ } #define SETHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_set_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ set_bit(HPG_##flname, private); \ } #define CLEARHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_clear_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ clear_bit(HPG_##flname, private); \ } #else #define TESTHPAGEFLAG(uname, flname) \ static inline bool \ folio_test_hugetlb_##flname(struct folio *folio) \ { return 0; } #define SETHPAGEFLAG(uname, flname) \ static inline void \ folio_set_hugetlb_##flname(struct folio *folio) \ { } #define CLEARHPAGEFLAG(uname, flname) \ static inline void \ folio_clear_hugetlb_##flname(struct folio *folio) \ { } #endif #define HPAGEFLAG(uname, flname) \ TESTHPAGEFLAG(uname, flname) \ SETHPAGEFLAG(uname, flname) \ CLEARHPAGEFLAG(uname, flname) \ /* * Create functions associated with hugetlb page flags */ HPAGEFLAG(RestoreReserve, restore_reserve) HPAGEFLAG(Migratable, migratable) HPAGEFLAG(Temporary, temporary) HPAGEFLAG(Freed, freed) HPAGEFLAG(VmemmapOptimized, vmemmap_optimized) HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) #ifdef CONFIG_HUGETLB_PAGE #define HSTATE_NAME_LEN 32 /* Defines one hugetlb page size */ struct hstate { struct mutex resize_lock; struct lock_class_key resize_key; int next_nid_to_alloc; int next_nid_to_free; unsigned int order; unsigned int demote_order; unsigned long mask; unsigned long max_huge_pages; unsigned long nr_huge_pages; unsigned long free_huge_pages; unsigned long resv_huge_pages; unsigned long surplus_huge_pages; unsigned long nr_overcommit_huge_pages; struct list_head hugepage_activelist; struct list_head hugepage_freelists[MAX_NUMNODES]; unsigned int max_huge_pages_node[MAX_NUMNODES]; unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; char name[HSTATE_NAME_LEN]; }; struct huge_bootmem_page { struct list_head list; struct hstate *hstate; }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback); struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio); /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); int __init alloc_bootmem_huge_page(struct hstate *h, int nid); bool __init hugetlb_node_alloc_supported(void); void __init hugetlb_add_hstate(unsigned order); bool __init arch_hugetlb_valid_size(unsigned long size); struct hstate *size_to_hstate(unsigned long size); #ifndef HUGE_MAX_HSTATE #define HUGE_MAX_HSTATE 1 #endif extern struct hstate hstates[HUGE_MAX_HSTATE]; extern unsigned int default_hstate_idx; #define default_hstate (hstates[default_hstate_idx]) static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { return folio->_hugetlb_subpool; } static inline void hugetlb_set_folio_subpool(struct folio *folio, struct hugepage_subpool *subpool) { folio->_hugetlb_subpool = subpool; } static inline struct hstate *hstate_file(struct file *f) { return hstate_inode(file_inode(f)); } static inline struct hstate *hstate_sizelog(int page_size_log) { if (!page_size_log) return &default_hstate; if (page_size_log < BITS_PER_LONG) return size_to_hstate(1UL << page_size_log); return NULL; } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) { return hstate_file(vma->vm_file); } static inline unsigned long huge_page_size(const struct hstate *h) { return (unsigned long)PAGE_SIZE << h->order; } extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma); extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); static inline unsigned long huge_page_mask(struct hstate *h) { return h->mask; } static inline unsigned int huge_page_order(struct hstate *h) { return h->order; } static inline unsigned huge_page_shift(struct hstate *h) { return h->order + PAGE_SHIFT; } static inline bool hstate_is_gigantic(struct hstate *h) { return huge_page_order(h) > MAX_PAGE_ORDER; } static inline unsigned int pages_per_huge_page(const struct hstate *h) { return 1 << h->order; } static inline unsigned int blocks_per_huge_page(struct hstate *h) { return huge_page_size(h) / 512; } static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, struct address_space *mapping, pgoff_t idx) { return filemap_lock_folio(mapping, idx << huge_page_order(h)); } #include <asm/hugetlb.h> #ifndef is_hugepage_only_range static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { return 0; } #define is_hugepage_only_range is_hugepage_only_range #endif #ifndef arch_clear_hugetlb_flags static inline void arch_clear_hugetlb_flags(struct folio *folio) { } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags #endif #ifndef arch_make_huge_pte static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { return pte_mkhuge(entry); } #endif static inline struct hstate *folio_hstate(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); return size_to_hstate(folio_size(folio)); } static inline unsigned hstate_index_to_shift(unsigned index) { return hstates[index].order + PAGE_SHIFT; } static inline int hstate_index(struct hstate *h) { return h - hstates; } int dissolve_free_hugetlb_folio(struct folio *folio); int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn); #ifdef CONFIG_MEMORY_FAILURE extern void folio_clear_hugetlb_hwpoison(struct folio *folio); #else static inline void folio_clear_hugetlb_hwpoison(struct folio *folio) { } #endif #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION #ifndef arch_hugetlb_migration_supported static inline bool arch_hugetlb_migration_supported(struct hstate *h) { if ((huge_page_shift(h) == PMD_SHIFT) || (huge_page_shift(h) == PUD_SHIFT) || (huge_page_shift(h) == PGDIR_SHIFT)) return true; else return false; } #endif #else static inline bool arch_hugetlb_migration_supported(struct hstate *h) { return false; } #endif static inline bool hugepage_migration_supported(struct hstate *h) { return arch_hugetlb_migration_supported(h); } /* * Movability check is different as compared to migration check. * It determines whether or not a huge page should be placed on * movable zone or not. Movability of any huge page should be * required only if huge page size is supported for migration. * There won't be any reason for the huge page to be movable if * it is not migratable to start with. Also the size of the huge * page should be large enough to be placed under a movable zone * and still feasible enough to be migratable. Just the presence * in movable zone does not make the migration feasible. * * So even though large huge page sizes like the gigantic ones * are migratable they should not be movable because its not * feasible to migrate them from movable zone. */ static inline bool hugepage_movable_supported(struct hstate *h) { if (!hugepage_migration_supported(h)) return false; if (hstate_is_gigantic(h)) return false; return true; } /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { gfp_t gfp = __GFP_COMP | __GFP_NOWARN; gfp |= hugepage_movable_supported(h) ? GFP_HIGHUSER_MOVABLE : GFP_HIGHUSER; return gfp; } static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) { gfp_t modified_mask = htlb_alloc_mask(h); /* Some callers might want to enforce node */ modified_mask |= (gfp_mask & __GFP_THISNODE); modified_mask |= (gfp_mask & __GFP_NOWARN); return modified_mask; } static inline bool htlb_allow_alloc_fallback(int reason) { bool allowed_fallback = false; /* * Note: the memory offline, memory failure and migration syscalls will * be allowed to fallback to other nodes due to lack of a better chioce, * that might break the per-node hugetlb pool. While other cases will * set the __GFP_THISNODE to avoid breaking the per-node hugetlb pool. */ switch (reason) { case MR_MEMORY_HOTPLUG: case MR_MEMORY_FAILURE: case MR_SYSCALL: case MR_MEMPOLICY_MBIND: allowed_fallback = true; break; default: break; } return allowed_fallback; } static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { const unsigned long size = huge_page_size(h); VM_WARN_ON(size == PAGE_SIZE); /* * hugetlb must use the exact same PT locks as core-mm page table * walkers would. When modifying a PTE table, hugetlb must take the * PTE PT lock, when modifying a PMD table, hugetlb must take the PMD * PT lock etc. * * The expectation is that any hugetlb folio smaller than a PMD is * always mapped into a single PTE table and that any hugetlb folio * smaller than a PUD (but at least as big as a PMD) is always mapped * into a single PMD table. * * If that does not hold for an architecture, then that architecture * must disable split PT locks such that all *_lockptr() functions * will give us the same result: the per-MM PT lock. * * Note that with e.g., CONFIG_PGTABLE_LEVELS=2 where * PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE, we'd use pud_lockptr() * and core-mm would use pmd_lockptr(). However, in such configurations * split PMD locks are disabled -- they don't make sense on a single * PGDIR page table -- and the end result is the same. */ if (size >= PUD_SIZE) return pud_lockptr(mm, (pud_t *) pte); else if (size >= PMD_SIZE || IS_ENABLED(CONFIG_HIGHPTE)) return pmd_lockptr(mm, (pmd_t *) pte); /* pte_alloc_huge() only applies with !CONFIG_HIGHPTE */ return ptep_lockptr(mm, pte); } #ifndef hugepages_supported /* * Some platform decide whether they support huge pages at boot * time. Some of them, such as powerpc, set HPAGE_SHIFT to 0 * when there is no such support */ #define hugepages_supported() (HPAGE_SHIFT != 0) #endif void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm); static inline void hugetlb_count_init(struct mm_struct *mm) { atomic_long_set(&mm->hugetlb_usage, 0); } static inline void hugetlb_count_add(long l, struct mm_struct *mm) { atomic_long_add(l, &mm->hugetlb_usage); } static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { atomic_long_sub(l, &mm->hugetlb_usage); } #ifndef huge_ptep_modify_prot_start #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); } #endif #ifndef huge_ptep_modify_prot_commit #define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { unsigned long psize = huge_page_size(hstate_vma(vma)); set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } #endif #ifdef CONFIG_NUMA void hugetlb_register_node(struct node *node); void hugetlb_unregister_node(struct node *node); #endif /* * Check if a given raw @page in a hugepage is HWPOISON. */ bool is_raw_hwpoison_page_in_hugepage(struct page *page); static inline unsigned long huge_page_mask_align(struct file *file) { return PAGE_MASK & ~huge_page_mask(hstate_file(file)); } #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; static inline unsigned long huge_page_mask_align(struct file *file) { return 0; } static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { return NULL; } static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, struct address_space *mapping, pgoff_t idx) { return NULL; } static inline int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { return -ENOMEM; } static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { return NULL; } static inline struct folio * alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { return NULL; } static inline struct folio * alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { return NULL; } static inline int __alloc_bootmem_huge_page(struct hstate *h) { return 0; } static inline struct hstate *hstate_file(struct file *f) { return NULL; } static inline struct hstate *hstate_sizelog(int page_size_log) { return NULL; } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) { return NULL; } static inline struct hstate *folio_hstate(struct folio *folio) { return NULL; } static inline struct hstate *size_to_hstate(unsigned long size) { return NULL; } static inline unsigned long huge_page_size(struct hstate *h) { return PAGE_SIZE; } static inline unsigned long huge_page_mask(struct hstate *h) { return PAGE_MASK; } static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; } static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; } static inline unsigned int huge_page_order(struct hstate *h) { return 0; } static inline unsigned int huge_page_shift(struct hstate *h) { return PAGE_SHIFT; } static inline bool hstate_is_gigantic(struct hstate *h) { return false; } static inline unsigned int pages_per_huge_page(struct hstate *h) { return 1; } static inline unsigned hstate_index_to_shift(unsigned index) { return 0; } static inline int hstate_index(struct hstate *h) { return 0; } static inline int dissolve_free_hugetlb_folio(struct folio *folio) { return 0; } static inline int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn) { return 0; } static inline bool hugepage_migration_supported(struct hstate *h) { return false; } static inline bool hugepage_movable_supported(struct hstate *h) { return false; } static inline gfp_t htlb_alloc_mask(struct hstate *h) { return 0; } static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) { return 0; } static inline bool htlb_allow_alloc_fallback(int reason) { return false; } static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { return &mm->page_table_lock; } static inline void hugetlb_count_init(struct mm_struct *mm) { } static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) { } static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { } static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { #ifdef CONFIG_MMU return ptep_get(ptep); #else return *ptep; #endif } static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz) { } static inline void hugetlb_register_node(struct node *node) { } static inline void hugetlb_unregister_node(struct node *node) { } static inline bool hugetlbfs_pagecache_present( struct hstate *h, struct vm_area_struct *vma, unsigned long address) { return false; } #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { spinlock_t *ptl; ptl = huge_pte_lockptr(h, mm, pte); spin_lock(ptl); return ptl; } #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) extern void __init hugetlb_cma_reserve(int order); #else static inline __init void hugetlb_cma_reserve(int order) { } #endif #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static inline bool hugetlb_pmd_shared(pte_t *pte) { return page_count(virt_to_page(pte)) > 1; } #else static inline bool hugetlb_pmd_shared(pte_t *pte) { return false; } #endif bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE /* * ARCHes with special requirements for evicting HUGETLB backing TLB entries can * implement this. */ #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #endif static inline bool __vma_shareable_lock(struct vm_area_struct *vma) { return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; } bool __vma_private_lock(struct vm_area_struct *vma); /* * Safe version of huge_pte_offset() to check the locks. See comments * above huge_pte_offset(). */ static inline pte_t * hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { #if defined(CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING) && defined(CONFIG_LOCKDEP) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; /* * If pmd sharing possible, locking needed to safely walk the * hugetlb pgtables. More information can be found at the comment * above huge_pte_offset() in the same file. * * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP. */ if (__vma_shareable_lock(vma)) WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) && !lockdep_is_held( &vma->vm_file->f_mapping->i_mmap_rwsem)); #endif return huge_pte_offset(vma->vm_mm, addr, sz); } #endif /* _LINUX_HUGETLB_H */
5 30 23 4 19 12 1 3 3 33 32 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "main.h" #include <linux/array_size.h> #include <linux/atomic.h> #include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/crc32c.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/kobject.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> #include <net/dsfield.h> #include <net/genetlink.h> #include <net/rtnetlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bat_iv_ogm.h" #include "bat_v.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "gateway_common.h" #include "hard-interface.h" #include "log.h" #include "multicast.h" #include "netlink.h" #include "network-coding.h" #include "originator.h" #include "routing.h" #include "send.h" #include "soft-interface.h" #include "tp_meter.h" #include "translation-table.h" /* List manipulations on hardif_list have to be rtnl_lock()'ed, * list traversals just rcu-locked */ struct list_head batadv_hardif_list; unsigned int batadv_hardif_generation; static int (*batadv_rx_handler[256])(struct sk_buff *skb, struct batadv_hard_iface *recv_if); unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; struct workqueue_struct *batadv_event_workqueue; static void batadv_recv_handler_init(void); #define BATADV_UEV_TYPE_VAR "BATTYPE=" #define BATADV_UEV_ACTION_VAR "BATACTION=" #define BATADV_UEV_DATA_VAR "BATDATA=" static char *batadv_uev_action_str[] = { "add", "del", "change", "loopdetect", }; static char *batadv_uev_type_str[] = { "gw", "bla", }; static int __init batadv_init(void) { int ret; ret = batadv_tt_cache_init(); if (ret < 0) return ret; INIT_LIST_HEAD(&batadv_hardif_list); batadv_algo_init(); batadv_recv_handler_init(); batadv_v_init(); batadv_iv_init(); batadv_nc_init(); batadv_tp_meter_init(); batadv_event_workqueue = create_singlethread_workqueue("bat_events"); if (!batadv_event_workqueue) goto err_create_wq; register_netdevice_notifier(&batadv_hard_if_notifier); rtnl_link_register(&batadv_link_ops); batadv_netlink_register(); pr_info("B.A.T.M.A.N. advanced %s (compatibility version %i) loaded\n", BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION); return 0; err_create_wq: batadv_tt_cache_destroy(); return -ENOMEM; } static void __exit batadv_exit(void) { batadv_netlink_unregister(); rtnl_link_unregister(&batadv_link_ops); unregister_netdevice_notifier(&batadv_hard_if_notifier); destroy_workqueue(batadv_event_workqueue); batadv_event_workqueue = NULL; rcu_barrier(); batadv_tt_cache_destroy(); } /** * batadv_mesh_init() - Initialize soft interface * @soft_iface: netdev struct of the soft interface * * Return: 0 on success or negative error number in case of failure */ int batadv_mesh_init(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); int ret; spin_lock_init(&bat_priv->forw_bat_list_lock); spin_lock_init(&bat_priv->forw_bcast_list_lock); spin_lock_init(&bat_priv->tt.changes_list_lock); spin_lock_init(&bat_priv->tt.req_list_lock); spin_lock_init(&bat_priv->tt.roam_list_lock); spin_lock_init(&bat_priv->tt.last_changeset_lock); spin_lock_init(&bat_priv->tt.commit_lock); spin_lock_init(&bat_priv->gw.list_lock); #ifdef CONFIG_BATMAN_ADV_MCAST spin_lock_init(&bat_priv->mcast.mla_lock); spin_lock_init(&bat_priv->mcast.want_lists_lock); #endif spin_lock_init(&bat_priv->tvlv.container_list_lock); spin_lock_init(&bat_priv->tvlv.handler_list_lock); spin_lock_init(&bat_priv->softif_vlan_list_lock); spin_lock_init(&bat_priv->tp_list_lock); INIT_HLIST_HEAD(&bat_priv->forw_bat_list); INIT_HLIST_HEAD(&bat_priv->forw_bcast_list); INIT_HLIST_HEAD(&bat_priv->gw.gateway_list); #ifdef CONFIG_BATMAN_ADV_MCAST INIT_HLIST_HEAD(&bat_priv->mcast.want_all_unsnoopables_list); INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv4_list); INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv6_list); #endif INIT_LIST_HEAD(&bat_priv->tt.changes_list); INIT_HLIST_HEAD(&bat_priv->tt.req_list); INIT_LIST_HEAD(&bat_priv->tt.roam_list); #ifdef CONFIG_BATMAN_ADV_MCAST INIT_HLIST_HEAD(&bat_priv->mcast.mla_list); #endif INIT_HLIST_HEAD(&bat_priv->tvlv.container_list); INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list); INIT_HLIST_HEAD(&bat_priv->softif_vlan_list); INIT_HLIST_HEAD(&bat_priv->tp_list); bat_priv->gw.generation = 0; ret = batadv_originator_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_orig; } ret = batadv_tt_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_tt; } ret = batadv_v_mesh_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_v; } ret = batadv_bla_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_bla; } ret = batadv_dat_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_dat; } ret = batadv_nc_mesh_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_nc; } batadv_gw_init(bat_priv); batadv_mcast_init(bat_priv); atomic_set(&bat_priv->gw.reselect, 0); atomic_set(&bat_priv->mesh_state, BATADV_MESH_ACTIVE); return 0; err_nc: batadv_dat_free(bat_priv); err_dat: batadv_bla_free(bat_priv); err_bla: batadv_v_mesh_free(bat_priv); err_v: batadv_tt_free(bat_priv); err_tt: batadv_originator_free(bat_priv); err_orig: batadv_purge_outstanding_packets(bat_priv, NULL); atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); return ret; } /** * batadv_mesh_free() - Deinitialize soft interface * @soft_iface: netdev struct of the soft interface */ void batadv_mesh_free(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); batadv_purge_outstanding_packets(bat_priv, NULL); batadv_gw_node_free(bat_priv); batadv_v_mesh_free(bat_priv); batadv_nc_mesh_free(bat_priv); batadv_dat_free(bat_priv); batadv_bla_free(bat_priv); batadv_mcast_free(bat_priv); /* Free the TT and the originator tables only after having terminated * all the other depending components which may use these structures for * their purposes. */ batadv_tt_free(bat_priv); /* Since the originator table clean up routine is accessing the TT * tables as well, it has to be invoked after the TT tables have been * freed and marked as empty. This ensures that no cleanup RCU callbacks * accessing the TT data are scheduled for later execution. */ batadv_originator_free(bat_priv); batadv_gw_free(bat_priv); free_percpu(bat_priv->bat_counters); bat_priv->bat_counters = NULL; atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); } /** * batadv_is_my_mac() - check if the given mac address belongs to any of the * real interfaces in the current mesh * @bat_priv: the bat priv with all the soft interface information * @addr: the address to check * * Return: 'true' if the mac address was found, false otherwise. */ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) { const struct batadv_hard_iface *hard_iface; bool is_my_mac = false; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (batadv_compare_eth(hard_iface->net_dev->dev_addr, addr)) { is_my_mac = true; break; } } rcu_read_unlock(); return is_my_mac; } /** * batadv_max_header_len() - calculate maximum encapsulation overhead for a * payload packet * * Return: the maximum encapsulation overhead in bytes. */ int batadv_max_header_len(void) { int header_len = 0; header_len = max_t(int, header_len, sizeof(struct batadv_unicast_packet)); header_len = max_t(int, header_len, sizeof(struct batadv_unicast_4addr_packet)); header_len = max_t(int, header_len, sizeof(struct batadv_bcast_packet)); #ifdef CONFIG_BATMAN_ADV_NC header_len = max_t(int, header_len, sizeof(struct batadv_coded_packet)); #endif return header_len + ETH_HLEN; } /** * batadv_skb_set_priority() - sets skb priority according to packet content * @skb: the packet to be sent * @offset: offset to the packet content * * This function sets a value between 256 and 263 (802.1d priority), which * can be interpreted by the cfg80211 or other drivers. */ void batadv_skb_set_priority(struct sk_buff *skb, int offset) { struct iphdr ip_hdr_tmp, *ip_hdr; struct ipv6hdr ip6_hdr_tmp, *ip6_hdr; struct ethhdr ethhdr_tmp, *ethhdr; struct vlan_ethhdr *vhdr, vhdr_tmp; u32 prio; /* already set, do nothing */ if (skb->priority >= 256 && skb->priority <= 263) return; ethhdr = skb_header_pointer(skb, offset, sizeof(*ethhdr), &ethhdr_tmp); if (!ethhdr) return; switch (ethhdr->h_proto) { case htons(ETH_P_8021Q): vhdr = skb_header_pointer(skb, offset + sizeof(*vhdr), sizeof(*vhdr), &vhdr_tmp); if (!vhdr) return; prio = ntohs(vhdr->h_vlan_TCI) & VLAN_PRIO_MASK; prio = prio >> VLAN_PRIO_SHIFT; break; case htons(ETH_P_IP): ip_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr), sizeof(*ip_hdr), &ip_hdr_tmp); if (!ip_hdr) return; prio = (ipv4_get_dsfield(ip_hdr) & 0xfc) >> 5; break; case htons(ETH_P_IPV6): ip6_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr), sizeof(*ip6_hdr), &ip6_hdr_tmp); if (!ip6_hdr) return; prio = (ipv6_get_dsfield(ip6_hdr) & 0xfc) >> 5; break; default: return; } skb->priority = prio + 256; } static int batadv_recv_unhandled_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { kfree_skb(skb); return NET_RX_DROP; } /* incoming packets with the batman ethertype received on any active hard * interface */ /** * batadv_batman_skb_recv() - Handle incoming message from an hard interface * @skb: the received packet * @dev: the net device that the packet was received on * @ptype: packet type of incoming packet (ETH_P_BATMAN) * @orig_dev: the original receive net device (e.g. bonded device) * * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure */ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { struct batadv_priv *bat_priv; struct batadv_ogm_packet *batadv_ogm_packet; struct batadv_hard_iface *hard_iface; u8 idx; hard_iface = container_of(ptype, struct batadv_hard_iface, batman_adv_ptype); /* Prevent processing a packet received on an interface which is getting * shut down otherwise the packet may trigger de-reference errors * further down in the receive path. */ if (!kref_get_unless_zero(&hard_iface->refcount)) goto err_out; skb = skb_share_check(skb, GFP_ATOMIC); /* skb was released by skb_share_check() */ if (!skb) goto err_put; /* packet should hold at least type and version */ if (unlikely(!pskb_may_pull(skb, 2))) goto err_free; /* expect a valid ethernet header here. */ if (unlikely(skb->mac_len != ETH_HLEN || !skb_mac_header(skb))) goto err_free; if (!hard_iface->soft_iface) goto err_free; bat_priv = netdev_priv(hard_iface->soft_iface); if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto err_free; /* discard frames on not active interfaces */ if (hard_iface->if_status != BATADV_IF_ACTIVE) goto err_free; batadv_ogm_packet = (struct batadv_ogm_packet *)skb->data; if (batadv_ogm_packet->version != BATADV_COMPAT_VERSION) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: incompatible batman version (%i)\n", batadv_ogm_packet->version); goto err_free; } /* reset control block to avoid left overs from previous users */ memset(skb->cb, 0, sizeof(struct batadv_skb_cb)); idx = batadv_ogm_packet->packet_type; (*batadv_rx_handler[idx])(skb, hard_iface); batadv_hardif_put(hard_iface); /* return NET_RX_SUCCESS in any case as we * most probably dropped the packet for * routing-logical reasons. */ return NET_RX_SUCCESS; err_free: kfree_skb(skb); err_put: batadv_hardif_put(hard_iface); err_out: return NET_RX_DROP; } static void batadv_recv_handler_init(void) { int i; for (i = 0; i < ARRAY_SIZE(batadv_rx_handler); i++) batadv_rx_handler[i] = batadv_recv_unhandled_packet; for (i = BATADV_UNICAST_MIN; i <= BATADV_UNICAST_MAX; i++) batadv_rx_handler[i] = batadv_recv_unhandled_unicast_packet; /* compile time checks for sizes */ BUILD_BUG_ON(sizeof(struct batadv_bla_claim_dst) != 6); BUILD_BUG_ON(sizeof(struct batadv_ogm_packet) != 24); BUILD_BUG_ON(sizeof(struct batadv_icmp_header) != 20); BUILD_BUG_ON(sizeof(struct batadv_icmp_packet) != 20); BUILD_BUG_ON(sizeof(struct batadv_icmp_packet_rr) != 116); BUILD_BUG_ON(sizeof(struct batadv_unicast_packet) != 10); BUILD_BUG_ON(sizeof(struct batadv_unicast_4addr_packet) != 18); BUILD_BUG_ON(sizeof(struct batadv_frag_packet) != 20); BUILD_BUG_ON(sizeof(struct batadv_bcast_packet) != 14); BUILD_BUG_ON(sizeof(struct batadv_coded_packet) != 46); BUILD_BUG_ON(sizeof(struct batadv_unicast_tvlv_packet) != 20); BUILD_BUG_ON(sizeof(struct batadv_tvlv_hdr) != 4); BUILD_BUG_ON(sizeof(struct batadv_tvlv_gateway_data) != 8); BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_vlan_data) != 8); BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_change) != 12); BUILD_BUG_ON(sizeof(struct batadv_tvlv_roam_adv) != 8); i = sizeof_field(struct sk_buff, cb); BUILD_BUG_ON(sizeof(struct batadv_skb_cb) > i); /* broadcast packet */ batadv_rx_handler[BATADV_BCAST] = batadv_recv_bcast_packet; /* multicast packet */ batadv_rx_handler[BATADV_MCAST] = batadv_recv_mcast_packet; /* unicast packets ... */ /* unicast with 4 addresses packet */ batadv_rx_handler[BATADV_UNICAST_4ADDR] = batadv_recv_unicast_packet; /* unicast packet */ batadv_rx_handler[BATADV_UNICAST] = batadv_recv_unicast_packet; /* unicast tvlv packet */ batadv_rx_handler[BATADV_UNICAST_TVLV] = batadv_recv_unicast_tvlv; /* batman icmp packet */ batadv_rx_handler[BATADV_ICMP] = batadv_recv_icmp_packet; /* Fragmented packets */ batadv_rx_handler[BATADV_UNICAST_FRAG] = batadv_recv_frag_packet; } /** * batadv_recv_handler_register() - Register handler for batman-adv packet type * @packet_type: batadv_packettype which should be handled * @recv_handler: receive handler for the packet type * * Return: 0 on success or negative error number in case of failure */ int batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)) { int (*curr)(struct sk_buff *skb, struct batadv_hard_iface *recv_if); curr = batadv_rx_handler[packet_type]; if (curr != batadv_recv_unhandled_packet && curr != batadv_recv_unhandled_unicast_packet) return -EBUSY; batadv_rx_handler[packet_type] = recv_handler; return 0; } /** * batadv_recv_handler_unregister() - Unregister handler for packet type * @packet_type: batadv_packettype which should no longer be handled */ void batadv_recv_handler_unregister(u8 packet_type) { batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet; } /** * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in * the header * @skb: skb pointing to fragmented socket buffers * @payload_ptr: Pointer to position inside the head buffer of the skb * marking the start of the data to be CRC'ed * * payload_ptr must always point to an address in the skb head buffer and not to * a fragment. * * Return: big endian crc32c of the checksummed data */ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) { u32 crc = 0; unsigned int from; unsigned int to = skb->len; struct skb_seq_state st; const u8 *data; unsigned int len; unsigned int consumed = 0; from = (unsigned int)(payload_ptr - skb->data); skb_prepare_seq_read(skb, from, to, &st); while ((len = skb_seq_read(consumed, &data, &st)) != 0) { crc = crc32c(crc, data, len); consumed += len; } return htonl(crc); } /** * batadv_get_vid() - extract the VLAN identifier from skb if any * @skb: the buffer containing the packet * @header_len: length of the batman header preceding the ethernet header * * Return: VID with the BATADV_VLAN_HAS_TAG flag when the packet embedded in the * skb is vlan tagged. Otherwise BATADV_NO_FLAGS. */ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) { struct ethhdr *ethhdr = (struct ethhdr *)(skb->data + header_len); struct vlan_ethhdr *vhdr; unsigned short vid; if (ethhdr->h_proto != htons(ETH_P_8021Q)) return BATADV_NO_FLAGS; if (!pskb_may_pull(skb, header_len + VLAN_ETH_HLEN)) return BATADV_NO_FLAGS; vhdr = (struct vlan_ethhdr *)(skb->data + header_len); vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK; vid |= BATADV_VLAN_HAS_TAG; return vid; } /** * batadv_vlan_ap_isola_get() - return AP isolation status for the given vlan * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier for which the AP isolation attributed as to be * looked up * * Return: true if AP isolation is on for the VLAN identified by vid, false * otherwise */ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) { bool ap_isolation_enabled = false; struct batadv_softif_vlan *vlan; /* if the AP isolation is requested on a VLAN, then check for its * setting in the proper VLAN private data structure */ vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { ap_isolation_enabled = atomic_read(&vlan->ap_isolation); batadv_softif_vlan_put(vlan); } return ap_isolation_enabled; } /** * batadv_throw_uevent() - Send an uevent with batman-adv specific env data * @bat_priv: the bat priv with all the soft interface information * @type: subsystem type of event. Stored in uevent's BATTYPE * @action: action type of event. Stored in uevent's BATACTION * @data: string with additional information to the event (ignored for * BATADV_UEV_DEL). Stored in uevent's BATDATA * * Return: 0 on success or negative error number in case of failure */ int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, enum batadv_uev_action action, const char *data) { int ret = -ENOMEM; struct kobject *bat_kobj; char *uevent_env[4] = { NULL, NULL, NULL, NULL }; bat_kobj = &bat_priv->soft_iface->dev.kobj; uevent_env[0] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_TYPE_VAR, batadv_uev_type_str[type]); if (!uevent_env[0]) goto report_error; uevent_env[1] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_ACTION_VAR, batadv_uev_action_str[action]); if (!uevent_env[1]) goto free_first_env; /* If the event is DEL, ignore the data field */ if (action != BATADV_UEV_DEL) { uevent_env[2] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_DATA_VAR, data); if (!uevent_env[2]) goto free_second_env; } ret = kobject_uevent_env(bat_kobj, KOBJ_CHANGE, uevent_env); kfree(uevent_env[2]); free_second_env: kfree(uevent_env[1]); free_first_env: kfree(uevent_env[0]); if (ret) report_error: batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Impossible to send uevent for (%s,%s,%s) event (err: %d)\n", batadv_uev_type_str[type], batadv_uev_action_str[action], (action == BATADV_UEV_DEL ? "NULL" : data), ret); return ret; } module_init(batadv_init); module_exit(batadv_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR(BATADV_DRIVER_AUTHOR); MODULE_DESCRIPTION(BATADV_DRIVER_DESC); MODULE_VERSION(BATADV_SOURCE_VERSION); MODULE_ALIAS_RTNL_LINK("batadv"); MODULE_ALIAS_GENL_FAMILY(BATADV_NL_NAME);
52 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/act_sample.c - Packet sampling tc action * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/gfp.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_sample.h> #include <net/tc_act/tc_sample.h> #include <net/psample.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <linux/if_arp.h> static struct tc_action_ops act_sample_ops; static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { [TCA_SAMPLE_PARMS] = { .len = sizeof(struct tc_sample) }, [TCA_SAMPLE_RATE] = { .type = NLA_U32 }, [TCA_SAMPLE_TRUNC_SIZE] = { .type = NLA_U32 }, [TCA_SAMPLE_PSAMPLE_GROUP] = { .type = NLA_U32 }, }; static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_sample_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_SAMPLE_MAX + 1]; struct psample_group *psample_group; u32 psample_group_num, rate, index; struct tcf_chain *goto_ch = NULL; struct tc_sample *parm; struct tcf_sample *s; bool exists = false; int ret, err; if (!nla) return -EINVAL; ret = nla_parse_nested_deprecated(tb, TCA_SAMPLE_MAX, nla, sample_policy, NULL); if (ret < 0) return ret; if (!tb[TCA_SAMPLE_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_SAMPLE_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_sample_ops, bind, true, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } if (!tb[TCA_SAMPLE_RATE] || !tb[TCA_SAMPLE_PSAMPLE_GROUP]) { NL_SET_ERR_MSG(extack, "sample rate and group are required"); err = -EINVAL; goto release_idr; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); if (!rate) { NL_SET_ERR_MSG(extack, "invalid sample rate"); err = -EINVAL; goto put_chain; } psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); psample_group = psample_group_get(net, psample_group_num); if (!psample_group) { err = -ENOMEM; goto put_chain; } s = to_sample(*a); spin_lock_bh(&s->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); s->rate = rate; s->psample_group_num = psample_group_num; psample_group = rcu_replace_pointer(s->psample_group, psample_group, lockdep_is_held(&s->tcf_lock)); if (tb[TCA_SAMPLE_TRUNC_SIZE]) { s->truncate = true; s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); } spin_unlock_bh(&s->tcf_lock); if (psample_group) psample_group_put(psample_group); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static void tcf_sample_cleanup(struct tc_action *a) { struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; /* last reference to action, no need to lock */ psample_group = rcu_dereference_protected(s->psample_group, 1); RCU_INIT_POINTER(s->psample_group, NULL); if (psample_group) psample_group_put(psample_group); } static bool tcf_sample_dev_ok_push(struct net_device *dev) { switch (dev->type) { case ARPHRD_TUNNEL: case ARPHRD_TUNNEL6: case ARPHRD_SIT: case ARPHRD_IPGRE: case ARPHRD_IP6GRE: case ARPHRD_VOID: case ARPHRD_NONE: return false; default: return true; } } TC_INDIRECT_SCOPE int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; u8 cookie_data[TC_COOKIE_MAX_SIZE]; struct psample_metadata md = {}; struct tc_cookie *user_cookie; int retval; tcf_lastuse_update(&s->tcf_tm); bstats_update(this_cpu_ptr(s->common.cpu_bstats), skb); retval = READ_ONCE(s->tcf_action); psample_group = rcu_dereference_bh(s->psample_group); /* randomly sample packets according to rate */ if (psample_group && (get_random_u32_below(s->rate) == 0)) { if (!skb_at_tc_ingress(skb)) { md.in_ifindex = skb->skb_iif; md.out_ifindex = skb->dev->ifindex; } else { md.in_ifindex = skb->dev->ifindex; } /* on ingress, the mac header gets popped, so push it back */ if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) skb_push(skb, skb->mac_len); rcu_read_lock(); user_cookie = rcu_dereference(a->user_cookie); if (user_cookie) { memcpy(cookie_data, user_cookie->data, user_cookie->len); md.user_cookie = cookie_data; md.user_cookie_len = user_cookie->len; } rcu_read_unlock(); md.trunc_size = s->truncate ? s->trunc_size : skb->len; psample_sample_packet(psample_group, skb, s->rate, &md); if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) skb_pull(skb, skb->mac_len); } return retval; } static void tcf_sample_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_sample *s = to_sample(a); struct tcf_t *tm = &s->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_sample *s = to_sample(a); struct tc_sample opt = { .index = s->tcf_index, .refcnt = refcount_read(&s->tcf_refcnt) - ref, .bindcnt = atomic_read(&s->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&s->tcf_lock); opt.action = s->tcf_action; if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &s->tcf_tm); if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD)) goto nla_put_failure; if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate)) goto nla_put_failure; if (s->truncate) if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size)) goto nla_put_failure; if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num)) goto nla_put_failure; spin_unlock_bh(&s->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&s->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_psample_group_put(void *priv) { struct psample_group *group = priv; psample_group_put(group); } static struct psample_group * tcf_sample_get_group(const struct tc_action *a, tc_action_priv_destructor *destructor) { struct tcf_sample *s = to_sample(a); struct psample_group *group; group = rcu_dereference_protected(s->psample_group, lockdep_is_held(&s->tcf_lock)); if (group) { psample_group_take(group); *destructor = tcf_psample_group_put; } return group; } static void tcf_offload_sample_get_group(struct flow_action_entry *entry, const struct tc_action *act) { entry->sample.psample_group = act->ops->get_psample_group(act, &entry->destructor); entry->destructor_priv = entry->sample.psample_group; } static int tcf_sample_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; entry->id = FLOW_ACTION_SAMPLE; entry->sample.trunc_size = tcf_sample_trunc_size(act); entry->sample.truncate = tcf_sample_truncate(act); entry->sample.rate = tcf_sample_rate(act); tcf_offload_sample_get_group(entry, act); *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; fl_action->id = FLOW_ACTION_SAMPLE; } return 0; } static struct tc_action_ops act_sample_ops = { .kind = "sample", .id = TCA_ID_SAMPLE, .owner = THIS_MODULE, .act = tcf_sample_act, .stats_update = tcf_sample_stats_update, .dump = tcf_sample_dump, .init = tcf_sample_init, .cleanup = tcf_sample_cleanup, .get_psample_group = tcf_sample_get_group, .offload_act_setup = tcf_sample_offload_act_setup, .size = sizeof(struct tcf_sample), }; MODULE_ALIAS_NET_ACT("sample"); static __net_init int sample_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_sample_ops.net_id); return tc_action_net_init(net, tn, &act_sample_ops); } static void __net_exit sample_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_sample_ops.net_id); } static struct pernet_operations sample_net_ops = { .init = sample_init_net, .exit_batch = sample_exit_net, .id = &act_sample_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init sample_init_module(void) { return tcf_register_action(&act_sample_ops, &sample_net_ops); } static void __exit sample_cleanup_module(void) { tcf_unregister_action(&act_sample_ops, &sample_net_ops); } module_init(sample_init_module); module_exit(sample_cleanup_module); MODULE_AUTHOR("Yotam Gigi <yotam.gi@gmail.com>"); MODULE_DESCRIPTION("Packet sampling action"); MODULE_LICENSE("GPL v2");
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright(c) 1999 - 2004 Intel Corporation. All rights reserved. */ #include <linux/skbuff.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <linux/ethtool.h> #include <linux/etherdevice.h> #include <linux/if_bonding.h> #include <linux/pkt_sched.h> #include <net/net_namespace.h> #include <net/bonding.h> #include <net/bond_3ad.h> #include <net/netlink.h> /* General definitions */ #define AD_SHORT_TIMEOUT 1 #define AD_LONG_TIMEOUT 0 #define AD_STANDBY 0x2 #define AD_MAX_TX_IN_SECOND 3 #define AD_COLLECTOR_MAX_DELAY 0 /* Timer definitions (43.4.4 in the 802.3ad standard) */ #define AD_FAST_PERIODIC_TIME 1 #define AD_SLOW_PERIODIC_TIME 30 #define AD_SHORT_TIMEOUT_TIME (3*AD_FAST_PERIODIC_TIME) #define AD_LONG_TIMEOUT_TIME (3*AD_SLOW_PERIODIC_TIME) #define AD_CHURN_DETECTION_TIME 60 #define AD_AGGREGATE_WAIT_TIME 2 /* Port Variables definitions used by the State Machines (43.4.7 in the * 802.3ad standard) */ #define AD_PORT_BEGIN 0x1 #define AD_PORT_LACP_ENABLED 0x2 #define AD_PORT_ACTOR_CHURN 0x4 #define AD_PORT_PARTNER_CHURN 0x8 #define AD_PORT_READY 0x10 #define AD_PORT_READY_N 0x20 #define AD_PORT_MATCHED 0x40 #define AD_PORT_STANDBY 0x80 #define AD_PORT_SELECTED 0x100 #define AD_PORT_MOVED 0x200 #define AD_PORT_CHURNED (AD_PORT_ACTOR_CHURN | AD_PORT_PARTNER_CHURN) /* Port Key definitions * key is determined according to the link speed, duplex and * user key (which is yet not supported) * -------------------------------------------------------------- * Port key | User key (10 bits) | Speed (5 bits) | Duplex| * -------------------------------------------------------------- * |15 6|5 1|0 */ #define AD_DUPLEX_KEY_MASKS 0x1 #define AD_SPEED_KEY_MASKS 0x3E #define AD_USER_KEY_MASKS 0xFFC0 enum ad_link_speed_type { AD_LINK_SPEED_1MBPS = 1, AD_LINK_SPEED_10MBPS, AD_LINK_SPEED_100MBPS, AD_LINK_SPEED_1000MBPS, AD_LINK_SPEED_2500MBPS, AD_LINK_SPEED_5000MBPS, AD_LINK_SPEED_10000MBPS, AD_LINK_SPEED_14000MBPS, AD_LINK_SPEED_20000MBPS, AD_LINK_SPEED_25000MBPS, AD_LINK_SPEED_40000MBPS, AD_LINK_SPEED_50000MBPS, AD_LINK_SPEED_56000MBPS, AD_LINK_SPEED_100000MBPS, AD_LINK_SPEED_200000MBPS, AD_LINK_SPEED_400000MBPS, AD_LINK_SPEED_800000MBPS, }; /* compare MAC addresses */ #define MAC_ADDRESS_EQUAL(A, B) \ ether_addr_equal_64bits((const u8 *)A, (const u8 *)B) static const u16 ad_ticks_per_sec = 1000 / AD_TIMER_INTERVAL; static const int ad_delta_in_ticks = (AD_TIMER_INTERVAL * HZ) / 1000; const u8 lacpdu_mcast_addr[ETH_ALEN + 2] __long_aligned = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x02 }; /* ================= main 802.3ad protocol functions ================== */ static int ad_lacpdu_send(struct port *port); static int ad_marker_send(struct port *port, struct bond_marker *marker); static void ad_mux_machine(struct port *port, bool *update_slave_arr); static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port); static void ad_tx_machine(struct port *port); static void ad_periodic_machine(struct port *port, struct bond_params *bond_params); static void ad_port_selection_logic(struct port *port, bool *update_slave_arr); static void ad_agg_selection_logic(struct aggregator *aggregator, bool *update_slave_arr); static void ad_clear_agg(struct aggregator *aggregator); static void ad_initialize_agg(struct aggregator *aggregator); static void ad_initialize_port(struct port *port, int lacp_fast); static void ad_enable_collecting(struct port *port); static void ad_disable_distributing(struct port *port, bool *update_slave_arr); static void ad_enable_collecting_distributing(struct port *port, bool *update_slave_arr); static void ad_disable_collecting_distributing(struct port *port, bool *update_slave_arr); static void ad_marker_info_received(struct bond_marker *marker_info, struct port *port); static void ad_marker_response_received(struct bond_marker *marker, struct port *port); static void ad_update_actor_keys(struct port *port, bool reset); /* ================= api to bonding and kernel code ================== */ /** * __get_bond_by_port - get the port's bonding struct * @port: the port we're looking at * * Return @port's bonding struct, or %NULL if it can't be found. */ static inline struct bonding *__get_bond_by_port(struct port *port) { if (port->slave == NULL) return NULL; return bond_get_bond_by_slave(port->slave); } /** * __get_first_agg - get the first aggregator in the bond * @port: the port we're looking at * * Return the aggregator of the first slave in @bond, or %NULL if it can't be * found. * The caller must hold RCU or RTNL lock. */ static inline struct aggregator *__get_first_agg(struct port *port) { struct bonding *bond = __get_bond_by_port(port); struct slave *first_slave; struct aggregator *agg; /* If there's no bond for this port, or bond has no slaves */ if (bond == NULL) return NULL; rcu_read_lock(); first_slave = bond_first_slave_rcu(bond); agg = first_slave ? &(SLAVE_AD_INFO(first_slave)->aggregator) : NULL; rcu_read_unlock(); return agg; } /** * __agg_has_partner - see if we have a partner * @agg: the agregator we're looking at * * Return nonzero if aggregator has a partner (denoted by a non-zero ether * address for the partner). Return 0 if not. */ static inline int __agg_has_partner(struct aggregator *agg) { return !is_zero_ether_addr(agg->partner_system.mac_addr_value); } /** * __disable_distributing_port - disable the port's slave for distributing. * Port will still be able to collect. * @port: the port we're looking at * * This will disable only distributing on the port's slave. */ static void __disable_distributing_port(struct port *port) { bond_set_slave_tx_disabled_flags(port->slave, BOND_SLAVE_NOTIFY_LATER); } /** * __enable_collecting_port - enable the port's slave for collecting, * if it's up * @port: the port we're looking at * * This will enable only collecting on the port's slave. */ static void __enable_collecting_port(struct port *port) { struct slave *slave = port->slave; if (slave->link == BOND_LINK_UP && bond_slave_is_up(slave)) bond_set_slave_rx_enabled_flags(slave, BOND_SLAVE_NOTIFY_LATER); } /** * __disable_port - disable the port's slave * @port: the port we're looking at * * This will disable both collecting and distributing on the port's slave. */ static inline void __disable_port(struct port *port) { bond_set_slave_inactive_flags(port->slave, BOND_SLAVE_NOTIFY_LATER); } /** * __enable_port - enable the port's slave, if it's up * @port: the port we're looking at * * This will enable both collecting and distributing on the port's slave. */ static inline void __enable_port(struct port *port) { struct slave *slave = port->slave; if ((slave->link == BOND_LINK_UP) && bond_slave_is_up(slave)) bond_set_slave_active_flags(slave, BOND_SLAVE_NOTIFY_LATER); } /** * __port_move_to_attached_state - check if port should transition back to attached * state. * @port: the port we're looking at */ static bool __port_move_to_attached_state(struct port *port) { if (!(port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY) || !(port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) || !(port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) port->sm_mux_state = AD_MUX_ATTACHED; return port->sm_mux_state == AD_MUX_ATTACHED; } /** * __port_is_collecting_distributing - check if the port's slave is in the * combined collecting/distributing state * @port: the port we're looking at */ static int __port_is_collecting_distributing(struct port *port) { return bond_is_active_slave(port->slave); } /** * __get_agg_selection_mode - get the aggregator selection mode * @port: the port we're looking at * * Get the aggregator selection mode. Can be %STABLE, %BANDWIDTH or %COUNT. */ static inline u32 __get_agg_selection_mode(struct port *port) { struct bonding *bond = __get_bond_by_port(port); if (bond == NULL) return BOND_AD_STABLE; return bond->params.ad_select; } /** * __check_agg_selection_timer - check if the selection timer has expired * @port: the port we're looking at */ static inline int __check_agg_selection_timer(struct port *port) { struct bonding *bond = __get_bond_by_port(port); if (bond == NULL) return 0; return atomic_read(&BOND_AD_INFO(bond).agg_select_timer) ? 1 : 0; } /** * __get_link_speed - get a port's speed * @port: the port we're looking at * * Return @port's speed in 802.3ad enum format. i.e. one of: * 0, * %AD_LINK_SPEED_10MBPS, * %AD_LINK_SPEED_100MBPS, * %AD_LINK_SPEED_1000MBPS, * %AD_LINK_SPEED_2500MBPS, * %AD_LINK_SPEED_5000MBPS, * %AD_LINK_SPEED_10000MBPS * %AD_LINK_SPEED_14000MBPS, * %AD_LINK_SPEED_20000MBPS * %AD_LINK_SPEED_25000MBPS * %AD_LINK_SPEED_40000MBPS * %AD_LINK_SPEED_50000MBPS * %AD_LINK_SPEED_56000MBPS * %AD_LINK_SPEED_100000MBPS * %AD_LINK_SPEED_200000MBPS * %AD_LINK_SPEED_400000MBPS * %AD_LINK_SPEED_800000MBPS */ static u16 __get_link_speed(struct port *port) { struct slave *slave = port->slave; u16 speed; /* this if covers only a special case: when the configuration starts * with link down, it sets the speed to 0. * This is done in spite of the fact that the e100 driver reports 0 * to be compatible with MVT in the future. */ if (slave->link != BOND_LINK_UP) speed = 0; else { switch (slave->speed) { case SPEED_10: speed = AD_LINK_SPEED_10MBPS; break; case SPEED_100: speed = AD_LINK_SPEED_100MBPS; break; case SPEED_1000: speed = AD_LINK_SPEED_1000MBPS; break; case SPEED_2500: speed = AD_LINK_SPEED_2500MBPS; break; case SPEED_5000: speed = AD_LINK_SPEED_5000MBPS; break; case SPEED_10000: speed = AD_LINK_SPEED_10000MBPS; break; case SPEED_14000: speed = AD_LINK_SPEED_14000MBPS; break; case SPEED_20000: speed = AD_LINK_SPEED_20000MBPS; break; case SPEED_25000: speed = AD_LINK_SPEED_25000MBPS; break; case SPEED_40000: speed = AD_LINK_SPEED_40000MBPS; break; case SPEED_50000: speed = AD_LINK_SPEED_50000MBPS; break; case SPEED_56000: speed = AD_LINK_SPEED_56000MBPS; break; case SPEED_100000: speed = AD_LINK_SPEED_100000MBPS; break; case SPEED_200000: speed = AD_LINK_SPEED_200000MBPS; break; case SPEED_400000: speed = AD_LINK_SPEED_400000MBPS; break; case SPEED_800000: speed = AD_LINK_SPEED_800000MBPS; break; default: /* unknown speed value from ethtool. shouldn't happen */ if (slave->speed != SPEED_UNKNOWN) pr_err_once("%s: (slave %s): unknown ethtool speed (%d) for port %d (set it to 0)\n", slave->bond->dev->name, slave->dev->name, slave->speed, port->actor_port_number); speed = 0; break; } } slave_dbg(slave->bond->dev, slave->dev, "Port %d Received link speed %d update from adapter\n", port->actor_port_number, speed); return speed; } /** * __get_duplex - get a port's duplex * @port: the port we're looking at * * Return @port's duplex in 802.3ad bitmask format. i.e.: * 0x01 if in full duplex * 0x00 otherwise */ static u8 __get_duplex(struct port *port) { struct slave *slave = port->slave; u8 retval = 0x0; /* handling a special case: when the configuration starts with * link down, it sets the duplex to 0. */ if (slave->link == BOND_LINK_UP) { switch (slave->duplex) { case DUPLEX_FULL: retval = 0x1; slave_dbg(slave->bond->dev, slave->dev, "Port %d Received status full duplex update from adapter\n", port->actor_port_number); break; case DUPLEX_HALF: default: retval = 0x0; slave_dbg(slave->bond->dev, slave->dev, "Port %d Received status NOT full duplex update from adapter\n", port->actor_port_number); break; } } return retval; } static void __ad_actor_update_port(struct port *port) { const struct bonding *bond = bond_get_bond_by_slave(port->slave); port->actor_system = BOND_AD_INFO(bond).system.sys_mac_addr; port->actor_system_priority = BOND_AD_INFO(bond).system.sys_priority; } /* Conversions */ /** * __ad_timer_to_ticks - convert a given timer type to AD module ticks * @timer_type: which timer to operate * @par: timer parameter. see below * * If @timer_type is %current_while_timer, @par indicates long/short timer. * If @timer_type is %periodic_timer, @par is one of %FAST_PERIODIC_TIME, * %SLOW_PERIODIC_TIME. */ static u16 __ad_timer_to_ticks(u16 timer_type, u16 par) { u16 retval = 0; /* to silence the compiler */ switch (timer_type) { case AD_CURRENT_WHILE_TIMER: /* for rx machine usage */ if (par) retval = (AD_SHORT_TIMEOUT_TIME*ad_ticks_per_sec); else retval = (AD_LONG_TIMEOUT_TIME*ad_ticks_per_sec); break; case AD_ACTOR_CHURN_TIMER: /* for local churn machine */ retval = (AD_CHURN_DETECTION_TIME*ad_ticks_per_sec); break; case AD_PERIODIC_TIMER: /* for periodic machine */ retval = (par*ad_ticks_per_sec); /* long timeout */ break; case AD_PARTNER_CHURN_TIMER: /* for remote churn machine */ retval = (AD_CHURN_DETECTION_TIME*ad_ticks_per_sec); break; case AD_WAIT_WHILE_TIMER: /* for selection machine */ retval = (AD_AGGREGATE_WAIT_TIME*ad_ticks_per_sec); break; } return retval; } /* ================= ad_rx_machine helper functions ================== */ /** * __choose_matched - update a port's matched variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Update the value of the matched variable, using parameter values from a * newly received lacpdu. Parameter values for the partner carried in the * received PDU are compared with the corresponding operational parameter * values for the actor. Matched is set to TRUE if all of these parameters * match and the PDU parameter partner_state.aggregation has the same value as * actor_oper_port_state.aggregation and lacp will actively maintain the link * in the aggregation. Matched is also set to TRUE if the value of * actor_state.aggregation in the received PDU is set to FALSE, i.e., indicates * an individual link and lacp will actively maintain the link. Otherwise, * matched is set to FALSE. LACP is considered to be actively maintaining the * link if either the PDU's actor_state.lacp_activity variable is TRUE or both * the actor's actor_oper_port_state.lacp_activity and the PDU's * partner_state.lacp_activity variables are TRUE. * * Note: the AD_PORT_MATCHED "variable" is not specified by 802.3ad; it is * used here to implement the language from 802.3ad 43.4.9 that requires * recordPDU to "match" the LACPDU parameters to the stored values. */ static void __choose_matched(struct lacpdu *lacpdu, struct port *port) { /* check if all parameters are alike * or this is individual link(aggregation == FALSE) * then update the state machine Matched variable. */ if (((ntohs(lacpdu->partner_port) == port->actor_port_number) && (ntohs(lacpdu->partner_port_priority) == port->actor_port_priority) && MAC_ADDRESS_EQUAL(&(lacpdu->partner_system), &(port->actor_system)) && (ntohs(lacpdu->partner_system_priority) == port->actor_system_priority) && (ntohs(lacpdu->partner_key) == port->actor_oper_port_key) && ((lacpdu->partner_state & LACP_STATE_AGGREGATION) == (port->actor_oper_port_state & LACP_STATE_AGGREGATION))) || ((lacpdu->actor_state & LACP_STATE_AGGREGATION) == 0) ) { port->sm_vars |= AD_PORT_MATCHED; } else { port->sm_vars &= ~AD_PORT_MATCHED; } } /** * __record_pdu - record parameters from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Record the parameter values for the Actor carried in a received lacpdu as * the current partner operational parameter values and sets * actor_oper_port_state.defaulted to FALSE. */ static void __record_pdu(struct lacpdu *lacpdu, struct port *port) { if (lacpdu && port) { struct port_params *partner = &port->partner_oper; __choose_matched(lacpdu, port); /* record the new parameter values for the partner * operational */ partner->port_number = ntohs(lacpdu->actor_port); partner->port_priority = ntohs(lacpdu->actor_port_priority); partner->system = lacpdu->actor_system; partner->system_priority = ntohs(lacpdu->actor_system_priority); partner->key = ntohs(lacpdu->actor_key); partner->port_state = lacpdu->actor_state; /* set actor_oper_port_state.defaulted to FALSE */ port->actor_oper_port_state &= ~LACP_STATE_DEFAULTED; /* set the partner sync. to on if the partner is sync, * and the port is matched */ if ((port->sm_vars & AD_PORT_MATCHED) && (lacpdu->actor_state & LACP_STATE_SYNCHRONIZATION)) { partner->port_state |= LACP_STATE_SYNCHRONIZATION; slave_dbg(port->slave->bond->dev, port->slave->dev, "partner sync=1\n"); } else { partner->port_state &= ~LACP_STATE_SYNCHRONIZATION; slave_dbg(port->slave->bond->dev, port->slave->dev, "partner sync=0\n"); } } } /** * __record_default - record default parameters * @port: the port we're looking at * * This function records the default parameter values for the partner carried * in the Partner Admin parameters as the current partner operational parameter * values and sets actor_oper_port_state.defaulted to TRUE. */ static void __record_default(struct port *port) { if (port) { /* record the partner admin parameters */ memcpy(&port->partner_oper, &port->partner_admin, sizeof(struct port_params)); /* set actor_oper_port_state.defaulted to true */ port->actor_oper_port_state |= LACP_STATE_DEFAULTED; } } /** * __update_selected - update a port's Selected variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Update the value of the selected variable, using parameter values from a * newly received lacpdu. The parameter values for the Actor carried in the * received PDU are compared with the corresponding operational parameter * values for the ports partner. If one or more of the comparisons shows that * the value(s) received in the PDU differ from the current operational values, * then selected is set to FALSE and actor_oper_port_state.synchronization is * set to out_of_sync. Otherwise, selected remains unchanged. */ static void __update_selected(struct lacpdu *lacpdu, struct port *port) { if (lacpdu && port) { const struct port_params *partner = &port->partner_oper; /* check if any parameter is different then * update the state machine selected variable. */ if (ntohs(lacpdu->actor_port) != partner->port_number || ntohs(lacpdu->actor_port_priority) != partner->port_priority || !MAC_ADDRESS_EQUAL(&lacpdu->actor_system, &partner->system) || ntohs(lacpdu->actor_system_priority) != partner->system_priority || ntohs(lacpdu->actor_key) != partner->key || (lacpdu->actor_state & LACP_STATE_AGGREGATION) != (partner->port_state & LACP_STATE_AGGREGATION)) { port->sm_vars &= ~AD_PORT_SELECTED; } } } /** * __update_default_selected - update a port's Selected variable from Partner * @port: the port we're looking at * * This function updates the value of the selected variable, using the partner * administrative parameter values. The administrative values are compared with * the corresponding operational parameter values for the partner. If one or * more of the comparisons shows that the administrative value(s) differ from * the current operational values, then Selected is set to FALSE and * actor_oper_port_state.synchronization is set to OUT_OF_SYNC. Otherwise, * Selected remains unchanged. */ static void __update_default_selected(struct port *port) { if (port) { const struct port_params *admin = &port->partner_admin; const struct port_params *oper = &port->partner_oper; /* check if any parameter is different then * update the state machine selected variable. */ if (admin->port_number != oper->port_number || admin->port_priority != oper->port_priority || !MAC_ADDRESS_EQUAL(&admin->system, &oper->system) || admin->system_priority != oper->system_priority || admin->key != oper->key || (admin->port_state & LACP_STATE_AGGREGATION) != (oper->port_state & LACP_STATE_AGGREGATION)) { port->sm_vars &= ~AD_PORT_SELECTED; } } } /** * __update_ntt - update a port's ntt variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Updates the value of the ntt variable, using parameter values from a newly * received lacpdu. The parameter values for the partner carried in the * received PDU are compared with the corresponding operational parameter * values for the Actor. If one or more of the comparisons shows that the * value(s) received in the PDU differ from the current operational values, * then ntt is set to TRUE. Otherwise, ntt remains unchanged. */ static void __update_ntt(struct lacpdu *lacpdu, struct port *port) { /* validate lacpdu and port */ if (lacpdu && port) { /* check if any parameter is different then * update the port->ntt. */ if ((ntohs(lacpdu->partner_port) != port->actor_port_number) || (ntohs(lacpdu->partner_port_priority) != port->actor_port_priority) || !MAC_ADDRESS_EQUAL(&(lacpdu->partner_system), &(port->actor_system)) || (ntohs(lacpdu->partner_system_priority) != port->actor_system_priority) || (ntohs(lacpdu->partner_key) != port->actor_oper_port_key) || ((lacpdu->partner_state & LACP_STATE_LACP_ACTIVITY) != (port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY)) || ((lacpdu->partner_state & LACP_STATE_LACP_TIMEOUT) != (port->actor_oper_port_state & LACP_STATE_LACP_TIMEOUT)) || ((lacpdu->partner_state & LACP_STATE_SYNCHRONIZATION) != (port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) || ((lacpdu->partner_state & LACP_STATE_AGGREGATION) != (port->actor_oper_port_state & LACP_STATE_AGGREGATION)) ) { port->ntt = true; } } } /** * __agg_ports_are_ready - check if all ports in an aggregator are ready * @aggregator: the aggregator we're looking at * */ static int __agg_ports_are_ready(struct aggregator *aggregator) { struct port *port; int retval = 1; if (aggregator) { /* scan all ports in this aggregator to verfy if they are * all ready. */ for (port = aggregator->lag_ports; port; port = port->next_port_in_aggregator) { if (!(port->sm_vars & AD_PORT_READY_N)) { retval = 0; break; } } } return retval; } /** * __set_agg_ports_ready - set value of Ready bit in all ports of an aggregator * @aggregator: the aggregator we're looking at * @val: Should the ports' ready bit be set on or off * */ static void __set_agg_ports_ready(struct aggregator *aggregator, int val) { struct port *port; for (port = aggregator->lag_ports; port; port = port->next_port_in_aggregator) { if (val) port->sm_vars |= AD_PORT_READY; else port->sm_vars &= ~AD_PORT_READY; } } static int __agg_active_ports(struct aggregator *agg) { struct port *port; int active = 0; for (port = agg->lag_ports; port; port = port->next_port_in_aggregator) { if (port->is_enabled) active++; } return active; } /** * __get_agg_bandwidth - get the total bandwidth of an aggregator * @aggregator: the aggregator we're looking at * */ static u32 __get_agg_bandwidth(struct aggregator *aggregator) { int nports = __agg_active_ports(aggregator); u32 bandwidth = 0; if (nports) { switch (__get_link_speed(aggregator->lag_ports)) { case AD_LINK_SPEED_1MBPS: bandwidth = nports; break; case AD_LINK_SPEED_10MBPS: bandwidth = nports * 10; break; case AD_LINK_SPEED_100MBPS: bandwidth = nports * 100; break; case AD_LINK_SPEED_1000MBPS: bandwidth = nports * 1000; break; case AD_LINK_SPEED_2500MBPS: bandwidth = nports * 2500; break; case AD_LINK_SPEED_5000MBPS: bandwidth = nports * 5000; break; case AD_LINK_SPEED_10000MBPS: bandwidth = nports * 10000; break; case AD_LINK_SPEED_14000MBPS: bandwidth = nports * 14000; break; case AD_LINK_SPEED_20000MBPS: bandwidth = nports * 20000; break; case AD_LINK_SPEED_25000MBPS: bandwidth = nports * 25000; break; case AD_LINK_SPEED_40000MBPS: bandwidth = nports * 40000; break; case AD_LINK_SPEED_50000MBPS: bandwidth = nports * 50000; break; case AD_LINK_SPEED_56000MBPS: bandwidth = nports * 56000; break; case AD_LINK_SPEED_100000MBPS: bandwidth = nports * 100000; break; case AD_LINK_SPEED_200000MBPS: bandwidth = nports * 200000; break; case AD_LINK_SPEED_400000MBPS: bandwidth = nports * 400000; break; case AD_LINK_SPEED_800000MBPS: bandwidth = nports * 800000; break; default: bandwidth = 0; /* to silence the compiler */ } } return bandwidth; } /** * __get_active_agg - get the current active aggregator * @aggregator: the aggregator we're looking at * * Caller must hold RCU lock. */ static struct aggregator *__get_active_agg(struct aggregator *aggregator) { struct bonding *bond = aggregator->slave->bond; struct list_head *iter; struct slave *slave; bond_for_each_slave_rcu(bond, slave, iter) if (SLAVE_AD_INFO(slave)->aggregator.is_active) return &(SLAVE_AD_INFO(slave)->aggregator); return NULL; } /** * __update_lacpdu_from_port - update a port's lacpdu fields * @port: the port we're looking at */ static inline void __update_lacpdu_from_port(struct port *port) { struct lacpdu *lacpdu = &port->lacpdu; const struct port_params *partner = &port->partner_oper; /* update current actual Actor parameters * lacpdu->subtype initialized * lacpdu->version_number initialized * lacpdu->tlv_type_actor_info initialized * lacpdu->actor_information_length initialized */ lacpdu->actor_system_priority = htons(port->actor_system_priority); lacpdu->actor_system = port->actor_system; lacpdu->actor_key = htons(port->actor_oper_port_key); lacpdu->actor_port_priority = htons(port->actor_port_priority); lacpdu->actor_port = htons(port->actor_port_number); lacpdu->actor_state = port->actor_oper_port_state; slave_dbg(port->slave->bond->dev, port->slave->dev, "update lacpdu: actor port state %x\n", port->actor_oper_port_state); /* lacpdu->reserved_3_1 initialized * lacpdu->tlv_type_partner_info initialized * lacpdu->partner_information_length initialized */ lacpdu->partner_system_priority = htons(partner->system_priority); lacpdu->partner_system = partner->system; lacpdu->partner_key = htons(partner->key); lacpdu->partner_port_priority = htons(partner->port_priority); lacpdu->partner_port = htons(partner->port_number); lacpdu->partner_state = partner->port_state; /* lacpdu->reserved_3_2 initialized * lacpdu->tlv_type_collector_info initialized * lacpdu->collector_information_length initialized * collector_max_delay initialized * reserved_12[12] initialized * tlv_type_terminator initialized * terminator_length initialized * reserved_50[50] initialized */ } /* ================= main 802.3ad protocol code ========================= */ /** * ad_lacpdu_send - send out a lacpdu packet on a given port * @port: the port we're looking at * * Returns: 0 on success * < 0 on error */ static int ad_lacpdu_send(struct port *port) { struct slave *slave = port->slave; struct sk_buff *skb; struct lacpdu_header *lacpdu_header; int length = sizeof(struct lacpdu_header); skb = dev_alloc_skb(length); if (!skb) return -ENOMEM; atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.lacpdu_tx); skb->dev = slave->dev; skb_reset_mac_header(skb); skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = PKT_TYPE_LACPDU; skb->priority = TC_PRIO_CONTROL; lacpdu_header = skb_put(skb, length); ether_addr_copy(lacpdu_header->hdr.h_dest, lacpdu_mcast_addr); /* Note: source address is set to be the member's PERMANENT address, * because we use it to identify loopback lacpdus in receive. */ ether_addr_copy(lacpdu_header->hdr.h_source, slave->perm_hwaddr); lacpdu_header->hdr.h_proto = PKT_TYPE_LACPDU; lacpdu_header->lacpdu = port->lacpdu; dev_queue_xmit(skb); return 0; } /** * ad_marker_send - send marker information/response on a given port * @port: the port we're looking at * @marker: marker data to send * * Returns: 0 on success * < 0 on error */ static int ad_marker_send(struct port *port, struct bond_marker *marker) { struct slave *slave = port->slave; struct sk_buff *skb; struct bond_marker_header *marker_header; int length = sizeof(struct bond_marker_header); skb = dev_alloc_skb(length + 16); if (!skb) return -ENOMEM; switch (marker->tlv_type) { case AD_MARKER_INFORMATION_SUBTYPE: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.marker_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.marker_tx); break; case AD_MARKER_RESPONSE_SUBTYPE: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.marker_resp_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.marker_resp_tx); break; } skb_reserve(skb, 16); skb->dev = slave->dev; skb_reset_mac_header(skb); skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = PKT_TYPE_LACPDU; marker_header = skb_put(skb, length); ether_addr_copy(marker_header->hdr.h_dest, lacpdu_mcast_addr); /* Note: source address is set to be the member's PERMANENT address, * because we use it to identify loopback MARKERs in receive. */ ether_addr_copy(marker_header->hdr.h_source, slave->perm_hwaddr); marker_header->hdr.h_proto = PKT_TYPE_LACPDU; marker_header->marker = *marker; dev_queue_xmit(skb); return 0; } /** * ad_mux_machine - handle a port's mux state machine * @port: the port we're looking at * @update_slave_arr: Does slave array need update? */ static void ad_mux_machine(struct port *port, bool *update_slave_arr) { struct bonding *bond = __get_bond_by_port(port); mux_states_t last_state; /* keep current State Machine state to compare later if it was * changed */ last_state = port->sm_mux_state; if (port->sm_vars & AD_PORT_BEGIN) { port->sm_mux_state = AD_MUX_DETACHED; } else { switch (port->sm_mux_state) { case AD_MUX_DETACHED: if ((port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY)) /* if SELECTED or STANDBY */ port->sm_mux_state = AD_MUX_WAITING; break; case AD_MUX_WAITING: /* if SELECTED == FALSE return to DETACH state */ if (!(port->sm_vars & AD_PORT_SELECTED)) { port->sm_vars &= ~AD_PORT_READY_N; /* in order to withhold the Selection Logic to * check all ports READY_N value every callback * cycle to update ready variable, we check * READY_N and update READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); port->sm_mux_state = AD_MUX_DETACHED; break; } /* check if the wait_while_timer expired */ if (port->sm_mux_timer_counter && !(--port->sm_mux_timer_counter)) port->sm_vars |= AD_PORT_READY_N; /* in order to withhold the selection logic to check * all ports READY_N value every callback cycle to * update ready variable, we check READY_N and update * READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); /* if the wait_while_timer expired, and the port is * in READY state, move to ATTACHED state */ if ((port->sm_vars & AD_PORT_READY) && !port->sm_mux_timer_counter) port->sm_mux_state = AD_MUX_ATTACHED; break; case AD_MUX_ATTACHED: /* check also if agg_select_timer expired (so the * edable port will take place only after this timer) */ if ((port->sm_vars & AD_PORT_SELECTED) && (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) && !__check_agg_selection_timer(port)) { if (port->aggregator->is_active) { int state = AD_MUX_COLLECTING_DISTRIBUTING; if (!bond->params.coupled_control) state = AD_MUX_COLLECTING; port->sm_mux_state = state; } } else if (!(port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY)) { /* if UNSELECTED or STANDBY */ port->sm_vars &= ~AD_PORT_READY_N; /* in order to withhold the selection logic to * check all ports READY_N value every callback * cycle to update ready variable, we check * READY_N and update READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); port->sm_mux_state = AD_MUX_DETACHED; } else if (port->aggregator->is_active) { port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; } break; case AD_MUX_COLLECTING_DISTRIBUTING: if (!__port_move_to_attached_state(port)) { /* if port state hasn't changed make * sure that a collecting distributing * port in an active aggregator is enabled */ if (port->aggregator->is_active && !__port_is_collecting_distributing(port)) { __enable_port(port); *update_slave_arr = true; } } break; case AD_MUX_COLLECTING: if (!__port_move_to_attached_state(port)) { if ((port->sm_vars & AD_PORT_SELECTED) && (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) && (port->partner_oper.port_state & LACP_STATE_COLLECTING)) { port->sm_mux_state = AD_MUX_DISTRIBUTING; } else { /* If port state hasn't changed, make sure that a collecting * port is enabled for an active aggregator. */ struct slave *slave = port->slave; if (port->aggregator->is_active && bond_is_slave_rx_disabled(slave)) { ad_enable_collecting(port); *update_slave_arr = true; } } } break; case AD_MUX_DISTRIBUTING: if (!(port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY) || !(port->partner_oper.port_state & LACP_STATE_COLLECTING) || !(port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) || !(port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) { port->sm_mux_state = AD_MUX_COLLECTING; } else { /* if port state hasn't changed make * sure that a collecting distributing * port in an active aggregator is enabled */ if (port->aggregator && port->aggregator->is_active && !__port_is_collecting_distributing(port)) { __enable_port(port); *update_slave_arr = true; } } break; default: break; } } /* check if the state machine was changed */ if (port->sm_mux_state != last_state) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Mux Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_mux_state); switch (port->sm_mux_state) { case AD_MUX_DETACHED: port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; ad_disable_collecting_distributing(port, update_slave_arr); port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; port->ntt = true; break; case AD_MUX_WAITING: port->sm_mux_timer_counter = __ad_timer_to_ticks(AD_WAIT_WHILE_TIMER, 0); break; case AD_MUX_ATTACHED: if (port->aggregator->is_active) port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; else port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; ad_disable_collecting_distributing(port, update_slave_arr); port->ntt = true; break; case AD_MUX_COLLECTING_DISTRIBUTING: port->actor_oper_port_state |= LACP_STATE_COLLECTING; port->actor_oper_port_state |= LACP_STATE_DISTRIBUTING; port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; ad_enable_collecting_distributing(port, update_slave_arr); port->ntt = true; break; case AD_MUX_COLLECTING: port->actor_oper_port_state |= LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; ad_enable_collecting(port); ad_disable_distributing(port, update_slave_arr); port->ntt = true; break; case AD_MUX_DISTRIBUTING: port->actor_oper_port_state |= LACP_STATE_DISTRIBUTING; port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; ad_enable_collecting_distributing(port, update_slave_arr); break; default: break; } } } /** * ad_rx_machine - handle a port's rx State Machine * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * If lacpdu arrived, stop previous timer (if exists) and set the next state as * CURRENT. If timer expired set the state machine in the proper state. * In other cases, this function checks if we need to switch to other state. */ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) { rx_states_t last_state; /* keep current State Machine state to compare later if it was * changed */ last_state = port->sm_rx_state; if (lacpdu) { atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.lacpdu_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.lacpdu_rx); } /* check if state machine should change state */ /* first, check if port was reinitialized */ if (port->sm_vars & AD_PORT_BEGIN) { port->sm_rx_state = AD_RX_INITIALIZE; port->sm_vars |= AD_PORT_CHURNED; /* check if port is not enabled */ } else if (!(port->sm_vars & AD_PORT_BEGIN) && !port->is_enabled) port->sm_rx_state = AD_RX_PORT_DISABLED; /* check if new lacpdu arrived */ else if (lacpdu && ((port->sm_rx_state == AD_RX_EXPIRED) || (port->sm_rx_state == AD_RX_DEFAULTED) || (port->sm_rx_state == AD_RX_CURRENT))) { if (port->sm_rx_state != AD_RX_CURRENT) port->sm_vars |= AD_PORT_CHURNED; port->sm_rx_timer_counter = 0; port->sm_rx_state = AD_RX_CURRENT; } else { /* if timer is on, and if it is expired */ if (port->sm_rx_timer_counter && !(--port->sm_rx_timer_counter)) { switch (port->sm_rx_state) { case AD_RX_EXPIRED: port->sm_rx_state = AD_RX_DEFAULTED; break; case AD_RX_CURRENT: port->sm_rx_state = AD_RX_EXPIRED; break; default: break; } } else { /* if no lacpdu arrived and no timer is on */ switch (port->sm_rx_state) { case AD_RX_PORT_DISABLED: if (port->is_enabled && (port->sm_vars & AD_PORT_LACP_ENABLED)) port->sm_rx_state = AD_RX_EXPIRED; else if (port->is_enabled && ((port->sm_vars & AD_PORT_LACP_ENABLED) == 0)) port->sm_rx_state = AD_RX_LACP_DISABLED; break; default: break; } } } /* check if the State machine was changed or new lacpdu arrived */ if ((port->sm_rx_state != last_state) || (lacpdu)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Rx Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_rx_state); switch (port->sm_rx_state) { case AD_RX_INITIALIZE: if (!(port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS)) port->sm_vars &= ~AD_PORT_LACP_ENABLED; else port->sm_vars |= AD_PORT_LACP_ENABLED; port->sm_vars &= ~AD_PORT_SELECTED; __record_default(port); port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; port->sm_rx_state = AD_RX_PORT_DISABLED; fallthrough; case AD_RX_PORT_DISABLED: port->sm_vars &= ~AD_PORT_MATCHED; break; case AD_RX_LACP_DISABLED: port->sm_vars &= ~AD_PORT_SELECTED; __record_default(port); port->partner_oper.port_state &= ~LACP_STATE_AGGREGATION; port->sm_vars |= AD_PORT_MATCHED; port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; case AD_RX_EXPIRED: /* Reset of the Synchronization flag (Standard 43.4.12) * This reset cause to disable this port in the * COLLECTING_DISTRIBUTING state of the mux machine in * case of EXPIRED even if LINK_DOWN didn't arrive for * the port. */ port->partner_oper.port_state &= ~LACP_STATE_SYNCHRONIZATION; port->sm_vars &= ~AD_PORT_MATCHED; port->partner_oper.port_state |= LACP_STATE_LACP_TIMEOUT; port->partner_oper.port_state |= LACP_STATE_LACP_ACTIVITY; port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(AD_SHORT_TIMEOUT)); port->actor_oper_port_state |= LACP_STATE_EXPIRED; port->sm_vars |= AD_PORT_CHURNED; break; case AD_RX_DEFAULTED: __update_default_selected(port); __record_default(port); port->sm_vars |= AD_PORT_MATCHED; port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; case AD_RX_CURRENT: /* detect loopback situation */ if (MAC_ADDRESS_EQUAL(&(lacpdu->actor_system), &(port->actor_system))) { slave_err(port->slave->bond->dev, port->slave->dev, "An illegal loopback occurred on slave\n" "Check the configuration to verify that all adapters are connected to 802.3ad compliant switch ports\n"); return; } __update_selected(lacpdu, port); __update_ntt(lacpdu, port); __record_pdu(lacpdu, port); port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(port->actor_oper_port_state & LACP_STATE_LACP_TIMEOUT)); port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; default: break; } } } /** * ad_churn_machine - handle port churn's state machine * @port: the port we're looking at * */ static void ad_churn_machine(struct port *port) { if (port->sm_vars & AD_PORT_CHURNED) { port->sm_vars &= ~AD_PORT_CHURNED; port->sm_churn_actor_state = AD_CHURN_MONITOR; port->sm_churn_partner_state = AD_CHURN_MONITOR; port->sm_churn_actor_timer_counter = __ad_timer_to_ticks(AD_ACTOR_CHURN_TIMER, 0); port->sm_churn_partner_timer_counter = __ad_timer_to_ticks(AD_PARTNER_CHURN_TIMER, 0); return; } if (port->sm_churn_actor_timer_counter && !(--port->sm_churn_actor_timer_counter) && port->sm_churn_actor_state == AD_CHURN_MONITOR) { if (port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION) { port->sm_churn_actor_state = AD_NO_CHURN; } else { port->churn_actor_count++; port->sm_churn_actor_state = AD_CHURN; } } if (port->sm_churn_partner_timer_counter && !(--port->sm_churn_partner_timer_counter) && port->sm_churn_partner_state == AD_CHURN_MONITOR) { if (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) { port->sm_churn_partner_state = AD_NO_CHURN; } else { port->churn_partner_count++; port->sm_churn_partner_state = AD_CHURN; } } } /** * ad_tx_machine - handle a port's tx state machine * @port: the port we're looking at */ static void ad_tx_machine(struct port *port) { /* check if tx timer expired, to verify that we do not send more than * 3 packets per second */ if (port->sm_tx_timer_counter && !(--port->sm_tx_timer_counter)) { /* check if there is something to send */ if (port->ntt && (port->sm_vars & AD_PORT_LACP_ENABLED)) { __update_lacpdu_from_port(port); if (ad_lacpdu_send(port) >= 0) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Sent LACPDU on port %d\n", port->actor_port_number); /* mark ntt as false, so it will not be sent * again until demanded */ port->ntt = false; } } /* restart tx timer(to verify that we will not exceed * AD_MAX_TX_IN_SECOND */ port->sm_tx_timer_counter = ad_ticks_per_sec/AD_MAX_TX_IN_SECOND; } } /** * ad_periodic_machine - handle a port's periodic state machine * @port: the port we're looking at * @bond_params: bond parameters we will use * * Turn ntt flag on priodically to perform periodic transmission of lacpdu's. */ static void ad_periodic_machine(struct port *port, struct bond_params *bond_params) { periodic_states_t last_state; /* keep current state machine state to compare later if it was changed */ last_state = port->sm_periodic_state; /* check if port was reinitialized */ if (((port->sm_vars & AD_PORT_BEGIN) || !(port->sm_vars & AD_PORT_LACP_ENABLED) || !port->is_enabled) || (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) || !bond_params->lacp_active) { port->sm_periodic_state = AD_NO_PERIODIC; } /* check if state machine should change state */ else if (port->sm_periodic_timer_counter) { /* check if periodic state machine expired */ if (!(--port->sm_periodic_timer_counter)) { /* if expired then do tx */ port->sm_periodic_state = AD_PERIODIC_TX; } else { /* If not expired, check if there is some new timeout * parameter from the partner state */ switch (port->sm_periodic_state) { case AD_FAST_PERIODIC: if (!(port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) port->sm_periodic_state = AD_SLOW_PERIODIC; break; case AD_SLOW_PERIODIC: if ((port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) { port->sm_periodic_timer_counter = 0; port->sm_periodic_state = AD_PERIODIC_TX; } break; default: break; } } } else { switch (port->sm_periodic_state) { case AD_NO_PERIODIC: port->sm_periodic_state = AD_FAST_PERIODIC; break; case AD_PERIODIC_TX: if (!(port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) port->sm_periodic_state = AD_SLOW_PERIODIC; else port->sm_periodic_state = AD_FAST_PERIODIC; break; default: break; } } /* check if the state machine was changed */ if (port->sm_periodic_state != last_state) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Periodic Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_periodic_state); switch (port->sm_periodic_state) { case AD_NO_PERIODIC: port->sm_periodic_timer_counter = 0; break; case AD_FAST_PERIODIC: /* decrement 1 tick we lost in the PERIODIC_TX cycle */ port->sm_periodic_timer_counter = __ad_timer_to_ticks(AD_PERIODIC_TIMER, (u16)(AD_FAST_PERIODIC_TIME))-1; break; case AD_SLOW_PERIODIC: /* decrement 1 tick we lost in the PERIODIC_TX cycle */ port->sm_periodic_timer_counter = __ad_timer_to_ticks(AD_PERIODIC_TIMER, (u16)(AD_SLOW_PERIODIC_TIME))-1; break; case AD_PERIODIC_TX: port->ntt = true; break; default: break; } } } /** * ad_port_selection_logic - select aggregation groups * @port: the port we're looking at * @update_slave_arr: Does slave array need update? * * Select aggregation groups, and assign each port for it's aggregetor. The * selection logic is called in the inititalization (after all the handshkes), * and after every lacpdu receive (if selected is off). */ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) { struct aggregator *aggregator, *free_aggregator = NULL, *temp_aggregator; struct port *last_port = NULL, *curr_port; struct list_head *iter; struct bonding *bond; struct slave *slave; int found = 0; /* if the port is already Selected, do nothing */ if (port->sm_vars & AD_PORT_SELECTED) return; bond = __get_bond_by_port(port); /* if the port is connected to other aggregator, detach it */ if (port->aggregator) { /* detach the port from its former aggregator */ temp_aggregator = port->aggregator; for (curr_port = temp_aggregator->lag_ports; curr_port; last_port = curr_port, curr_port = curr_port->next_port_in_aggregator) { if (curr_port == port) { temp_aggregator->num_of_ports--; /* if it is the first port attached to the * aggregator */ if (!last_port) { temp_aggregator->lag_ports = port->next_port_in_aggregator; } else { /* not the first port attached to the * aggregator */ last_port->next_port_in_aggregator = port->next_port_in_aggregator; } /* clear the port's relations to this * aggregator */ port->aggregator = NULL; port->next_port_in_aggregator = NULL; port->actor_port_aggregator_identifier = 0; slave_dbg(bond->dev, port->slave->dev, "Port %d left LAG %d\n", port->actor_port_number, temp_aggregator->aggregator_identifier); /* if the aggregator is empty, clear its * parameters, and set it ready to be attached */ if (!temp_aggregator->lag_ports) ad_clear_agg(temp_aggregator); break; } } if (!curr_port) { /* meaning: the port was related to an aggregator * but was not on the aggregator port list */ net_warn_ratelimited("%s: (slave %s): Warning: Port %d was related to aggregator %d but was not on its port list\n", port->slave->bond->dev->name, port->slave->dev->name, port->actor_port_number, port->aggregator->aggregator_identifier); } } /* search on all aggregators for a suitable aggregator for this port */ bond_for_each_slave(bond, slave, iter) { aggregator = &(SLAVE_AD_INFO(slave)->aggregator); /* keep a free aggregator for later use(if needed) */ if (!aggregator->lag_ports) { if (!free_aggregator) free_aggregator = aggregator; continue; } /* check if current aggregator suits us */ if (((aggregator->actor_oper_aggregator_key == port->actor_oper_port_key) && /* if all parameters match AND */ MAC_ADDRESS_EQUAL(&(aggregator->partner_system), &(port->partner_oper.system)) && (aggregator->partner_system_priority == port->partner_oper.system_priority) && (aggregator->partner_oper_aggregator_key == port->partner_oper.key) ) && ((__agg_has_partner(aggregator) && /* partner answers */ !aggregator->is_individual) /* but is not individual OR */ ) ) { /* attach to the founded aggregator */ port->aggregator = aggregator; port->actor_port_aggregator_identifier = port->aggregator->aggregator_identifier; port->next_port_in_aggregator = aggregator->lag_ports; port->aggregator->num_of_ports++; aggregator->lag_ports = port; slave_dbg(bond->dev, slave->dev, "Port %d joined LAG %d (existing LAG)\n", port->actor_port_number, port->aggregator->aggregator_identifier); /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; found = 1; break; } } /* the port couldn't find an aggregator - attach it to a new * aggregator */ if (!found) { if (free_aggregator) { /* assign port a new aggregator */ port->aggregator = free_aggregator; port->actor_port_aggregator_identifier = port->aggregator->aggregator_identifier; /* update the new aggregator's parameters * if port was responsed from the end-user */ if (port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS) /* if port is full duplex */ port->aggregator->is_individual = false; else port->aggregator->is_individual = true; port->aggregator->actor_admin_aggregator_key = port->actor_admin_port_key; port->aggregator->actor_oper_aggregator_key = port->actor_oper_port_key; port->aggregator->partner_system = port->partner_oper.system; port->aggregator->partner_system_priority = port->partner_oper.system_priority; port->aggregator->partner_oper_aggregator_key = port->partner_oper.key; port->aggregator->receive_state = 1; port->aggregator->transmit_state = 1; port->aggregator->lag_ports = port; port->aggregator->num_of_ports++; /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; slave_dbg(bond->dev, port->slave->dev, "Port %d joined LAG %d (new LAG)\n", port->actor_port_number, port->aggregator->aggregator_identifier); } else { slave_err(bond->dev, port->slave->dev, "Port %d did not find a suitable aggregator\n", port->actor_port_number); return; } } /* if all aggregator's ports are READY_N == TRUE, set ready=TRUE * in all aggregator's ports, else set ready=FALSE in all * aggregator's ports */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); aggregator = __get_first_agg(port); ad_agg_selection_logic(aggregator, update_slave_arr); if (!port->aggregator->is_active) port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; } /* Decide if "agg" is a better choice for the new active aggregator that * the current best, according to the ad_select policy. */ static struct aggregator *ad_agg_selection_test(struct aggregator *best, struct aggregator *curr) { /* 0. If no best, select current. * * 1. If the current agg is not individual, and the best is * individual, select current. * * 2. If current agg is individual and the best is not, keep best. * * 3. Therefore, current and best are both individual or both not * individual, so: * * 3a. If current agg partner replied, and best agg partner did not, * select current. * * 3b. If current agg partner did not reply and best agg partner * did reply, keep best. * * 4. Therefore, current and best both have partner replies or * both do not, so perform selection policy: * * BOND_AD_COUNT: Select by count of ports. If count is equal, * select by bandwidth. * * BOND_AD_STABLE, BOND_AD_BANDWIDTH: Select by bandwidth. */ if (!best) return curr; if (!curr->is_individual && best->is_individual) return curr; if (curr->is_individual && !best->is_individual) return best; if (__agg_has_partner(curr) && !__agg_has_partner(best)) return curr; if (!__agg_has_partner(curr) && __agg_has_partner(best)) return best; switch (__get_agg_selection_mode(curr->lag_ports)) { case BOND_AD_COUNT: if (__agg_active_ports(curr) > __agg_active_ports(best)) return curr; if (__agg_active_ports(curr) < __agg_active_ports(best)) return best; fallthrough; case BOND_AD_STABLE: case BOND_AD_BANDWIDTH: if (__get_agg_bandwidth(curr) > __get_agg_bandwidth(best)) return curr; break; default: net_warn_ratelimited("%s: (slave %s): Impossible agg select mode %d\n", curr->slave->bond->dev->name, curr->slave->dev->name, __get_agg_selection_mode(curr->lag_ports)); break; } return best; } static int agg_device_up(const struct aggregator *agg) { struct port *port = agg->lag_ports; if (!port) return 0; for (port = agg->lag_ports; port; port = port->next_port_in_aggregator) { if (netif_running(port->slave->dev) && netif_carrier_ok(port->slave->dev)) return 1; } return 0; } /** * ad_agg_selection_logic - select an aggregation group for a team * @agg: the aggregator we're looking at * @update_slave_arr: Does slave array need update? * * It is assumed that only one aggregator may be selected for a team. * * The logic of this function is to select the aggregator according to * the ad_select policy: * * BOND_AD_STABLE: select the aggregator with the most ports attached to * it, and to reselect the active aggregator only if the previous * aggregator has no more ports related to it. * * BOND_AD_BANDWIDTH: select the aggregator with the highest total * bandwidth, and reselect whenever a link state change takes place or the * set of slaves in the bond changes. * * BOND_AD_COUNT: select the aggregator with largest number of ports * (slaves), and reselect whenever a link state change takes place or the * set of slaves in the bond changes. * * FIXME: this function MUST be called with the first agg in the bond, or * __get_active_agg() won't work correctly. This function should be better * called with the bond itself, and retrieve the first agg from it. */ static void ad_agg_selection_logic(struct aggregator *agg, bool *update_slave_arr) { struct aggregator *best, *active, *origin; struct bonding *bond = agg->slave->bond; struct list_head *iter; struct slave *slave; struct port *port; rcu_read_lock(); origin = agg; active = __get_active_agg(agg); best = (active && agg_device_up(active)) ? active : NULL; bond_for_each_slave_rcu(bond, slave, iter) { agg = &(SLAVE_AD_INFO(slave)->aggregator); agg->is_active = 0; if (__agg_active_ports(agg) && agg_device_up(agg)) best = ad_agg_selection_test(best, agg); } if (best && __get_agg_selection_mode(best->lag_ports) == BOND_AD_STABLE) { /* For the STABLE policy, don't replace the old active * aggregator if it's still active (it has an answering * partner) or if both the best and active don't have an * answering partner. */ if (active && active->lag_ports && __agg_active_ports(active) && (__agg_has_partner(active) || (!__agg_has_partner(active) && !__agg_has_partner(best)))) { if (!(!active->actor_oper_aggregator_key && best->actor_oper_aggregator_key)) { best = NULL; active->is_active = 1; } } } if (best && (best == active)) { best = NULL; active->is_active = 1; } /* if there is new best aggregator, activate it */ if (best) { netdev_dbg(bond->dev, "(slave %s): best Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier, best->num_of_ports, best->actor_oper_aggregator_key, best->partner_oper_aggregator_key, best->is_individual, best->is_active); netdev_dbg(bond->dev, "(slave %s): best ports %p slave %p\n", best->slave ? best->slave->dev->name : "NULL", best->lag_ports, best->slave); bond_for_each_slave_rcu(bond, slave, iter) { agg = &(SLAVE_AD_INFO(slave)->aggregator); slave_dbg(bond->dev, slave->dev, "Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", agg->aggregator_identifier, agg->num_of_ports, agg->actor_oper_aggregator_key, agg->partner_oper_aggregator_key, agg->is_individual, agg->is_active); } /* check if any partner replies */ if (best->is_individual) net_warn_ratelimited("%s: Warning: No 802.3ad response from the link partner for any adapters in the bond\n", bond->dev->name); best->is_active = 1; netdev_dbg(bond->dev, "(slave %s): LAG %d chosen as the active LAG\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier); netdev_dbg(bond->dev, "(slave %s): Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier, best->num_of_ports, best->actor_oper_aggregator_key, best->partner_oper_aggregator_key, best->is_individual, best->is_active); /* disable the ports that were related to the former * active_aggregator */ if (active) { for (port = active->lag_ports; port; port = port->next_port_in_aggregator) { __disable_port(port); } } /* Slave array needs update. */ *update_slave_arr = true; } /* if the selected aggregator is of join individuals * (partner_system is NULL), enable their ports */ active = __get_active_agg(origin); if (active) { if (!__agg_has_partner(active)) { for (port = active->lag_ports; port; port = port->next_port_in_aggregator) { __enable_port(port); } *update_slave_arr = true; } } rcu_read_unlock(); bond_3ad_set_carrier(bond); } /** * ad_clear_agg - clear a given aggregator's parameters * @aggregator: the aggregator we're looking at */ static void ad_clear_agg(struct aggregator *aggregator) { if (aggregator) { aggregator->is_individual = false; aggregator->actor_admin_aggregator_key = 0; aggregator->actor_oper_aggregator_key = 0; eth_zero_addr(aggregator->partner_system.mac_addr_value); aggregator->partner_system_priority = 0; aggregator->partner_oper_aggregator_key = 0; aggregator->receive_state = 0; aggregator->transmit_state = 0; aggregator->lag_ports = NULL; aggregator->is_active = 0; aggregator->num_of_ports = 0; pr_debug("%s: LAG %d was cleared\n", aggregator->slave ? aggregator->slave->dev->name : "NULL", aggregator->aggregator_identifier); } } /** * ad_initialize_agg - initialize a given aggregator's parameters * @aggregator: the aggregator we're looking at */ static void ad_initialize_agg(struct aggregator *aggregator) { if (aggregator) { ad_clear_agg(aggregator); eth_zero_addr(aggregator->aggregator_mac_address.mac_addr_value); aggregator->aggregator_identifier = 0; aggregator->slave = NULL; } } /** * ad_initialize_port - initialize a given port's parameters * @port: the port we're looking at * @lacp_fast: boolean. whether fast periodic should be used */ static void ad_initialize_port(struct port *port, int lacp_fast) { static const struct port_params tmpl = { .system_priority = 0xffff, .key = 1, .port_number = 1, .port_priority = 0xff, .port_state = 1, }; static const struct lacpdu lacpdu = { .subtype = 0x01, .version_number = 0x01, .tlv_type_actor_info = 0x01, .actor_information_length = 0x14, .tlv_type_partner_info = 0x02, .partner_information_length = 0x14, .tlv_type_collector_info = 0x03, .collector_information_length = 0x10, .collector_max_delay = htons(AD_COLLECTOR_MAX_DELAY), }; if (port) { port->actor_port_priority = 0xff; port->actor_port_aggregator_identifier = 0; port->ntt = false; port->actor_admin_port_state = LACP_STATE_AGGREGATION | LACP_STATE_LACP_ACTIVITY; port->actor_oper_port_state = LACP_STATE_AGGREGATION | LACP_STATE_LACP_ACTIVITY; if (lacp_fast) port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT; memcpy(&port->partner_admin, &tmpl, sizeof(tmpl)); memcpy(&port->partner_oper, &tmpl, sizeof(tmpl)); port->is_enabled = true; /* private parameters */ port->sm_vars = AD_PORT_BEGIN | AD_PORT_LACP_ENABLED; port->sm_rx_state = 0; port->sm_rx_timer_counter = 0; port->sm_periodic_state = 0; port->sm_periodic_timer_counter = 0; port->sm_mux_state = 0; port->sm_mux_timer_counter = 0; port->sm_tx_state = 0; port->aggregator = NULL; port->next_port_in_aggregator = NULL; port->transaction_id = 0; port->sm_churn_actor_timer_counter = 0; port->sm_churn_actor_state = 0; port->churn_actor_count = 0; port->sm_churn_partner_timer_counter = 0; port->sm_churn_partner_state = 0; port->churn_partner_count = 0; memcpy(&port->lacpdu, &lacpdu, sizeof(lacpdu)); } } /** * ad_enable_collecting - enable a port's receive * @port: the port we're looking at * * Enable @port if it's in an active aggregator */ static void ad_enable_collecting(struct port *port) { if (port->aggregator->is_active) { struct slave *slave = port->slave; slave_dbg(slave->bond->dev, slave->dev, "Enabling collecting on port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __enable_collecting_port(port); } } /** * ad_disable_distributing - disable a port's transmit * @port: the port we're looking at * @update_slave_arr: Does slave array need update? */ static void ad_disable_distributing(struct port *port, bool *update_slave_arr) { if (port->aggregator && __agg_has_partner(port->aggregator)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling distributing on port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __disable_distributing_port(port); /* Slave array needs an update */ *update_slave_arr = true; } } /** * ad_enable_collecting_distributing - enable a port's transmit/receive * @port: the port we're looking at * @update_slave_arr: Does slave array need update? * * Enable @port if it's in an active aggregator */ static void ad_enable_collecting_distributing(struct port *port, bool *update_slave_arr) { if (port->aggregator->is_active) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Enabling port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __enable_port(port); /* Slave array needs update */ *update_slave_arr = true; } } /** * ad_disable_collecting_distributing - disable a port's transmit/receive * @port: the port we're looking at * @update_slave_arr: Does slave array need update? */ static void ad_disable_collecting_distributing(struct port *port, bool *update_slave_arr) { if (port->aggregator && __agg_has_partner(port->aggregator)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __disable_port(port); /* Slave array needs an update */ *update_slave_arr = true; } } /** * ad_marker_info_received - handle receive of a Marker information frame * @marker_info: Marker info received * @port: the port we're looking at */ static void ad_marker_info_received(struct bond_marker *marker_info, struct port *port) { struct bond_marker marker; atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.marker_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.marker_rx); /* copy the received marker data to the response marker */ memcpy(&marker, marker_info, sizeof(struct bond_marker)); /* change the marker subtype to marker response */ marker.tlv_type = AD_MARKER_RESPONSE_SUBTYPE; /* send the marker response */ if (ad_marker_send(port, &marker) >= 0) slave_dbg(port->slave->bond->dev, port->slave->dev, "Sent Marker Response on port %d\n", port->actor_port_number); } /** * ad_marker_response_received - handle receive of a marker response frame * @marker: marker PDU received * @port: the port we're looking at * * This function does nothing since we decided not to implement send and handle * response for marker PDU's, in this stage, but only to respond to marker * information. */ static void ad_marker_response_received(struct bond_marker *marker, struct port *port) { atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.marker_resp_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.marker_resp_rx); /* DO NOTHING, SINCE WE DECIDED NOT TO IMPLEMENT THIS FEATURE FOR NOW */ } /* ========= AD exported functions to the main bonding code ========= */ /* Check aggregators status in team every T seconds */ #define AD_AGGREGATOR_SELECTION_TIMER 8 /** * bond_3ad_initiate_agg_selection - initate aggregator selection * @bond: bonding struct * @timeout: timeout value to set * * Set the aggregation selection timer, to initiate an agg selection in * the very near future. Called during first initialization, and during * any down to up transitions of the bond. */ void bond_3ad_initiate_agg_selection(struct bonding *bond, int timeout) { atomic_set(&BOND_AD_INFO(bond).agg_select_timer, timeout); } /** * bond_3ad_initialize - initialize a bond's 802.3ad parameters and structures * @bond: bonding struct to work on * * Can be called only after the mac address of the bond is set. */ void bond_3ad_initialize(struct bonding *bond) { BOND_AD_INFO(bond).aggregator_identifier = 0; BOND_AD_INFO(bond).system.sys_priority = bond->params.ad_actor_sys_prio; if (is_zero_ether_addr(bond->params.ad_actor_system)) BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->dev->dev_addr); else BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->params.ad_actor_system); bond_3ad_initiate_agg_selection(bond, AD_AGGREGATOR_SELECTION_TIMER * ad_ticks_per_sec); } /** * bond_3ad_bind_slave - initialize a slave's port * @slave: slave struct to work on * * Returns: 0 on success * < 0 on error */ void bond_3ad_bind_slave(struct slave *slave) { struct bonding *bond = bond_get_bond_by_slave(slave); struct port *port; struct aggregator *aggregator; /* check that the slave has not been initialized yet. */ if (SLAVE_AD_INFO(slave)->port.slave != slave) { /* port initialization */ port = &(SLAVE_AD_INFO(slave)->port); ad_initialize_port(port, bond->params.lacp_fast); port->slave = slave; port->actor_port_number = SLAVE_AD_INFO(slave)->id; /* key is determined according to the link speed, duplex and * user key */ port->actor_admin_port_key = bond->params.ad_user_port_key << 6; ad_update_actor_keys(port, false); /* actor system is the bond's system */ __ad_actor_update_port(port); /* tx timer(to verify that no more than MAX_TX_IN_SECOND * lacpdu's are sent in one second) */ port->sm_tx_timer_counter = ad_ticks_per_sec/AD_MAX_TX_IN_SECOND; __disable_port(port); /* aggregator initialization */ aggregator = &(SLAVE_AD_INFO(slave)->aggregator); ad_initialize_agg(aggregator); aggregator->aggregator_mac_address = *((struct mac_addr *)bond->dev->dev_addr); aggregator->aggregator_identifier = ++BOND_AD_INFO(bond).aggregator_identifier; aggregator->slave = slave; aggregator->is_active = 0; aggregator->num_of_ports = 0; } } /** * bond_3ad_unbind_slave - deinitialize a slave's port * @slave: slave struct to work on * * Search for the aggregator that is related to this port, remove the * aggregator and assign another aggregator for other port related to it * (if any), and remove the port. */ void bond_3ad_unbind_slave(struct slave *slave) { struct port *port, *prev_port, *temp_port; struct aggregator *aggregator, *new_aggregator, *temp_aggregator; int select_new_active_agg = 0; struct bonding *bond = slave->bond; struct slave *slave_iter; struct list_head *iter; bool dummy_slave_update; /* Ignore this value as caller updates array */ /* Sync against bond_3ad_state_machine_handler() */ spin_lock_bh(&bond->mode_lock); aggregator = &(SLAVE_AD_INFO(slave)->aggregator); port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(bond->dev, slave->dev, "Trying to unbind an uninitialized port\n"); goto out; } slave_dbg(bond->dev, slave->dev, "Unbinding Link Aggregation Group %d\n", aggregator->aggregator_identifier); /* Tell the partner that this port is not suitable for aggregation */ port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; port->actor_oper_port_state &= ~LACP_STATE_AGGREGATION; __update_lacpdu_from_port(port); ad_lacpdu_send(port); /* check if this aggregator is occupied */ if (aggregator->lag_ports) { /* check if there are other ports related to this aggregator * except the port related to this slave(thats ensure us that * there is a reason to search for new aggregator, and that we * will find one */ if ((aggregator->lag_ports != port) || (aggregator->lag_ports->next_port_in_aggregator)) { /* find new aggregator for the related port(s) */ bond_for_each_slave(bond, slave_iter, iter) { new_aggregator = &(SLAVE_AD_INFO(slave_iter)->aggregator); /* if the new aggregator is empty, or it is * connected to our port only */ if (!new_aggregator->lag_ports || ((new_aggregator->lag_ports == port) && !new_aggregator->lag_ports->next_port_in_aggregator)) break; } if (!slave_iter) new_aggregator = NULL; /* if new aggregator found, copy the aggregator's * parameters and connect the related lag_ports to the * new aggregator */ if ((new_aggregator) && ((!new_aggregator->lag_ports) || ((new_aggregator->lag_ports == port) && !new_aggregator->lag_ports->next_port_in_aggregator))) { slave_dbg(bond->dev, slave->dev, "Some port(s) related to LAG %d - replacing with LAG %d\n", aggregator->aggregator_identifier, new_aggregator->aggregator_identifier); if ((new_aggregator->lag_ports == port) && new_aggregator->is_active) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); select_new_active_agg = 1; } new_aggregator->is_individual = aggregator->is_individual; new_aggregator->actor_admin_aggregator_key = aggregator->actor_admin_aggregator_key; new_aggregator->actor_oper_aggregator_key = aggregator->actor_oper_aggregator_key; new_aggregator->partner_system = aggregator->partner_system; new_aggregator->partner_system_priority = aggregator->partner_system_priority; new_aggregator->partner_oper_aggregator_key = aggregator->partner_oper_aggregator_key; new_aggregator->receive_state = aggregator->receive_state; new_aggregator->transmit_state = aggregator->transmit_state; new_aggregator->lag_ports = aggregator->lag_ports; new_aggregator->is_active = aggregator->is_active; new_aggregator->num_of_ports = aggregator->num_of_ports; /* update the information that is written on * the ports about the aggregator */ for (temp_port = aggregator->lag_ports; temp_port; temp_port = temp_port->next_port_in_aggregator) { temp_port->aggregator = new_aggregator; temp_port->actor_port_aggregator_identifier = new_aggregator->aggregator_identifier; } ad_clear_agg(aggregator); if (select_new_active_agg) ad_agg_selection_logic(__get_first_agg(port), &dummy_slave_update); } else { slave_warn(bond->dev, slave->dev, "unbinding aggregator, and could not find a new aggregator for its ports\n"); } } else { /* in case that the only port related to this * aggregator is the one we want to remove */ select_new_active_agg = aggregator->is_active; ad_clear_agg(aggregator); if (select_new_active_agg) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); /* select new active aggregator */ temp_aggregator = __get_first_agg(port); if (temp_aggregator) ad_agg_selection_logic(temp_aggregator, &dummy_slave_update); } } } slave_dbg(bond->dev, slave->dev, "Unbinding port %d\n", port->actor_port_number); /* find the aggregator that this port is connected to */ bond_for_each_slave(bond, slave_iter, iter) { temp_aggregator = &(SLAVE_AD_INFO(slave_iter)->aggregator); prev_port = NULL; /* search the port in the aggregator's related ports */ for (temp_port = temp_aggregator->lag_ports; temp_port; prev_port = temp_port, temp_port = temp_port->next_port_in_aggregator) { if (temp_port == port) { /* the aggregator found - detach the port from * this aggregator */ if (prev_port) prev_port->next_port_in_aggregator = temp_port->next_port_in_aggregator; else temp_aggregator->lag_ports = temp_port->next_port_in_aggregator; temp_aggregator->num_of_ports--; if (__agg_active_ports(temp_aggregator) == 0) { select_new_active_agg = temp_aggregator->is_active; if (temp_aggregator->num_of_ports == 0) ad_clear_agg(temp_aggregator); if (select_new_active_agg) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); /* select new active aggregator */ ad_agg_selection_logic(__get_first_agg(port), &dummy_slave_update); } } break; } } } port->slave = NULL; out: spin_unlock_bh(&bond->mode_lock); } /** * bond_3ad_update_ad_actor_settings - reflect change of actor settings to ports * @bond: bonding struct to work on * * If an ad_actor setting gets changed we need to update the individual port * settings so the bond device will use the new values when it gets upped. */ void bond_3ad_update_ad_actor_settings(struct bonding *bond) { struct list_head *iter; struct slave *slave; ASSERT_RTNL(); BOND_AD_INFO(bond).system.sys_priority = bond->params.ad_actor_sys_prio; if (is_zero_ether_addr(bond->params.ad_actor_system)) BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->dev->dev_addr); else BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->params.ad_actor_system); spin_lock_bh(&bond->mode_lock); bond_for_each_slave(bond, slave, iter) { struct port *port = &(SLAVE_AD_INFO(slave))->port; __ad_actor_update_port(port); port->ntt = true; } spin_unlock_bh(&bond->mode_lock); } /** * bond_agg_timer_advance - advance agg_select_timer * @bond: bonding structure * * Return true when agg_select_timer reaches 0. */ static bool bond_agg_timer_advance(struct bonding *bond) { int val, nval; while (1) { val = atomic_read(&BOND_AD_INFO(bond).agg_select_timer); if (!val) return false; nval = val - 1; if (atomic_cmpxchg(&BOND_AD_INFO(bond).agg_select_timer, val, nval) == val) break; } return nval == 0; } /** * bond_3ad_state_machine_handler - handle state machines timeout * @work: work context to fetch bonding struct to work on from * * The state machine handling concept in this module is to check every tick * which state machine should operate any function. The execution order is * round robin, so when we have an interaction between state machines, the * reply of one to each other might be delayed until next tick. * * This function also complete the initialization when the agg_select_timer * times out, and it selects an aggregator for the ports that are yet not * related to any aggregator, and selects the active aggregator for a bond. */ void bond_3ad_state_machine_handler(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, ad_work.work); struct aggregator *aggregator; struct list_head *iter; struct slave *slave; struct port *port; bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER; bool update_slave_arr = false; /* Lock to protect data accessed by all (e.g., port->sm_vars) and * against running with bond_3ad_unbind_slave. ad_rx_machine may run * concurrently due to incoming LACPDU as well. */ spin_lock_bh(&bond->mode_lock); rcu_read_lock(); /* check if there are any slaves */ if (!bond_has_slaves(bond)) goto re_arm; if (bond_agg_timer_advance(bond)) { slave = bond_first_slave_rcu(bond); port = slave ? &(SLAVE_AD_INFO(slave)->port) : NULL; /* select the active aggregator for the bond */ if (port) { if (!port->slave) { net_warn_ratelimited("%s: Warning: bond's first port is uninitialized\n", bond->dev->name); goto re_arm; } aggregator = __get_first_agg(port); ad_agg_selection_logic(aggregator, &update_slave_arr); } bond_3ad_set_carrier(bond); } /* for each port run the state machines */ bond_for_each_slave_rcu(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (!port->slave) { net_warn_ratelimited("%s: Warning: Found an uninitialized port\n", bond->dev->name); goto re_arm; } ad_rx_machine(NULL, port); ad_periodic_machine(port, &bond->params); ad_port_selection_logic(port, &update_slave_arr); ad_mux_machine(port, &update_slave_arr); ad_tx_machine(port); ad_churn_machine(port); /* turn off the BEGIN bit, since we already handled it */ if (port->sm_vars & AD_PORT_BEGIN) port->sm_vars &= ~AD_PORT_BEGIN; } re_arm: bond_for_each_slave_rcu(bond, slave, iter) { if (slave->should_notify) { should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW; break; } } rcu_read_unlock(); spin_unlock_bh(&bond->mode_lock); if (update_slave_arr) bond_slave_arr_work_rearm(bond, 0); if (should_notify_rtnl && rtnl_trylock()) { bond_slave_state_notify(bond); rtnl_unlock(); } queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks); } /** * bond_3ad_rx_indication - handle a received frame * @lacpdu: received lacpdu * @slave: slave struct to work on * * It is assumed that frames that were sent on this NIC don't returned as new * received frames (loopback). Since only the payload is given to this * function, it check for loopback. */ static int bond_3ad_rx_indication(struct lacpdu *lacpdu, struct slave *slave) { struct bonding *bond = slave->bond; int ret = RX_HANDLER_ANOTHER; struct bond_marker *marker; struct port *port; atomic64_t *stat; port = &(SLAVE_AD_INFO(slave)->port); if (!port->slave) { net_warn_ratelimited("%s: Warning: port of slave %s is uninitialized\n", slave->dev->name, slave->bond->dev->name); return ret; } switch (lacpdu->subtype) { case AD_TYPE_LACPDU: ret = RX_HANDLER_CONSUMED; slave_dbg(slave->bond->dev, slave->dev, "Received LACPDU on port %d\n", port->actor_port_number); /* Protect against concurrent state machines */ spin_lock(&slave->bond->mode_lock); ad_rx_machine(lacpdu, port); spin_unlock(&slave->bond->mode_lock); break; case AD_TYPE_MARKER: ret = RX_HANDLER_CONSUMED; /* No need to convert fields to Little Endian since we * don't use the marker's fields. */ marker = (struct bond_marker *)lacpdu; switch (marker->tlv_type) { case AD_MARKER_INFORMATION_SUBTYPE: slave_dbg(slave->bond->dev, slave->dev, "Received Marker Information on port %d\n", port->actor_port_number); ad_marker_info_received(marker, port); break; case AD_MARKER_RESPONSE_SUBTYPE: slave_dbg(slave->bond->dev, slave->dev, "Received Marker Response on port %d\n", port->actor_port_number); ad_marker_response_received(marker, port); break; default: slave_dbg(slave->bond->dev, slave->dev, "Received an unknown Marker subtype on port %d\n", port->actor_port_number); stat = &SLAVE_AD_INFO(slave)->stats.marker_unknown_rx; atomic64_inc(stat); stat = &BOND_AD_INFO(bond).stats.marker_unknown_rx; atomic64_inc(stat); } break; default: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_unknown_rx); atomic64_inc(&BOND_AD_INFO(bond).stats.lacpdu_unknown_rx); } return ret; } /** * ad_update_actor_keys - Update the oper / admin keys for a port based on * its current speed and duplex settings. * * @port: the port we'are looking at * @reset: Boolean to just reset the speed and the duplex part of the key * * The logic to change the oper / admin keys is: * (a) A full duplex port can participate in LACP with partner. * (b) When the speed is changed, LACP need to be reinitiated. */ static void ad_update_actor_keys(struct port *port, bool reset) { u8 duplex = 0; u16 ospeed = 0, speed = 0; u16 old_oper_key = port->actor_oper_port_key; port->actor_admin_port_key &= ~(AD_SPEED_KEY_MASKS|AD_DUPLEX_KEY_MASKS); if (!reset) { speed = __get_link_speed(port); ospeed = (old_oper_key & AD_SPEED_KEY_MASKS) >> 1; duplex = __get_duplex(port); port->actor_admin_port_key |= (speed << 1) | duplex; } port->actor_oper_port_key = port->actor_admin_port_key; if (old_oper_key != port->actor_oper_port_key) { /* Only 'duplex' port participates in LACP */ if (duplex) port->sm_vars |= AD_PORT_LACP_ENABLED; else port->sm_vars &= ~AD_PORT_LACP_ENABLED; if (!reset) { if (!speed) { slave_err(port->slave->bond->dev, port->slave->dev, "speed changed to 0 on port %d\n", port->actor_port_number); } else if (duplex && ospeed != speed) { /* Speed change restarts LACP state-machine */ port->sm_vars |= AD_PORT_BEGIN; } } } } /** * bond_3ad_adapter_speed_duplex_changed - handle a slave's speed / duplex * change indication * * @slave: slave struct to work on * * Handle reselection of aggregator (if needed) for this port. */ void bond_3ad_adapter_speed_duplex_changed(struct slave *slave) { struct port *port; port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(slave->bond->dev, slave->dev, "speed/duplex changed for uninitialized port\n"); return; } spin_lock_bh(&slave->bond->mode_lock); ad_update_actor_keys(port, false); spin_unlock_bh(&slave->bond->mode_lock); slave_dbg(slave->bond->dev, slave->dev, "Port %d changed speed/duplex\n", port->actor_port_number); } /** * bond_3ad_handle_link_change - handle a slave's link status change indication * @slave: slave struct to work on * @link: whether the link is now up or down * * Handle reselection of aggregator (if needed) for this port. */ void bond_3ad_handle_link_change(struct slave *slave, char link) { struct aggregator *agg; struct port *port; bool dummy; port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(slave->bond->dev, slave->dev, "link status changed for uninitialized port\n"); return; } spin_lock_bh(&slave->bond->mode_lock); /* on link down we are zeroing duplex and speed since * some of the adaptors(ce1000.lan) report full duplex/speed * instead of N/A(duplex) / 0(speed). * * on link up we are forcing recheck on the duplex and speed since * some of he adaptors(ce1000.lan) report. */ if (link == BOND_LINK_UP) { port->is_enabled = true; ad_update_actor_keys(port, false); } else { /* link has failed */ port->is_enabled = false; ad_update_actor_keys(port, true); } agg = __get_first_agg(port); ad_agg_selection_logic(agg, &dummy); spin_unlock_bh(&slave->bond->mode_lock); slave_dbg(slave->bond->dev, slave->dev, "Port %d changed link status to %s\n", port->actor_port_number, link == BOND_LINK_UP ? "UP" : "DOWN"); /* RTNL is held and mode_lock is released so it's safe * to update slave_array here. */ bond_update_slave_arr(slave->bond, NULL); } /** * bond_3ad_set_carrier - set link state for bonding master * @bond: bonding structure * * if we have an active aggregator, we're up, if not, we're down. * Presumes that we cannot have an active aggregator if there are * no slaves with link up. * * This behavior complies with IEEE 802.3 section 43.3.9. * * Called by bond_set_carrier(). Return zero if carrier state does not * change, nonzero if it does. */ int bond_3ad_set_carrier(struct bonding *bond) { struct aggregator *active; struct slave *first_slave; int ret = 1; rcu_read_lock(); first_slave = bond_first_slave_rcu(bond); if (!first_slave) { ret = 0; goto out; } active = __get_active_agg(&(SLAVE_AD_INFO(first_slave)->aggregator)); if (active) { /* are enough slaves available to consider link up? */ if (__agg_active_ports(active) < bond->params.min_links) { if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); goto out; } } else if (!netif_carrier_ok(bond->dev)) { netif_carrier_on(bond->dev); goto out; } } else if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); } out: rcu_read_unlock(); return ret; } /** * __bond_3ad_get_active_agg_info - get information of the active aggregator * @bond: bonding struct to work on * @ad_info: ad_info struct to fill with the bond's info * * Returns: 0 on success * < 0 on error */ int __bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info) { struct aggregator *aggregator = NULL; struct list_head *iter; struct slave *slave; struct port *port; bond_for_each_slave_rcu(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (port->aggregator && port->aggregator->is_active) { aggregator = port->aggregator; break; } } if (!aggregator) return -1; ad_info->aggregator_id = aggregator->aggregator_identifier; ad_info->ports = __agg_active_ports(aggregator); ad_info->actor_key = aggregator->actor_oper_aggregator_key; ad_info->partner_key = aggregator->partner_oper_aggregator_key; ether_addr_copy(ad_info->partner_system, aggregator->partner_system.mac_addr_value); return 0; } int bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info) { int ret; rcu_read_lock(); ret = __bond_3ad_get_active_agg_info(bond, ad_info); rcu_read_unlock(); return ret; } int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct lacpdu *lacpdu, _lacpdu; if (skb->protocol != PKT_TYPE_LACPDU) return RX_HANDLER_ANOTHER; if (!MAC_ADDRESS_EQUAL(eth_hdr(skb)->h_dest, lacpdu_mcast_addr)) return RX_HANDLER_ANOTHER; lacpdu = skb_header_pointer(skb, 0, sizeof(_lacpdu), &_lacpdu); if (!lacpdu) { atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_illegal_rx); atomic64_inc(&BOND_AD_INFO(bond).stats.lacpdu_illegal_rx); return RX_HANDLER_ANOTHER; } return bond_3ad_rx_indication(lacpdu, slave); } /** * bond_3ad_update_lacp_rate - change the lacp rate * @bond: bonding struct * * When modify lacp_rate parameter via sysfs, * update actor_oper_port_state of each port. * * Hold bond->mode_lock, * so we can modify port->actor_oper_port_state, * no matter bond is up or down. */ void bond_3ad_update_lacp_rate(struct bonding *bond) { struct port *port = NULL; struct list_head *iter; struct slave *slave; int lacp_fast; lacp_fast = bond->params.lacp_fast; spin_lock_bh(&bond->mode_lock); bond_for_each_slave(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (lacp_fast) port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT; else port->actor_oper_port_state &= ~LACP_STATE_LACP_TIMEOUT; } spin_unlock_bh(&bond->mode_lock); } size_t bond_3ad_stats_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_TX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_UNKNOWN_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_ILLEGAL_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_TX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_TX */ nla_total_size_64bit(sizeof(u64)); /* BOND_3AD_STAT_MARKER_UNKNOWN_RX */ } int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats) { u64 val; val = atomic64_read(&stats->lacpdu_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_unknown_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_UNKNOWN_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_illegal_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_ILLEGAL_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_resp_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_resp_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_unknown_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_UNKNOWN_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; return 0; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * These are the definitions needed for the tsnmap type. The tsnmap is used * to track out of order TSNs received. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Jon Grimm <jgrimm@us.ibm.com> * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Sridhar Samudrala <sri@us.ibm.com> */ #include <net/sctp/constants.h> #ifndef __sctp_tsnmap_h__ #define __sctp_tsnmap_h__ /* RFC 2960 12.2 Parameters necessary per association (i.e. the TCB) * Mapping An array of bits or bytes indicating which out of * Array order TSN's have been received (relative to the * Last Rcvd TSN). If no gaps exist, i.e. no out of * order packets have been received, this array * will be set to all zero. This structure may be * in the form of a circular buffer or bit array. */ struct sctp_tsnmap { /* This array counts the number of chunks with each TSN. * It points at one of the two buffers with which we will * ping-pong between. */ unsigned long *tsn_map; /* This is the TSN at tsn_map[0]. */ __u32 base_tsn; /* Last Rcvd : This is the last TSN received in * TSN : sequence. This value is set initially by * : taking the peer's Initial TSN, received in * : the INIT or INIT ACK chunk, and subtracting * : one from it. * * Throughout most of the specification this is called the * "Cumulative TSN ACK Point". In this case, we * ignore the advice in 12.2 in favour of the term * used in the bulk of the text. */ __u32 cumulative_tsn_ack_point; /* This is the highest TSN we've marked. */ __u32 max_tsn_seen; /* This is the minimum number of TSNs we can track. This corresponds * to the size of tsn_map. Note: the overflow_map allows us to * potentially track more than this quantity. */ __u16 len; /* Data chunks pending receipt. used by SCTP_STATUS sockopt */ __u16 pending_data; /* Record duplicate TSNs here. We clear this after * every SACK. Store up to SCTP_MAX_DUP_TSNS worth of * information. */ __u16 num_dup_tsns; __be32 dup_tsns[SCTP_MAX_DUP_TSNS]; }; struct sctp_tsnmap_iter { __u32 start; }; /* Initialize a block of memory as a tsnmap. */ struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *, __u16 len, __u32 initial_tsn, gfp_t gfp); void sctp_tsnmap_free(struct sctp_tsnmap *map); /* Test the tracking state of this TSN. * Returns: * 0 if the TSN has not yet been seen * >0 if the TSN has been seen (duplicate) * <0 if the TSN is invalid (too large to track) */ int sctp_tsnmap_check(const struct sctp_tsnmap *, __u32 tsn); /* Mark this TSN as seen. */ int sctp_tsnmap_mark(struct sctp_tsnmap *, __u32 tsn, struct sctp_transport *trans); /* Mark this TSN and all lower as seen. */ void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn); /* Retrieve the Cumulative TSN ACK Point. */ static inline __u32 sctp_tsnmap_get_ctsn(const struct sctp_tsnmap *map) { return map->cumulative_tsn_ack_point; } /* Retrieve the highest TSN we've seen. */ static inline __u32 sctp_tsnmap_get_max_tsn_seen(const struct sctp_tsnmap *map) { return map->max_tsn_seen; } /* How many duplicate TSNs are stored? */ static inline __u16 sctp_tsnmap_num_dups(struct sctp_tsnmap *map) { return map->num_dup_tsns; } /* Return pointer to duplicate tsn array as needed by SACK. */ static inline __be32 *sctp_tsnmap_get_dups(struct sctp_tsnmap *map) { map->num_dup_tsns = 0; return map->dup_tsns; } /* How many gap ack blocks do we have recorded? */ __u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map, struct sctp_gap_ack_block *gabs); /* Refresh the count on pending data. */ __u16 sctp_tsnmap_pending(struct sctp_tsnmap *map); /* Is there a gap in the TSN map? */ static inline int sctp_tsnmap_has_gap(const struct sctp_tsnmap *map) { return map->cumulative_tsn_ack_point != map->max_tsn_seen; } /* Mark a duplicate TSN. Note: limit the storage of duplicate TSN * information. */ static inline void sctp_tsnmap_mark_dup(struct sctp_tsnmap *map, __u32 tsn) { if (map->num_dup_tsns < SCTP_MAX_DUP_TSNS) map->dup_tsns[map->num_dup_tsns++] = htonl(tsn); } /* Renege a TSN that was seen. */ void sctp_tsnmap_renege(struct sctp_tsnmap *, __u32 tsn); /* Is there a gap in the TSN map? */ int sctp_tsnmap_has_gap(const struct sctp_tsnmap *); #endif /* __sctp_tsnmap_h__ */
101 102 3 3 80 80 106 107 107 105 72 71 66 66 3 3 227 222 3 91 251 252 91 91 4 4 4 4 4 2 2 2 2 3 3 6 2 5 5 5 5 2 2 2 2 2 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 /* * net/tipc/node.c: TIPC node management routines * * Copyright (c) 2000-2006, 2012-2016, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "link.h" #include "node.h" #include "name_distr.h" #include "socket.h" #include "bcast.h" #include "monitor.h" #include "discover.h" #include "netlink.h" #include "trace.h" #include "crypto.h" #define INVALID_NODE_SIG 0x10000 #define NODE_CLEANUP_AFTER 300000 /* Flags used to take different actions according to flag type * TIPC_NOTIFY_NODE_DOWN: notify node is down * TIPC_NOTIFY_NODE_UP: notify node is up * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type */ enum { TIPC_NOTIFY_NODE_DOWN = (1 << 3), TIPC_NOTIFY_NODE_UP = (1 << 4), TIPC_NOTIFY_LINK_UP = (1 << 6), TIPC_NOTIFY_LINK_DOWN = (1 << 7) }; struct tipc_link_entry { struct tipc_link *link; spinlock_t lock; /* per link */ u32 mtu; struct sk_buff_head inputq; struct tipc_media_addr maddr; }; struct tipc_bclink_entry { struct tipc_link *link; struct sk_buff_head inputq1; struct sk_buff_head arrvq; struct sk_buff_head inputq2; struct sk_buff_head namedq; u16 named_rcv_nxt; bool named_open; }; /** * struct tipc_node - TIPC node structure * @addr: network address of node * @kref: reference counter to node object * @lock: rwlock governing access to structure * @net: the applicable net namespace * @hash: links to adjacent nodes in unsorted hash chain * @active_links: bearer ids of active links, used as index into links[] array * @links: array containing references to all links to node * @bc_entry: broadcast link entry * @action_flags: bit mask of different types of node actions * @state: connectivity state vs peer node * @preliminary: a preliminary node or not * @failover_sent: failover sent or not * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) * @link_cnt: number of links to node * @capabilities: bitmap, indicating peer node's functional capabilities * @signature: node instance identifier * @link_id: local and remote bearer ids of changing link, if any * @peer_id: 128-bit ID of peer * @peer_id_string: ID string of peer * @publ_list: list of publications * @conn_sks: list of connections (FIXME) * @timer: node's keepalive timer * @keepalive_intv: keepalive interval in milliseconds * @rcu: rcu struct for tipc_node * @delete_at: indicates the time for deleting a down node * @peer_net: peer's net namespace * @peer_hash_mix: hash for this peer (FIXME) * @crypto_rx: RX crypto handler */ struct tipc_node { u32 addr; struct kref kref; rwlock_t lock; struct net *net; struct hlist_node hash; int active_links[2]; struct tipc_link_entry links[MAX_BEARERS]; struct tipc_bclink_entry bc_entry; int action_flags; struct list_head list; int state; bool preliminary; bool failover_sent; u16 sync_point; int link_cnt; u16 working_links; u16 capabilities; u32 signature; u32 link_id; u8 peer_id[16]; char peer_id_string[NODE_ID_STR_LEN]; struct list_head publ_list; struct list_head conn_sks; unsigned long keepalive_intv; struct timer_list timer; struct rcu_head rcu; unsigned long delete_at; struct net *peer_net; u32 peer_hash_mix; #ifdef CONFIG_TIPC_CRYPTO struct tipc_crypto *crypto_rx; #endif }; /* Node FSM states and events: */ enum { SELF_DOWN_PEER_DOWN = 0xdd, SELF_UP_PEER_UP = 0xaa, SELF_DOWN_PEER_LEAVING = 0xd1, SELF_UP_PEER_COMING = 0xac, SELF_COMING_PEER_UP = 0xca, SELF_LEAVING_PEER_DOWN = 0x1d, NODE_FAILINGOVER = 0xf0, NODE_SYNCHING = 0xcc }; enum { SELF_ESTABL_CONTACT_EVT = 0xece, SELF_LOST_CONTACT_EVT = 0x1ce, PEER_ESTABL_CONTACT_EVT = 0x9ece, PEER_LOST_CONTACT_EVT = 0x91ce, NODE_FAILOVER_BEGIN_EVT = 0xfbe, NODE_FAILOVER_END_EVT = 0xfee, NODE_SYNCH_BEGIN_EVT = 0xcbe, NODE_SYNCH_END_EVT = 0xcee }; static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr **maddr); static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete); static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq); static void tipc_node_delete(struct tipc_node *node); static void tipc_node_timeout(struct timer_list *t); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); static bool node_is_up(struct tipc_node *n); static void tipc_node_delete_from_list(struct tipc_node *node); struct tipc_sock_conn { u32 port; u32 peer_port; u32 peer_node; struct list_head list; }; static struct tipc_link *node_active_link(struct tipc_node *n, int sel) { int bearer_id = n->active_links[sel & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) return NULL; return n->links[bearer_id].link; } int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected) { struct tipc_node *n; int bearer_id; unsigned int mtu = MAX_MSG_SIZE; n = tipc_node_find(net, addr); if (unlikely(!n)) return mtu; /* Allow MAX_MSG_SIZE when building connection oriented message * if they are in the same core network */ if (n->peer_net && connected) { tipc_node_put(n); return mtu; } bearer_id = n->active_links[sel & 1]; if (likely(bearer_id != INVALID_BEARER_ID)) mtu = n->links[bearer_id].mtu; tipc_node_put(n); return mtu; } bool tipc_node_get_id(struct net *net, u32 addr, u8 *id) { u8 *own_id = tipc_own_id(net); struct tipc_node *n; if (!own_id) return true; if (addr == tipc_own_addr(net)) { memcpy(id, own_id, TIPC_NODEID_LEN); return true; } n = tipc_node_find(net, addr); if (!n) return false; memcpy(id, &n->peer_id, TIPC_NODEID_LEN); tipc_node_put(n); return true; } u16 tipc_node_get_capabilities(struct net *net, u32 addr) { struct tipc_node *n; u16 caps; n = tipc_node_find(net, addr); if (unlikely(!n)) return TIPC_NODE_CAPABILITIES; caps = n->capabilities; tipc_node_put(n); return caps; } u32 tipc_node_get_addr(struct tipc_node *node) { return (node) ? node->addr : 0; } char *tipc_node_get_id_str(struct tipc_node *node) { return node->peer_id_string; } #ifdef CONFIG_TIPC_CRYPTO /** * tipc_node_crypto_rx - Retrieve crypto RX handle from node * @__n: target tipc_node * Note: node ref counter must be held first! */ struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n) { return (__n) ? __n->crypto_rx : NULL; } struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) { return container_of(pos, struct tipc_node, list)->crypto_rx; } struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr) { struct tipc_node *n; n = tipc_node_find(net, addr); return (n) ? n->crypto_rx : NULL; } #endif static void tipc_node_free(struct rcu_head *rp) { struct tipc_node *n = container_of(rp, struct tipc_node, rcu); #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_stop(&n->crypto_rx); #endif kfree(n); } static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); kfree(n->bc_entry.link); call_rcu(&n->rcu, tipc_node_free); } void tipc_node_put(struct tipc_node *node) { kref_put(&node->kref, tipc_node_kref_release); } void tipc_node_get(struct tipc_node *node) { kref_get(&node->kref); } /* * tipc_node_find - locate specified node object, if it exists */ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node; unsigned int thash = tipc_hashfn(addr); rcu_read_lock(); hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { if (node->addr != addr || node->preliminary) continue; if (!kref_get_unless_zero(&node->kref)) node = NULL; break; } rcu_read_unlock(); return node; } /* tipc_node_find_by_id - locate specified node object by its 128-bit id * Note: this function is called only when a discovery request failed * to find the node by its 32-bit id, and is not time critical */ static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool found = false; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { read_lock_bh(&n->lock); if (!memcmp(id, n->peer_id, 16) && kref_get_unless_zero(&n->kref)) found = true; read_unlock_bh(&n->lock); if (found) break; } rcu_read_unlock(); return found ? n : NULL; } static void tipc_node_read_lock(struct tipc_node *n) __acquires(n->lock) { read_lock_bh(&n->lock); } static void tipc_node_read_unlock(struct tipc_node *n) __releases(n->lock) { read_unlock_bh(&n->lock); } static void tipc_node_write_lock(struct tipc_node *n) __acquires(n->lock) { write_lock_bh(&n->lock); } static void tipc_node_write_unlock_fast(struct tipc_node *n) __releases(n->lock) { write_unlock_bh(&n->lock); } static void tipc_node_write_unlock(struct tipc_node *n) __releases(n->lock) { struct tipc_socket_addr sk; struct net *net = n->net; u32 flags = n->action_flags; struct list_head *publ_list; struct tipc_uaddr ua; u32 bearer_id, node; if (likely(!flags)) { write_unlock_bh(&n->lock); return; } tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, TIPC_LINK_STATE, n->addr, n->addr); sk.ref = n->link_id; sk.node = tipc_own_addr(net); node = n->addr; bearer_id = n->link_id & 0xffff; publ_list = &n->publ_list; n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP); write_unlock_bh(&n->lock); if (flags & TIPC_NOTIFY_NODE_DOWN) tipc_publ_notify(net, publ_list, node, n->capabilities); if (flags & TIPC_NOTIFY_NODE_UP) tipc_named_node_up(net, node, n->capabilities); if (flags & TIPC_NOTIFY_LINK_UP) { tipc_mon_peer_up(net, node, bearer_id); tipc_nametbl_publish(net, &ua, &sk, sk.ref); } if (flags & TIPC_NOTIFY_LINK_DOWN) { tipc_mon_peer_down(net, node, bearer_id); tipc_nametbl_withdraw(net, &ua, &sk, sk.ref); } } static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes) { int net_id = tipc_netid(n->net); struct tipc_net *tn_peer; struct net *tmp; u32 hash_chk; if (n->peer_net) return; for_each_net_rcu(tmp) { tn_peer = tipc_net(tmp); if (!tn_peer) continue; /* Integrity checking whether node exists in namespace or not */ if (tn_peer->net_id != net_id) continue; if (memcmp(n->peer_id, tn_peer->node_id, NODE_ID_LEN)) continue; hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random); if (hash_mixes ^ hash_chk) continue; n->peer_net = tmp; n->peer_hash_mix = hash_mixes; break; } } struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, u16 capabilities, u32 hash_mixes, bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l, *snd_l = tipc_bc_sndlink(net); struct tipc_node *n, *temp_node; unsigned long intv; int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr) ?: tipc_node_find_by_id(net, peer_id); if (n) { if (!n->preliminary) goto update; if (preliminary) goto exit; /* A preliminary node becomes "real" now, refresh its data */ tipc_node_write_lock(n); if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { pr_warn("Broadcast rcv link refresh failed, no memory\n"); tipc_node_write_unlock_fast(n); tipc_node_put(n); n = NULL; goto exit; } n->preliminary = false; n->addr = addr; hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_del_rcu(&n->list); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } list_add_tail_rcu(&n->list, &temp_node->list); tipc_node_write_unlock_fast(n); update: if (n->peer_hash_mix ^ hash_mixes) tipc_node_assign_peer_net(n, hash_mixes); if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ tipc_node_write_lock(n); n->capabilities = capabilities; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { l = n->links[bearer_id].link; if (l) tipc_link_update_caps(l, capabilities); } tipc_node_write_unlock_fast(n); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { pr_warn("Node creation failed, no memory\n"); goto exit; } tipc_nodeid2string(n->peer_id_string, peer_id); #ifdef CONFIG_TIPC_CRYPTO if (unlikely(tipc_crypto_start(&n->crypto_rx, net, n))) { pr_warn("Failed to start crypto RX(%s)!\n", n->peer_id_string); kfree(n); n = NULL; goto exit; } #endif n->addr = addr; n->preliminary = preliminary; memcpy(&n->peer_id, peer_id, 16); n->net = net; n->peer_net = NULL; n->peer_hash_mix = 0; /* Assign kernel local namespace if exists */ tipc_node_assign_peer_net(n, hash_mixes); n->capabilities = capabilities; kref_init(&n->kref); rwlock_init(&n->lock); INIT_HLIST_NODE(&n->hash); INIT_LIST_HEAD(&n->list); INIT_LIST_HEAD(&n->publ_list); INIT_LIST_HEAD(&n->conn_sks); skb_queue_head_init(&n->bc_entry.namedq); skb_queue_head_init(&n->bc_entry.inputq1); __skb_queue_head_init(&n->bc_entry.arrvq); skb_queue_head_init(&n->bc_entry.inputq2); for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); n->state = SELF_DOWN_PEER_LEAVING; n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; if (!preliminary && !tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { pr_warn("Broadcast rcv link creation failed, no memory\n"); tipc_node_put(n); n = NULL; goto exit; } tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); /* Start a slow timer anyway, crypto needs it */ n->keepalive_intv = 10000; intv = jiffies + msecs_to_jiffies(n->keepalive_intv); if (!mod_timer(&n->timer, intv)) tipc_node_get(n); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } list_add_tail_rcu(&n->list, &temp_node->list); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); trace_tipc_node_create(n, true, " "); exit: spin_unlock_bh(&tn->node_list_lock); return n; } static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) { unsigned long tol = tipc_link_tolerance(l); unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; /* Link with lowest tolerance determines timer interval */ if (intv < n->keepalive_intv) n->keepalive_intv = intv; /* Ensure link's abort limit corresponds to current tolerance */ tipc_link_set_abort_limit(l, tol / n->keepalive_intv); } static void tipc_node_delete_from_list(struct tipc_node *node) { #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_key_flush(node->crypto_rx); #endif list_del_rcu(&node->list); hlist_del_rcu(&node->hash); tipc_node_put(node); } static void tipc_node_delete(struct tipc_node *node) { trace_tipc_node_delete(node, true, " "); tipc_node_delete_from_list(node); del_timer_sync(&node->timer); tipc_node_put(node); } void tipc_node_stop(struct net *net) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); list_for_each_entry_safe(node, t_node, &tn->node_list, list) tipc_node_delete(node); spin_unlock_bh(&tn->node_list_lock); } void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = tipc_node_find(net, addr); if (!n) { pr_warn("Node subscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_add_tail(subscr, &n->publ_list); tipc_node_write_unlock_fast(n); tipc_node_put(n); } void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = tipc_node_find(net, addr); if (!n) { pr_warn("Node unsubscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_del_init(subscr); tipc_node_write_unlock_fast(n); tipc_node_put(n); } int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) { struct tipc_node *node; struct tipc_sock_conn *conn; int err = 0; if (in_own_node(net, dnode)) return 0; node = tipc_node_find(net, dnode); if (!node) { pr_warn("Connecting sock to node 0x%x failed\n", dnode); return -EHOSTUNREACH; } conn = kmalloc(sizeof(*conn), GFP_ATOMIC); if (!conn) { err = -EHOSTUNREACH; goto exit; } conn->peer_node = dnode; conn->port = port; conn->peer_port = peer_port; tipc_node_write_lock(node); list_add_tail(&conn->list, &node->conn_sks); tipc_node_write_unlock(node); exit: tipc_node_put(node); return err; } void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) { struct tipc_node *node; struct tipc_sock_conn *conn, *safe; if (in_own_node(net, dnode)) return; node = tipc_node_find(net, dnode); if (!node) return; tipc_node_write_lock(node); list_for_each_entry_safe(conn, safe, &node->conn_sks, list) { if (port != conn->port) continue; list_del(&conn->list); kfree(conn); } tipc_node_write_unlock(node); tipc_node_put(node); } static void tipc_node_clear_links(struct tipc_node *node) { int i; for (i = 0; i < MAX_BEARERS; i++) { struct tipc_link_entry *le = &node->links[i]; if (le->link) { kfree(le->link); le->link = NULL; node->link_cnt--; } } } /* tipc_node_cleanup - delete nodes that does not * have active links for NODE_CLEANUP_AFTER time */ static bool tipc_node_cleanup(struct tipc_node *peer) { struct tipc_node *temp_node; struct tipc_net *tn = tipc_net(peer->net); bool deleted = false; /* If lock held by tipc_node_stop() the node will be deleted anyway */ if (!spin_trylock_bh(&tn->node_list_lock)) return false; tipc_node_write_lock(peer); if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) { tipc_node_clear_links(peer); tipc_node_delete_from_list(peer); deleted = true; } tipc_node_write_unlock(peer); if (!deleted) { spin_unlock_bh(&tn->node_list_lock); return deleted; } /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(peer->net, (tn->capabilities & TIPC_BCAST_RCAST)); spin_unlock_bh(&tn->node_list_lock); return deleted; } /* tipc_node_timeout - handle expiration of node timer */ static void tipc_node_timeout(struct timer_list *t) { struct tipc_node *n = from_timer(n, t, timer); struct tipc_link_entry *le; struct sk_buff_head xmitq; int remains = n->link_cnt; int bearer_id; int rc = 0; trace_tipc_node_timeout(n, false, " "); if (!node_is_up(n) && tipc_node_cleanup(n)) { /*Removing the reference of Timer*/ tipc_node_put(n); return; } #ifdef CONFIG_TIPC_CRYPTO /* Take any crypto key related actions first */ tipc_crypto_timeout(n->crypto_rx); #endif __skb_queue_head_init(&xmitq); /* Initial node interval to value larger (10 seconds), then it will be * recalculated with link lowest tolerance */ tipc_node_read_lock(n); n->keepalive_intv = 10000; tipc_node_read_unlock(n); for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { tipc_node_read_lock(n); le = &n->links[bearer_id]; if (le->link) { spin_lock_bh(&le->lock); /* Link tolerance may change asynchronously: */ tipc_node_calculate_timer(n, le->link); rc = tipc_link_timeout(le->link, &xmitq); spin_unlock_bh(&le->lock); remains--; } tipc_node_read_unlock(n); tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr, n); if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } mod_timer(&n->timer, jiffies + msecs_to_jiffies(n->keepalive_intv)); } /** * __tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * Node lock must be held by caller * Link becomes active (alone or shared) or standby, depending on its priority. */ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; struct tipc_link *ol = node_active_link(n, 0); struct tipc_link *nl = n->links[bearer_id].link; if (!nl || tipc_link_is_up(nl)) return; tipc_link_fsm_evt(nl, LINK_ESTABLISH_EVT); if (!tipc_link_is_up(nl)) return; n->working_links++; n->action_flags |= TIPC_NOTIFY_LINK_UP; n->link_id = tipc_link_id(nl); /* Leave room for tunnel header when returning 'mtu' to users: */ n->links[bearer_id].mtu = tipc_link_mss(nl); tipc_bearer_add_dest(n->net, bearer_id, n->addr); tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id); pr_debug("Established link <%s> on network plane %c\n", tipc_link_name(nl), tipc_link_plane(nl)); trace_tipc_node_link_up(n, true, " "); /* Ensure that a STATE message goes first */ tipc_link_build_state_msg(nl, xmitq); /* First link? => give it both slots */ if (!ol) { *slot0 = bearer_id; *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); n->action_flags |= TIPC_NOTIFY_NODE_UP; tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); return; } /* Second link => redistribute slots */ if (tipc_link_prio(nl) > tipc_link_prio(ol)) { pr_debug("Old link <%s> becomes standby\n", tipc_link_name(ol)); *slot0 = bearer_id; *slot1 = bearer_id; tipc_link_set_active(nl, true); tipc_link_set_active(ol, false); } else if (tipc_link_prio(nl) == tipc_link_prio(ol)) { tipc_link_set_active(nl, true); *slot1 = bearer_id; } else { pr_debug("New link <%s> is standby\n", tipc_link_name(nl)); } /* Prepare synchronization with first link */ tipc_link_tnl_prepare(ol, nl, SYNCH_MSG, xmitq); } /** * tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * * Link becomes active (alone or shared) or standby, depending on its priority. */ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_media_addr *maddr; tipc_node_write_lock(n); __tipc_node_link_up(n, bearer_id, xmitq); maddr = &n->links[bearer_id].maddr; tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr, n); tipc_node_write_unlock(n); } /** * tipc_node_link_failover() - start failover in case "half-failover" * * This function is only called in a very special situation where link * failover can be already started on peer node but not on this node. * This can happen when e.g.:: * * 1. Both links <1A-2A>, <1B-2B> down * 2. Link endpoint 2A up, but 1A still down (e.g. due to network * disturbance, wrong session, etc.) * 3. Link <1B-2B> up * 4. Link endpoint 2A down (e.g. due to link tolerance timeout) * 5. Node 2 starts failover onto link <1B-2B> * * ==> Node 1 does never start link/node failover! * * @n: tipc node structure * @l: link peer endpoint failingover (- can be NULL) * @tnl: tunnel link * @xmitq: queue for messages to be xmited on tnl link later */ static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l, struct tipc_link *tnl, struct sk_buff_head *xmitq) { /* Avoid to be "self-failover" that can never end */ if (!tipc_link_is_up(tnl)) return; /* Don't rush, failure link may be in the process of resetting */ if (l && !tipc_link_is_reset(l)) return; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_failover_prepare(l, tnl, xmitq); if (l) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); } /** * __tipc_node_link_down - handle loss of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * @maddr: output media address of the bearer */ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr **maddr) { struct tipc_link_entry *le = &n->links[*bearer_id]; int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; int i, highest = 0, prio; struct tipc_link *l, *_l, *tnl; l = n->links[*bearer_id].link; if (!l || tipc_link_is_reset(l)) return; n->working_links--; n->action_flags |= TIPC_NOTIFY_LINK_DOWN; n->link_id = tipc_link_id(l); tipc_bearer_remove_dest(n->net, *bearer_id, n->addr); pr_debug("Lost link <%s> on network plane %c\n", tipc_link_name(l), tipc_link_plane(l)); /* Select new active link if any available */ *slot0 = INVALID_BEARER_ID; *slot1 = INVALID_BEARER_ID; for (i = 0; i < MAX_BEARERS; i++) { _l = n->links[i].link; if (!_l || !tipc_link_is_up(_l)) continue; if (_l == l) continue; prio = tipc_link_prio(_l); if (prio < highest) continue; if (prio > highest) { highest = prio; *slot0 = i; *slot1 = i; continue; } *slot1 = i; } if (!node_is_up(n)) { if (tipc_link_peer_is_down(l)) tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down!"); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_reset(l); tipc_link_build_reset_msg(l, xmitq); *maddr = &n->links[*bearer_id].maddr; node_lost_contact(n, &le->inputq); tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); return; } tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); /* There is still a working link => initiate failover */ *bearer_id = n->active_links[0]; tnl = n->links[*bearer_id].link; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down -> failover!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); *maddr = &n->links[*bearer_id].maddr; } static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) { struct tipc_link_entry *le = &n->links[bearer_id]; struct tipc_media_addr *maddr = NULL; struct tipc_link *l = le->link; int old_bearer_id = bearer_id; struct sk_buff_head xmitq; if (!l) return; __skb_queue_head_init(&xmitq); tipc_node_write_lock(n); if (!tipc_link_is_establishing(l)) { __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); } else { /* Defuse pending tipc_node_link_up() */ tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); } if (delete) { kfree(l); le->link = NULL; n->link_cnt--; } trace_tipc_node_link_down(n, true, "node link down or deleted!"); tipc_node_write_unlock(n); if (delete) tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr, n); tipc_sk_rcv(n->net, &le->inputq); } static bool node_is_up(struct tipc_node *n) { return n->active_links[0] != INVALID_BEARER_ID; } bool tipc_node_is_up(struct net *net, u32 addr) { struct tipc_node *n; bool retval = false; if (in_own_node(net, addr)) return true; n = tipc_node_find(net, addr); if (!n) return false; retval = node_is_up(n); tipc_node_put(n); return retval; } static u32 tipc_node_suggest_addr(struct net *net, u32 addr) { struct tipc_node *n; addr ^= tipc_net(net)->random; while ((n = tipc_node_find(net, addr))) { tipc_node_put(n); addr++; } return addr; } /* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not * Returns suggested address if any, otherwise 0 */ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool preliminary; u32 sugg_addr; /* Suggest new address if some other peer is using this one */ n = tipc_node_find(net, addr); if (n) { if (!memcmp(n->peer_id, id, NODE_ID_LEN)) addr = 0; tipc_node_put(n); if (!addr) return 0; return tipc_node_suggest_addr(net, addr); } /* Suggest previously used address if peer is known */ n = tipc_node_find_by_id(net, id); if (n) { sugg_addr = n->addr; preliminary = n->preliminary; tipc_node_put(n); if (!preliminary) return sugg_addr; } /* Even this node may be in conflict */ if (tn->trial_addr == addr) return tipc_node_suggest_addr(net, addr); return 0; } void tipc_node_check_dest(struct net *net, u32 addr, u8 *peer_id, struct tipc_bearer *b, u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) { struct tipc_node *n; struct tipc_link *l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; bool link_up = false; bool link_is_reset = false; bool accept_addr = false; bool reset = false; char *if_name; unsigned long intv; u16 session; *dupl_addr = false; *respond = false; n = tipc_node_create(net, addr, peer_id, capabilities, hash_mixes, false); if (!n) return; tipc_node_write_lock(n); le = &n->links[b->identity]; /* Prepare to validate requesting node's signature and media address */ l = le->link; link_up = l && tipc_link_is_up(l); link_is_reset = l && tipc_link_is_reset(l); addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr)); sign_match = (signature == n->signature); /* These three flags give us eight permutations: */ if (sign_match && addr_match && link_up) { /* All is fine. Ignore requests. */ /* Peer node is not a container/local namespace */ if (!n->peer_hash_mix) n->peer_hash_mix = hash_mixes; } else if (sign_match && addr_match && !link_up) { /* Respond. The link will come up in due time */ *respond = true; } else if (sign_match && !addr_match && link_up) { /* Peer has changed i/f address without rebooting. * If so, the link will reset soon, and the next * discovery will be accepted. So we can ignore it. * It may also be a cloned or malicious peer having * chosen the same node address and signature as an * existing one. * Ignore requests until the link goes down, if ever. */ *dupl_addr = true; } else if (sign_match && !addr_match && !link_up) { /* Peer link has changed i/f address without rebooting. * It may also be a cloned or malicious peer; we can't * distinguish between the two. * The signature is correct, so we must accept. */ accept_addr = true; *respond = true; reset = true; } else if (!sign_match && addr_match && link_up) { /* Peer node rebooted. Two possibilities: * - Delayed re-discovery; this link endpoint has already * reset and re-established contact with the peer, before * receiving a discovery message from that node. * (The peer happened to receive one from this node first). * - The peer came back so fast that our side has not * discovered it yet. Probing from this side will soon * reset the link, since there can be no working link * endpoint at the peer end, and the link will re-establish. * Accept the signature, since it comes from a known peer. */ n->signature = signature; } else if (!sign_match && addr_match && !link_up) { /* The peer node has rebooted. * Accept signature, since it is a known peer. */ n->signature = signature; *respond = true; } else if (!sign_match && !addr_match && link_up) { /* Peer rebooted with new address, or a new/duplicate peer. * Ignore until the link goes down, if ever. */ *dupl_addr = true; } else if (!sign_match && !addr_match && !link_up) { /* Peer rebooted with new address, or it is a new peer. * Accept signature and address. */ n->signature = signature; accept_addr = true; *respond = true; reset = true; } if (!accept_addr) goto exit; /* Now create new link if not already existing */ if (!l) { if (n->link_cnt == 2) goto exit; if_name = strchr(b->name, ':') + 1; get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, b->min_win, b->max_win, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, &le->inputq, &n->bc_entry.namedq, &l)) { *respond = false; goto exit; } trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link created!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); if (n->state == NODE_FAILINGOVER) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); link_is_reset = tipc_link_is_reset(l); le->link = l; n->link_cnt++; tipc_node_calculate_timer(n, l); if (n->link_cnt == 1) { intv = jiffies + msecs_to_jiffies(n->keepalive_intv); if (!mod_timer(&n->timer, intv)) tipc_node_get(n); } } memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_write_unlock(n); if (reset && !link_is_reset) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } void tipc_node_delete_links(struct net *net, int bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_link_down(n, bearer_id, true); } rcu_read_unlock(); } static void tipc_node_reset_links(struct tipc_node *n) { int i; pr_warn("Resetting all links to %x\n", n->addr); trace_tipc_node_reset_links(n, true, " "); for (i = 0; i < MAX_BEARERS; i++) { tipc_node_link_down(n, i, false); } } /* tipc_node_fsm_evt - node finite state machine * Determines when contact is allowed with peer node */ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) { int state = n->state; switch (state) { case SELF_DOWN_PEER_DOWN: switch (evt) { case SELF_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_COMING; break; case PEER_ESTABL_CONTACT_EVT: state = SELF_COMING_PEER_UP; break; case SELF_LOST_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_UP_PEER_UP: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_SYNCH_BEGIN_EVT: state = NODE_SYNCHING; break; case NODE_FAILOVER_BEGIN_EVT: state = NODE_FAILINGOVER; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case NODE_SYNCH_END_EVT: case NODE_FAILOVER_END_EVT: break; default: goto illegal_evt; } break; case SELF_DOWN_PEER_LEAVING: switch (evt) { case PEER_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case SELF_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_UP_PEER_COMING: switch (evt) { case PEER_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_UP; break; case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: case NODE_SYNCH_END_EVT: case NODE_FAILOVER_BEGIN_EVT: break; case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_COMING_PEER_UP: switch (evt) { case SELF_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_UP; break; case PEER_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_LOST_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_LEAVING_PEER_DOWN: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case NODE_FAILINGOVER: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_FAILOVER_END_EVT: state = SELF_UP_PEER_UP; break; case NODE_FAILOVER_BEGIN_EVT: case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_SYNCH_BEGIN_EVT: case NODE_SYNCH_END_EVT: default: goto illegal_evt; } break; case NODE_SYNCHING: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_SYNCH_END_EVT: state = SELF_UP_PEER_UP; break; case NODE_FAILOVER_BEGIN_EVT: state = NODE_FAILINGOVER; break; case NODE_SYNCH_BEGIN_EVT: case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; default: pr_err("Unknown node fsm state %x\n", state); break; } trace_tipc_node_fsm(n->peer_id, n->state, state, evt); n->state = state; return; illegal_evt: pr_err("Illegal node fsm evt %x in state %x\n", evt, state); trace_tipc_node_fsm(n->peer_id, n->state, state, evt); } static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq) { struct tipc_sock_conn *conn, *safe; struct tipc_link *l; struct list_head *conns = &n->conn_sks; struct sk_buff *skb; uint i; pr_debug("Lost contact with %x\n", n->addr); n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); trace_tipc_node_lost_contact(n, true, " "); /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); skb_queue_purge(&n->bc_entry.namedq); /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { l = n->links[i].link; if (l) tipc_link_fsm_evt(l, LINK_FAILOVER_END_EVT); } /* Notify publications from this node */ n->action_flags |= TIPC_NOTIFY_NODE_DOWN; n->peer_net = NULL; n->peer_hash_mix = 0; /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, tipc_own_addr(n->net), conn->peer_node, conn->port, conn->peer_port, TIPC_ERR_NO_NODE); if (likely(skb)) skb_queue_tail(inputq, skb); list_del(&conn->list); kfree(conn); } } /** * tipc_node_get_linkname - get the name of a link * * @net: the applicable net namespace * @bearer_id: id of the bearer * @addr: peer node address * @linkname: link name output buffer * @len: size of @linkname output buffer * * Return: 0 on success */ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, char *linkname, size_t len) { struct tipc_link *link; int err = -EINVAL; struct tipc_node *node = tipc_node_find(net, addr); if (!node) return err; if (bearer_id >= MAX_BEARERS) goto exit; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (link) { strncpy(linkname, tipc_link_name(link), len); err = 0; } tipc_node_read_unlock(node); exit: tipc_node_put(node); return err; } /* Caller should hold node lock for the passed node */ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) { void *hdr; struct nlattr *attrs; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NODE_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NODE); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_NODE_ADDR, node->addr)) goto attr_msg_full; if (node_is_up(node)) if (nla_put_flag(msg->skb, TIPC_NLA_NODE_UP)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) { struct tipc_msg *hdr = buf_msg(skb_peek(list)); struct sk_buff_head inputq; switch (msg_user(hdr)) { case TIPC_LOW_IMPORTANCE: case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: if (msg_connected(hdr) || msg_named(hdr) || msg_direct(hdr)) { tipc_loopback_trace(peer_net, list); spin_lock_init(&list->lock); tipc_sk_rcv(peer_net, list); return; } if (msg_mcast(hdr)) { tipc_loopback_trace(peer_net, list); skb_queue_head_init(&inputq); tipc_sk_mcast_rcv(peer_net, list, &inputq); __skb_queue_purge(list); skb_queue_purge(&inputq); return; } return; case MSG_FRAGMENTER: if (tipc_msg_assemble(list)) { tipc_loopback_trace(peer_net, list); skb_queue_head_init(&inputq); tipc_sk_mcast_rcv(peer_net, list, &inputq); __skb_queue_purge(list); skb_queue_purge(&inputq); } return; case GROUP_PROTOCOL: case CONN_MANAGER: tipc_loopback_trace(peer_net, list); spin_lock_init(&list->lock); tipc_sk_rcv(peer_net, list); return; case LINK_PROTOCOL: case NAME_DISTRIBUTOR: case TUNNEL_PROTOCOL: case BCAST_PROTOCOL: return; default: return; } } /** * tipc_node_xmit() - general link level function for message sending * @net: the applicable net namespace * @list: chain of buffers containing message * @dnode: address of destination node * @selector: a number used for deterministic link selection * Consumes the buffer chain. * Return: 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector) { struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; bool node_up = false; struct net *peer_net; int bearer_id; int rc; if (in_own_node(net, dnode)) { tipc_loopback_trace(net, list); spin_lock_init(&list->lock); tipc_sk_rcv(net, list); return 0; } n = tipc_node_find(net, dnode); if (unlikely(!n)) { __skb_queue_purge(list); return -EHOSTUNREACH; } rcu_read_lock(); tipc_node_read_lock(n); node_up = node_is_up(n); peer_net = n->peer_net; tipc_node_read_unlock(n); if (node_up && peer_net && check_net(peer_net)) { /* xmit inner linux container */ tipc_lxc_xmit(peer_net, list); if (likely(skb_queue_empty(list))) { rcu_read_unlock(); tipc_node_put(n); return 0; } } rcu_read_unlock(); tipc_node_read_lock(n); bearer_id = n->active_links[selector & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); tipc_node_put(n); __skb_queue_purge(list); return -EHOSTUNREACH; } __skb_queue_head_init(&xmitq); le = &n->links[bearer_id]; spin_lock_bh(&le->lock); rc = tipc_link_xmit(le->link, list, &xmitq); spin_unlock_bh(&le->lock); tipc_node_read_unlock(n); if (unlikely(rc == -ENOBUFS)) tipc_node_link_down(n, bearer_id, false); else tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); return rc; } /* tipc_node_xmit_skb(): send single buffer to destination * Buffers sent via this function are generally TIPC_SYSTEM_IMPORTANCE * messages, which will not be rejected * The only exception is datagram messages rerouted after secondary * lookup, which are rare and safe to dispose of anyway. */ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, u32 selector) { struct sk_buff_head head; __skb_queue_head_init(&head); __skb_queue_tail(&head, skb); tipc_node_xmit(net, &head, dnode, selector); return 0; } /* tipc_node_distr_xmit(): send single buffer msgs to individual destinations * Note: this is only for SYSTEM_IMPORTANCE messages, which cannot be rejected */ int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *xmitq) { struct sk_buff *skb; u32 selector, dnode; while ((skb = __skb_dequeue(xmitq))) { selector = msg_origport(buf_msg(skb)); dnode = msg_destnode(buf_msg(skb)); tipc_node_xmit_skb(net, skb, dnode, selector); } return 0; } void tipc_node_broadcast(struct net *net, struct sk_buff *skb, int rc_dests) { struct sk_buff_head xmitq; struct sk_buff *txskb; struct tipc_node *n; u16 dummy; u32 dst; /* Use broadcast if all nodes support it */ if (!rc_dests && tipc_bcast_get_mode(net) != BCLINK_MODE_RCAST) { __skb_queue_head_init(&xmitq); __skb_queue_tail(&xmitq, skb); tipc_bcast_xmit(net, &xmitq, &dummy); return; } /* Otherwise use legacy replicast method */ rcu_read_lock(); list_for_each_entry_rcu(n, tipc_nodes(net), list) { dst = n->addr; if (in_own_node(net, dst)) continue; if (!node_is_up(n)) continue; txskb = pskb_copy(skb, GFP_ATOMIC); if (!txskb) break; msg_set_destnode(buf_msg(txskb), dst); tipc_node_xmit_skb(net, txskb, dst, 0); } rcu_read_unlock(); kfree_skb(skb); } static void tipc_node_mcast_rcv(struct tipc_node *n) { struct tipc_bclink_entry *be = &n->bc_entry; /* 'arrvq' is under inputq2's lock protection */ spin_lock_bh(&be->inputq2.lock); spin_lock_bh(&be->inputq1.lock); skb_queue_splice_tail_init(&be->inputq1, &be->arrvq); spin_unlock_bh(&be->inputq1.lock); spin_unlock_bh(&be->inputq2.lock); tipc_sk_mcast_rcv(n->net, &be->arrvq, &be->inputq2); } static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_link *ucl; int rc; rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr, xmitq); if (rc & TIPC_LINK_DOWN_EVT) { tipc_node_reset_links(n); return; } if (!(rc & TIPC_LINK_SND_STATE)) return; /* If probe message, a STATE response will be sent anyway */ if (msg_probe(hdr)) return; /* Produce a STATE message carrying broadcast NACK */ tipc_node_read_lock(n); ucl = n->links[bearer_id].link; if (ucl) tipc_link_build_state_msg(ucl, xmitq); tipc_node_read_unlock(n); } /** * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet * @bearer_id: id of bearer message arrived on * * Invoked with no locks held. */ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id) { int rc; struct sk_buff_head xmitq; struct tipc_bclink_entry *be; struct tipc_link_entry *le; struct tipc_msg *hdr = buf_msg(skb); int usr = msg_user(hdr); u32 dnode = msg_destnode(hdr); struct tipc_node *n; __skb_queue_head_init(&xmitq); /* If NACK for other node, let rcv link for that node peek into it */ if ((usr == BCAST_PROTOCOL) && (dnode != tipc_own_addr(net))) n = tipc_node_find(net, dnode); else n = tipc_node_find(net, msg_prevnode(hdr)); if (!n) { kfree_skb(skb); return; } be = &n->bc_entry; le = &n->links[bearer_id]; rc = tipc_bcast_rcv(net, be->link, skb); /* Broadcast ACKs are sent on a unicast link */ if (rc & TIPC_LINK_SND_STATE) { tipc_node_read_lock(n); tipc_link_build_state_msg(le->link, &xmitq); tipc_node_read_unlock(n); } if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); if (!skb_queue_empty(&be->inputq1)) tipc_node_mcast_rcv(n); /* Handle NAME_DISTRIBUTOR messages sent from 1.7 nodes */ if (!skb_queue_empty(&n->bc_entry.namedq)) tipc_named_rcv(net, &n->bc_entry.namedq, &n->bc_entry.named_rcv_nxt, &n->bc_entry.named_open); /* If reassembly or retransmission failure => reset all links to peer */ if (rc & TIPC_LINK_DOWN_EVT) tipc_node_reset_links(n); tipc_node_put(n); } /** * tipc_node_check_state - check and if necessary update node state * @n: target tipc_node * @skb: TIPC packet * @bearer_id: identity of bearer delivering the packet * @xmitq: queue for messages to be xmited on * Return: true if state and msg are ok, otherwise false */ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); int usr = msg_user(hdr); int mtyp = msg_type(hdr); u16 oseqno = msg_seqno(hdr); u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; int state = n->state; struct tipc_link *l, *tnl, *pl = NULL; struct tipc_media_addr *maddr; int pb_id; if (trace_tipc_node_check_state_enabled()) { trace_tipc_skb_dump(skb, false, "skb for node state check"); trace_tipc_node_check_state(n, true, " "); } l = n->links[bearer_id].link; if (!l) return false; rcv_nxt = tipc_link_rcv_nxt(l); if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) return true; /* Find parallel link, if any */ for (pb_id = 0; pb_id < MAX_BEARERS; pb_id++) { if ((pb_id != bearer_id) && n->links[pb_id].link) { pl = n->links[pb_id].link; break; } } if (!tipc_link_validate_msg(l, hdr)) { trace_tipc_skb_dump(skb, false, "PROTO invalid (2)!"); trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (2)!"); return false; } /* Check and update node accesibility if applicable */ if (state == SELF_UP_PEER_COMING) { if (!tipc_link_is_up(l)) return true; if (!msg_peer_link_is_up(hdr)) return true; tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT); } if (state == SELF_DOWN_PEER_LEAVING) { if (msg_peer_node_is_up(hdr)) return false; tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); return true; } if (state == SELF_LEAVING_PEER_DOWN) return false; /* Ignore duplicate packets */ if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt)) return true; /* Initiate or update failover mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) { syncpt = oseqno + exp_pkts - 1; if (pl && !tipc_link_is_reset(pl)) { __tipc_node_link_down(n, &pb_id, xmitq, &maddr); trace_tipc_node_link_down(n, true, "node link down <- failover!"); tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), tipc_link_inputq(l)); } /* If parallel link was already down, and this happened before * the tunnel link came up, node failover was never started. * Ensure that a FAILOVER_MSG is sent to get peer out of * NODE_FAILINGOVER state, also this node must accept * TUNNEL_MSGs from peer. */ if (n->state != NODE_FAILINGOVER) tipc_node_link_failover(n, pl, l, xmitq); /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; } /* Open parallel link when tunnel link reaches synch point */ if ((n->state == NODE_FAILINGOVER) && tipc_link_is_up(l)) { if (!more(rcv_nxt, n->sync_point)) return true; tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT); if (pl) tipc_link_fsm_evt(pl, LINK_FAILOVER_END_EVT); return true; } /* No syncing needed if only one link */ if (!pl || !tipc_link_is_up(pl)) return true; /* Initiate synch mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) { if (n->capabilities & TIPC_TUNNEL_ENHANCED) syncpt = msg_syncpt(hdr); else syncpt = msg_seqno(msg_inner_hdr(hdr)) + exp_pkts - 1; if (!tipc_link_is_up(l)) __tipc_node_link_up(n, bearer_id, xmitq); if (n->state == SELF_UP_PEER_UP) { n->sync_point = syncpt; tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT); } } /* Open tunnel link when parallel link reaches synch point */ if (n->state == NODE_SYNCHING) { if (tipc_link_is_synching(l)) { tnl = l; } else { tnl = pl; pl = l; } inputq_len = skb_queue_len(tipc_link_inputq(pl)); dlv_nxt = tipc_link_rcv_nxt(pl) - inputq_len; if (more(dlv_nxt, n->sync_point)) { tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); return true; } if (l == pl) return true; if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) return true; if (usr == LINK_PROTOCOL) return true; return false; } return true; } /** * tipc_rcv - process TIPC packets/messages arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet * @b: pointer to bearer message arrived on * * Invoked with no locks held. Bearer pointer must point to a valid bearer * structure (i.e. cannot be NULL), but bearer can be inactive. */ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct sk_buff_head xmitq; struct tipc_link_entry *le; struct tipc_msg *hdr; struct tipc_node *n; int bearer_id = b->identity; u32 self = tipc_own_addr(net); int usr, rc = 0; u16 bc_ack; #ifdef CONFIG_TIPC_CRYPTO struct tipc_ehdr *ehdr; /* Check if message must be decrypted first */ if (TIPC_SKB_CB(skb)->decrypted || !tipc_ehdr_validate(skb)) goto rcv; ehdr = (struct tipc_ehdr *)skb->data; if (likely(ehdr->user != LINK_CONFIG)) { n = tipc_node_find(net, ntohl(ehdr->addr)); if (unlikely(!n)) goto discard; } else { n = tipc_node_find_by_id(net, ehdr->id); } skb_dst_force(skb); tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); if (!skb) return; rcv: #endif /* Ensure message is well-formed before touching the header */ if (unlikely(!tipc_msg_validate(&skb))) goto discard; __skb_queue_head_init(&xmitq); hdr = buf_msg(skb); usr = msg_user(hdr); bc_ack = msg_bcast_ack(hdr); /* Handle arrival of discovery or broadcast packet */ if (unlikely(msg_non_seq(hdr))) { if (unlikely(usr == LINK_CONFIG)) return tipc_disc_rcv(net, skb, b); else return tipc_node_bc_rcv(net, skb, bearer_id); } /* Discard unicast link messages destined for another node */ if (unlikely(!msg_short(hdr) && (msg_destnode(hdr) != self))) goto discard; /* Locate neighboring node that sent packet */ n = tipc_node_find(net, msg_prevnode(hdr)); if (unlikely(!n)) goto discard; le = &n->links[bearer_id]; /* Ensure broadcast reception is in synch with peer's send state */ if (unlikely(usr == LINK_PROTOCOL)) { if (unlikely(skb_linearize(skb))) { tipc_node_put(n); goto discard; } hdr = buf_msg(skb); tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq); } else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) { tipc_bcast_ack_rcv(net, n->bc_entry.link, hdr); } /* Receive packet directly if conditions permit */ tipc_node_read_lock(n); if (likely((n->state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) { spin_lock_bh(&le->lock); if (le->link) { rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } spin_unlock_bh(&le->lock); } tipc_node_read_unlock(n); /* Check/update node state before receiving */ if (unlikely(skb)) { if (unlikely(skb_linearize(skb))) goto out_node_put; tipc_node_write_lock(n); if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) { if (le->link) { rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } } tipc_node_write_unlock(n); } if (unlikely(rc & TIPC_LINK_UP_EVT)) tipc_node_link_up(n, bearer_id, &xmitq); if (unlikely(rc & TIPC_LINK_DOWN_EVT)) tipc_node_link_down(n, bearer_id, false); if (unlikely(!skb_queue_empty(&n->bc_entry.namedq))) tipc_named_rcv(net, &n->bc_entry.namedq, &n->bc_entry.named_rcv_nxt, &n->bc_entry.named_open); if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1))) tipc_node_mcast_rcv(n); if (!skb_queue_empty(&le->inputq)) tipc_sk_rcv(net, &le->inputq); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); out_node_put: tipc_node_put(n); discard: kfree_skb(skb); } void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, int prop) { struct tipc_net *tn = tipc_net(net); int bearer_id = b->identity; struct sk_buff_head xmitq; struct tipc_link_entry *e; struct tipc_node *n; __skb_queue_head_init(&xmitq); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_write_lock(n); e = &n->links[bearer_id]; if (e->link) { if (prop == TIPC_NLA_PROP_TOL) tipc_link_set_tolerance(e->link, b->tolerance, &xmitq); else if (prop == TIPC_NLA_PROP_MTU) tipc_link_set_mtu(e->link, b->mtu); /* Update MTU for node link entry */ e->mtu = tipc_link_mss(e->link); } tipc_node_write_unlock(n); tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } rcu_read_unlock(); } int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; struct tipc_node *peer, *temp_node; u8 node_id[NODE_ID_LEN]; u64 *w0 = (u64 *)&node_id[0]; u64 *w1 = (u64 *)&node_id[8]; u32 addr; int err; /* We identify the peer by its net */ if (!info->attrs[TIPC_NLA_NET]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); if (err) return err; /* attrs[TIPC_NLA_NET_NODEID] and attrs[TIPC_NLA_NET_ADDR] are * mutually exclusive cases */ if (attrs[TIPC_NLA_NET_ADDR]) { addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); if (!addr) return -EINVAL; } if (attrs[TIPC_NLA_NET_NODEID]) { if (!attrs[TIPC_NLA_NET_NODEID_W1]) return -EINVAL; *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); addr = hash128to32(node_id); } if (in_own_node(net, addr)) return -ENOTSUPP; spin_lock_bh(&tn->node_list_lock); peer = tipc_node_find(net, addr); if (!peer) { spin_unlock_bh(&tn->node_list_lock); return -ENXIO; } tipc_node_write_lock(peer); if (peer->state != SELF_DOWN_PEER_DOWN && peer->state != SELF_DOWN_PEER_LEAVING) { tipc_node_write_unlock(peer); err = -EBUSY; goto err_out; } tipc_node_clear_links(peer); tipc_node_write_unlock(peer); tipc_node_delete(peer); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); err = 0; err_out: tipc_node_put(peer); spin_unlock_bh(&tn->node_list_lock); return err; } int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); int done = cb->args[0]; int last_addr = cb->args[1]; struct tipc_node *node; struct tipc_nl_msg msg; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (last_addr) { node = tipc_node_find(net, last_addr); if (!node) { rcu_read_unlock(); /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the NLMSG_DONE message having * the NLM_F_DUMP_INTR flag set if the node state * changed while we released the lock. */ cb->prev_seq = 1; return -EPIPE; } tipc_node_put(node); } list_for_each_entry_rcu(node, &tn->node_list, list) { if (node->preliminary) continue; if (last_addr) { if (node->addr == last_addr) last_addr = 0; else continue; } tipc_node_read_lock(node); err = __tipc_nl_add_node(&msg, node); if (err) { last_addr = node->addr; tipc_node_read_unlock(node); goto out; } tipc_node_read_unlock(node); } done = 1; out: cb->args[0] = done; cb->args[1] = last_addr; rcu_read_unlock(); return skb->len; } /* tipc_node_find_by_name - locate owner node of link by link's name * @net: the applicable net namespace * @name: pointer to link name string * @bearer_id: pointer to index in 'node->links' array where the link was found. * * Returns pointer to node owning the link, or 0 if no matching link is found. */ static struct tipc_node *tipc_node_find_by_name(struct net *net, const char *link_name, unsigned int *bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l; struct tipc_node *n; struct tipc_node *found_node = NULL; int i; *bearer_id = 0; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_read_lock(n); for (i = 0; i < MAX_BEARERS; i++) { l = n->links[i].link; if (l && !strcmp(tipc_link_name(l), link_name)) { *bearer_id = i; found_node = n; break; } } tipc_node_read_unlock(n); if (found_node) break; } rcu_read_unlock(); return found_node; } int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) { int err; int res = 0; int bearer_id; char *name; struct tipc_link *link; struct tipc_node *node; struct sk_buff_head xmitq; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); __skb_queue_head_init(&xmitq); if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_LINK_NAME]); if (strcmp(name, tipc_bclink_name) == 0) return tipc_nl_bc_link_set(net, attrs); node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) return -EINVAL; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { res = -EINVAL; goto out; } if (attrs[TIPC_NLA_LINK_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); if (err) { res = err; goto out; } if (props[TIPC_NLA_PROP_TOL]) { u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); tipc_link_set_tolerance(link, tol, &xmitq); } if (props[TIPC_NLA_PROP_PRIO]) { u32 prio; prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { u32 max_win; max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); tipc_link_set_queue_limits(link, tipc_link_min_win(link), max_win); } } out: tipc_node_read_unlock(node); tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr, NULL); return res; } int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct tipc_nl_msg msg; char *name; int err; msg.portid = info->snd_portid; msg.seq = info->snd_seq; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_LINK_NAME]); msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; if (strcmp(name, tipc_bclink_name) == 0) { err = tipc_nl_add_bc_link(net, &msg, tipc_net(net)->bcl); if (err) goto err_free; } else { int bearer_id; struct tipc_node *node; struct tipc_link *link; node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) { err = -EINVAL; goto err_free; } tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { tipc_node_read_unlock(node); err = -EINVAL; goto err_free; } err = __tipc_nl_add_link(net, &msg, link, 0); tipc_node_read_unlock(node); if (err) goto err_free; } return genlmsg_reply(msg.skb, info); err_free: nlmsg_free(msg.skb); return err; } int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) { int err; char *link_name; unsigned int bearer_id; struct tipc_link *link; struct tipc_node *node; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_link_entry *le; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]); err = -EINVAL; if (!strcmp(link_name, tipc_bclink_name)) { err = tipc_bclink_reset_stats(net, tipc_bc_sndlink(net)); if (err) return err; return 0; } else if (strstr(link_name, tipc_bclink_name)) { rcu_read_lock(); list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); link = node->bc_entry.link; if (link && !strcmp(link_name, tipc_link_name(link))) { err = tipc_bclink_reset_stats(net, link); tipc_node_read_unlock(node); break; } tipc_node_read_unlock(node); } rcu_read_unlock(); return err; } node = tipc_node_find_by_name(net, link_name, &bearer_id); if (!node) return -EINVAL; le = &node->links[bearer_id]; tipc_node_read_lock(node); spin_lock_bh(&le->lock); link = node->links[bearer_id].link; if (!link) { spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return -EINVAL; } tipc_link_reset_stats(link); spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return 0; } /* Caller should hold node lock */ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, struct tipc_node *node, u32 *prev_link, bool bc_link) { u32 i; int err; for (i = *prev_link; i < MAX_BEARERS; i++) { *prev_link = i; if (!node->links[i].link) continue; err = __tipc_nl_add_link(net, msg, node->links[i].link, NLM_F_MULTI); if (err) return err; } if (bc_link) { *prev_link = i; err = tipc_nl_add_bc_link(net, msg, node->bc_entry.link); if (err) return err; } *prev_link = 0; return 0; } int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; struct tipc_nl_msg msg; u32 prev_node = cb->args[0]; u32 prev_link = cb->args[1]; int done = cb->args[2]; bool bc_link = cb->args[3]; int err; if (done) return 0; if (!prev_node) { /* Check if broadcast-receiver links dumping is needed */ if (attrs && attrs[TIPC_NLA_LINK]) { err = nla_parse_nested_deprecated(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], tipc_nl_link_policy, NULL); if (unlikely(err)) return err; if (unlikely(!link[TIPC_NLA_LINK_BROADCAST])) return -EINVAL; bc_link = true; } } msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (prev_node) { node = tipc_node_find(net, prev_node); if (!node) { /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the last NLMSG_DONE message * having the NLM_F_DUMP_INTR flag set. */ cb->prev_seq = 1; goto out; } tipc_node_put(node); list_for_each_entry_continue_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } else { err = tipc_nl_add_bc_link(net, &msg, tn->bcl); if (err) goto out; list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } done = 1; out: rcu_read_unlock(); cb->args[0] = prev_node; cb->args[1] = prev_link; cb->args[2] = done; cb->args[3] = bc_link; return skb->len; } int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_MON_MAX + 1]; struct net *net = sock_net(skb->sk); int err; if (!info->attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_MON_MAX, info->attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, info->extack); if (err) return err; if (attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]) { u32 val; val = nla_get_u32(attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]); err = tipc_nl_monitor_set_threshold(net, val); if (err) return err; } return 0; } static int __tipc_nl_add_monitor_prop(struct net *net, struct tipc_nl_msg *msg) { struct nlattr *attrs; void *hdr; u32 val; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, 0, TIPC_NL_MON_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON); if (!attrs) goto msg_full; val = tipc_nl_monitor_get_threshold(net); if (nla_put_u32(msg->skb, TIPC_NLA_MON_ACTIVATION_THRESHOLD, val)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_nl_msg msg; int err; msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; msg.portid = info->snd_portid; msg.seq = info->snd_seq; err = __tipc_nl_add_monitor_prop(net, &msg); if (err) { nlmsg_free(msg.skb); return err; } return genlmsg_reply(msg.skb, info); } int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_bearer = cb->args[0]; struct tipc_nl_msg msg; int bearer_id; int err; if (prev_bearer == MAX_BEARERS) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); for (bearer_id = prev_bearer; bearer_id < MAX_BEARERS; bearer_id++) { err = __tipc_nl_add_monitor(net, &msg, bearer_id); if (err) break; } rtnl_unlock(); cb->args[0] = bearer_id; return skb->len; } int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_node = cb->args[1]; u32 bearer_id = cb->args[2]; int done = cb->args[0]; struct tipc_nl_msg msg; int err; if (!prev_node) { struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; if (!attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(mon, TIPC_NLA_MON_MAX, attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, NULL); if (err) return err; if (!mon[TIPC_NLA_MON_REF]) return -EINVAL; bearer_id = nla_get_u32(mon[TIPC_NLA_MON_REF]); if (bearer_id >= MAX_BEARERS) return -EINVAL; } if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); err = tipc_nl_add_monitor_peer(net, &msg, bearer_id, &prev_node); if (!err) done = 1; rtnl_unlock(); cb->args[0] = done; cb->args[1] = prev_node; cb->args[2] = bearer_id; return skb->len; } #ifdef CONFIG_TIPC_CRYPTO static int tipc_nl_retrieve_key(struct nlattr **attrs, struct tipc_aead_key **pkey) { struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY]; struct tipc_aead_key *key; if (!attr) return -ENODATA; if (nla_len(attr) < sizeof(*key)) return -EINVAL; key = (struct tipc_aead_key *)nla_data(attr); if (key->keylen > TIPC_AEAD_KEYLEN_MAX || nla_len(attr) < tipc_aead_key_size(key)) return -EINVAL; *pkey = key; return 0; } static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) { struct nlattr *attr = attrs[TIPC_NLA_NODE_ID]; if (!attr) return -ENODATA; if (nla_len(attr) < TIPC_NODEID_LEN) return -EINVAL; *node_id = (u8 *)nla_data(attr); return 0; } static int tipc_nl_retrieve_rekeying(struct nlattr **attrs, u32 *intv) { struct nlattr *attr = attrs[TIPC_NLA_NODE_REKEYING]; if (!attr) return -ENODATA; *intv = nla_get_u32(attr); return 0; } static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx; struct tipc_node *n = NULL; struct tipc_aead_key *ukey; bool rekeying = true, master_key = false; u8 *id, *own_id, mode; u32 intv = 0; int rc = 0; if (!info->attrs[TIPC_NLA_NODE]) return -EINVAL; rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX, info->attrs[TIPC_NLA_NODE], tipc_nl_node_policy, info->extack); if (rc) return rc; own_id = tipc_own_id(net); if (!own_id) { GENL_SET_ERR_MSG(info, "not found own node identity (set id?)"); return -EPERM; } rc = tipc_nl_retrieve_rekeying(attrs, &intv); if (rc == -ENODATA) rekeying = false; rc = tipc_nl_retrieve_key(attrs, &ukey); if (rc == -ENODATA && rekeying) goto rekeying; else if (rc) return rc; rc = tipc_aead_key_validate(ukey, info); if (rc) return rc; rc = tipc_nl_retrieve_nodeid(attrs, &id); switch (rc) { case -ENODATA: mode = CLUSTER_KEY; master_key = !!(attrs[TIPC_NLA_NODE_KEY_MASTER]); break; case 0: mode = PER_NODE_KEY; if (memcmp(id, own_id, NODE_ID_LEN)) { n = tipc_node_find_by_id(net, id) ?: tipc_node_create(net, 0, id, 0xffffu, 0, true); if (unlikely(!n)) return -ENOMEM; c = n->crypto_rx; } break; default: return rc; } /* Initiate the TX/RX key */ rc = tipc_crypto_key_init(c, ukey, mode, master_key); if (n) tipc_node_put(n); if (unlikely(rc < 0)) { GENL_SET_ERR_MSG(info, "unable to initiate or attach new key"); return rc; } else if (c == tx) { /* Distribute TX key but not master one */ if (!master_key && tipc_crypto_key_distr(tx, rc, NULL)) GENL_SET_ERR_MSG(info, "failed to replicate new key"); rekeying: /* Schedule TX rekeying if needed */ tipc_crypto_rekeying_sched(tx, rekeying, intv); } return 0; } int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_set_key(skb, info); rtnl_unlock(); return err; } static int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_node *n; tipc_crypto_key_flush(tn->crypto_tx); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) tipc_crypto_key_flush(n->crypto_rx); rcu_read_unlock(); return 0; } int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_flush_key(skb, info); rtnl_unlock(); return err; } #endif /** * tipc_node_dump - dump TIPC node data * @n: tipc node to be dumped * @more: dump more? * - false: dump only tipc node data * - true: dump node link data as well * @buf: returned buffer of dump data in format */ int tipc_node_dump(struct tipc_node *n, bool more, char *buf) { int i = 0; size_t sz = (more) ? NODE_LMAX : NODE_LMIN; if (!n) { i += scnprintf(buf, sz, "node data: (null)\n"); return i; } i += scnprintf(buf, sz, "node data: %x", n->addr); i += scnprintf(buf + i, sz - i, " %x", n->state); i += scnprintf(buf + i, sz - i, " %d", n->active_links[0]); i += scnprintf(buf + i, sz - i, " %d", n->active_links[1]); i += scnprintf(buf + i, sz - i, " %x", n->action_flags); i += scnprintf(buf + i, sz - i, " %u", n->failover_sent); i += scnprintf(buf + i, sz - i, " %u", n->sync_point); i += scnprintf(buf + i, sz - i, " %d", n->link_cnt); i += scnprintf(buf + i, sz - i, " %u", n->working_links); i += scnprintf(buf + i, sz - i, " %x", n->capabilities); i += scnprintf(buf + i, sz - i, " %lu\n", n->keepalive_intv); if (!more) return i; i += scnprintf(buf + i, sz - i, "link_entry[0]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[0].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[0].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[0].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[0].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "link_entry[1]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[1].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[1].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[1].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[1].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "bclink:\n "); i += tipc_link_dump(n->bc_entry.link, TIPC_DUMP_NONE, buf + i); return i; } void tipc_node_pre_cleanup_net(struct net *exit_net) { struct tipc_node *n; struct tipc_net *tn; struct net *tmp; rcu_read_lock(); for_each_net_rcu(tmp) { if (tmp == exit_net) continue; tn = tipc_net(tmp); if (!tn) continue; spin_lock_bh(&tn->node_list_lock); list_for_each_entry_rcu(n, &tn->node_list, list) { if (!n->peer_net) continue; if (n->peer_net != exit_net) continue; tipc_node_write_lock(n); n->peer_net = NULL; n->peer_hash_mix = 0; tipc_node_write_unlock_fast(n); break; } spin_unlock_bh(&tn->node_list_lock); } rcu_read_unlock(); }
109 20 315 388 93 156 107 94 94 94 93 205 203 205 205 119 156 185 107 205 204 40 203 203 54 94 94 94 94 94 38 94 97 94 93 94 94 93 94 94 94 94 94 40 94 94 94 66 94 16 16 38 38 38 38 38 15 15 15 15 15 15 15 61 61 61 61 61 229 229 229 20 226 314 274 303 207 61 302 302 313 314 313 315 315 315 94 94 315 94 304 15 15 303 61 302 108 96 17 2 125 15 311 312 214 311 312 312 239 11 127 221 10 3 224 125 105 75 75 75 125 125 125 205 125 6 119 108 118 119 5 123 6 4 1 315 2 312 136 181 7 7 101 1 96 2 6 4 3 6 94 95 1 111 109 104 184 4 782 786 785 978 981 171 976 983 580 974 973 378 453 334 411 322 393 33 33 168 836 792 796 793 11 781 785 784 125 127 45 107 125 126 39 5 234 149 136 3 132 129 126 124 2 127 126 1 126 127 127 126 64 115 34 34 34 34 34 33 34 34 34 34 34 34 18 12 16 16 16 15 16 16 15 15 15 15 15 15 15 15 15 15 15 15 201 202 199 202 202 200 199 202 202 200 202 202 102 202 202 202 199 20 201 15 18 18 1 16 18 18 18 17 18 18 100 99 53 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet * & Swedish University of Agricultural Sciences. * * Jens Laas <jens.laas@data.slu.se> Swedish University of * Agricultural Sciences. * * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet * * This work is based on the LPC-trie which is originally described in: * * An experimental study of compression methods for dynamic tries * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. * https://www.csc.kth.se/~snilsson/software/dyntrie2/ * * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 * * Code from fib_hash has been reused which includes the following header: * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 FIB: lookup engine and maintenance routines. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Substantial contributions to this work comes from: * * David S. Miller, <davem@davemloft.net> * Stephen Hemminger <shemminger@osdl.org> * Paul E. McKenney <paulmck@us.ibm.com> * Patrick McHardy <kaber@trash.net> */ #include <linux/cache.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/rcupdate_wait.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/notifier.h> #include <net/net_namespace.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/fib_notifier.h> #include <trace/events/fib.h> #include "fib_lookup.h" static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_alias *fa, struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, .dscp = fa->fa_dscp, .type = fa->fa_type, .tb_id = fa->tb_id, }; return call_fib4_notifier(nb, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_alias *fa, struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, .dscp = fa->fa_dscp, .type = fa->fa_type, .tb_id = fa->tb_id, }; return call_fib4_notifiers(net, event_type, &info.info); } #define MAX_STAT_DEPTH 32 #define KEYLENGTH (8*sizeof(t_key)) #define KEY_MAX ((t_key)~0) typedef unsigned int t_key; #define IS_TRIE(n) ((n)->pos >= KEYLENGTH) #define IS_TNODE(n) ((n)->bits) #define IS_LEAF(n) (!(n)->bits) struct key_vector { t_key key; unsigned char pos; /* 2log(KEYLENGTH) bits needed */ unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char slen; union { /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ struct hlist_head leaf; /* This array is valid if (pos | bits) > 0 (TNODE) */ DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode); }; }; struct tnode { struct rcu_head rcu; t_key empty_children; /* KEYLENGTH bits needed */ t_key full_children; /* KEYLENGTH bits needed */ struct key_vector __rcu *parent; struct key_vector kv[1]; #define tn_bits kv[0].bits }; #define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n]) #define LEAF_SIZE TNODE_SIZE(1) #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats { unsigned int gets; unsigned int backtrack; unsigned int semantic_match_passed; unsigned int semantic_match_miss; unsigned int null_node_hit; unsigned int resize_node_skipped; }; #endif struct trie_stat { unsigned int totdepth; unsigned int maxdepth; unsigned int tnodes; unsigned int leaves; unsigned int nullpointers; unsigned int prefixes; unsigned int nodesizes[MAX_STAT_DEPTH]; }; struct trie { struct key_vector kv[1]; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats; #endif }; static struct key_vector *resize(struct trie *t, struct key_vector *tn); static unsigned int tnode_free_size; /* * synchronize_rcu after call_rcu for outstanding dirty memory; it should be * especially useful before resizing the root node with PREEMPT_NONE configs; * the value was obtained experimentally, aiming to avoid visible slowdown. */ unsigned int sysctl_fib_sync_mem = 512 * 1024; unsigned int sysctl_fib_sync_mem_min = 64 * 1024; unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024; static struct kmem_cache *fn_alias_kmem __ro_after_init; static struct kmem_cache *trie_leaf_kmem __ro_after_init; static inline struct tnode *tn_info(struct key_vector *kv) { return container_of(kv, struct tnode, kv[0]); } /* caller must hold RTNL */ #define node_parent(tn) rtnl_dereference(tn_info(tn)->parent) #define get_child(tn, i) rtnl_dereference((tn)->tnode[i]) /* caller must hold RCU read lock or RTNL */ #define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent) #define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i]) /* wrapper for rcu_assign_pointer */ static inline void node_set_parent(struct key_vector *n, struct key_vector *tp) { if (n) rcu_assign_pointer(tn_info(n)->parent, tp); } #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p) /* This provides us with the number of children in this node, in the case of a * leaf this will return 0 meaning none of the children are accessible. */ static inline unsigned long child_length(const struct key_vector *tn) { return (1ul << tn->bits) & ~(1ul); } #define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos) static inline unsigned long get_index(t_key key, struct key_vector *kv) { unsigned long index = key ^ kv->key; if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos)) return 0; return index >> kv->pos; } /* To understand this stuff, an understanding of keys and all their bits is * necessary. Every node in the trie has a key associated with it, but not * all of the bits in that key are significant. * * Consider a node 'n' and its parent 'tp'. * * If n is a leaf, every bit in its key is significant. Its presence is * necessitated by path compression, since during a tree traversal (when * searching for a leaf - unless we are doing an insertion) we will completely * ignore all skipped bits we encounter. Thus we need to verify, at the end of * a potentially successful search, that we have indeed been walking the * correct key path. * * Note that we can never "miss" the correct key in the tree if present by * following the wrong path. Path compression ensures that segments of the key * that are the same for all keys with a given prefix are skipped, but the * skipped part *is* identical for each node in the subtrie below the skipped * bit! trie_insert() in this implementation takes care of that. * * if n is an internal node - a 'tnode' here, the various parts of its key * have many different meanings. * * Example: * _________________________________________________________________ * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | * ----------------------------------------------------------------- * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 * * _________________________________________________________________ * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | * ----------------------------------------------------------------- * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 * * tp->pos = 22 * tp->bits = 3 * n->pos = 13 * n->bits = 4 * * First, let's just ignore the bits that come before the parent tp, that is * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this * point we do not use them for anything. * * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the * index into the parent's child array. That is, they will be used to find * 'n' among tp's children. * * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits * for the node n. * * All the bits we have seen so far are significant to the node n. The rest * of the bits are really not needed or indeed known in n->key. * * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into * n's child array, and will of course be different for each child. * * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown * at this point. */ static const int halve_threshold = 25; static const int inflate_threshold = 50; static const int halve_threshold_root = 15; static const int inflate_threshold_root = 30; static inline void alias_free_mem_rcu(struct fib_alias *fa) { kfree_rcu(fa, rcu); } #define TNODE_VMALLOC_MAX \ ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *)) static void __node_free_rcu(struct rcu_head *head) { struct tnode *n = container_of(head, struct tnode, rcu); if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); else kvfree(n); } #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) static struct tnode *tnode_alloc(int bits) { size_t size; /* verify bits is within bounds */ if (bits > TNODE_VMALLOC_MAX) return NULL; /* determine size and verify it is non-zero and didn't overflow */ size = TNODE_SIZE(1ul << bits); if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else return vzalloc(size); } static inline void empty_child_inc(struct key_vector *n) { tn_info(n)->empty_children++; if (!tn_info(n)->empty_children) tn_info(n)->full_children++; } static inline void empty_child_dec(struct key_vector *n) { if (!tn_info(n)->empty_children) tn_info(n)->full_children--; tn_info(n)->empty_children--; } static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) { struct key_vector *l; struct tnode *kv; kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (!kv) return NULL; /* initialize key vector */ l = kv->kv; l->key = key; l->pos = 0; l->bits = 0; l->slen = fa->fa_slen; /* link leaf to fib alias */ INIT_HLIST_HEAD(&l->leaf); hlist_add_head(&fa->fa_list, &l->leaf); return l; } static struct key_vector *tnode_new(t_key key, int pos, int bits) { unsigned int shift = pos + bits; struct key_vector *tn; struct tnode *tnode; /* verify bits and pos their msb bits clear and values are valid */ BUG_ON(!bits || (shift > KEYLENGTH)); tnode = tnode_alloc(bits); if (!tnode) return NULL; pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), sizeof(struct key_vector *) << bits); if (bits == KEYLENGTH) tnode->full_children = 1; else tnode->empty_children = 1ul << bits; tn = tnode->kv; tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; tn->pos = pos; tn->bits = bits; tn->slen = pos; return tn; } /* Check whether a tnode 'n' is "full", i.e. it is an internal node * and no bits are skipped. See discussion in dyntree paper p. 6 */ static inline int tnode_full(struct key_vector *tn, struct key_vector *n) { return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n); } /* Add a child at position i overwriting the old value. * Update the value of full_children and empty_children. */ static void put_child(struct key_vector *tn, unsigned long i, struct key_vector *n) { struct key_vector *chi = get_child(tn, i); int isfull, wasfull; BUG_ON(i >= child_length(tn)); /* update emptyChildren, overflow into fullChildren */ if (!n && chi) empty_child_inc(tn); if (n && !chi) empty_child_dec(tn); /* update fullChildren */ wasfull = tnode_full(tn, chi); isfull = tnode_full(tn, n); if (wasfull && !isfull) tn_info(tn)->full_children--; else if (!wasfull && isfull) tn_info(tn)->full_children++; if (n && (tn->slen < n->slen)) tn->slen = n->slen; rcu_assign_pointer(tn->tnode[i], n); } static void update_children(struct key_vector *tn) { unsigned long i; /* update all of the child parent pointers */ for (i = child_length(tn); i;) { struct key_vector *inode = get_child(tn, --i); if (!inode) continue; /* Either update the children of a tnode that * already belongs to us or update the child * to point to ourselves. */ if (node_parent(inode) == tn) update_children(inode); else node_set_parent(inode, tn); } } static inline void put_child_root(struct key_vector *tp, t_key key, struct key_vector *n) { if (IS_TRIE(tp)) rcu_assign_pointer(tp->tnode[0], n); else put_child(tp, get_index(key, tp), n); } static inline void tnode_free_init(struct key_vector *tn) { tn_info(tn)->rcu.next = NULL; } static inline void tnode_free_append(struct key_vector *tn, struct key_vector *n) { tn_info(n)->rcu.next = tn_info(tn)->rcu.next; tn_info(tn)->rcu.next = &tn_info(n)->rcu; } static void tnode_free(struct key_vector *tn) { struct callback_head *head = &tn_info(tn)->rcu; while (head) { head = head->next; tnode_free_size += TNODE_SIZE(1ul << tn->bits); node_free(tn); tn = container_of(head, struct tnode, rcu)->kv; } if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) { tnode_free_size = 0; synchronize_net(); } } static struct key_vector *replace(struct trie *t, struct key_vector *oldtnode, struct key_vector *tn) { struct key_vector *tp = node_parent(oldtnode); unsigned long i; /* setup the parent pointer out of and back into this node */ NODE_INIT_PARENT(tn, tp); put_child_root(tp, tn->key, tn); /* update all of the child parent pointers */ update_children(tn); /* all pointers should be clean so we are done */ tnode_free(oldtnode); /* resize children now that oldtnode is freed */ for (i = child_length(tn); i;) { struct key_vector *inode = get_child(tn, --i); /* resize child node */ if (tnode_full(tn, inode)) tn = resize(t, inode); } return tp; } static struct key_vector *inflate(struct trie *t, struct key_vector *oldtnode) { struct key_vector *tn; unsigned long i; t_key m; pr_debug("In inflate\n"); tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1); if (!tn) goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); /* Assemble all of the pointers in our cluster, in this case that * represents all of the pointers out of our allocated nodes that * point to existing tnodes and the links between our allocated * nodes. */ for (i = child_length(oldtnode), m = 1u << tn->pos; i;) { struct key_vector *inode = get_child(oldtnode, --i); struct key_vector *node0, *node1; unsigned long j, k; /* An empty child */ if (!inode) continue; /* A leaf or an internal node with skipped bits */ if (!tnode_full(oldtnode, inode)) { put_child(tn, get_index(inode->key, tn), inode); continue; } /* drop the node in the old tnode free list */ tnode_free_append(oldtnode, inode); /* An internal node with two children */ if (inode->bits == 1) { put_child(tn, 2 * i + 1, get_child(inode, 1)); put_child(tn, 2 * i, get_child(inode, 0)); continue; } /* We will replace this node 'inode' with two new * ones, 'node0' and 'node1', each with half of the * original children. The two new nodes will have * a position one bit further down the key and this * means that the "significant" part of their keys * (see the discussion near the top of this file) * will differ by one bit, which will be "0" in * node0's key and "1" in node1's key. Since we are * moving the key position by one step, the bit that * we are moving away from - the bit at position * (tn->pos) - is the one that will differ between * node0 and node1. So... we synthesize that bit in the * two new keys. */ node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1); if (!node1) goto nomem; node0 = tnode_new(inode->key, inode->pos, inode->bits - 1); tnode_free_append(tn, node1); if (!node0) goto nomem; tnode_free_append(tn, node0); /* populate child pointers in new nodes */ for (k = child_length(inode), j = k / 2; j;) { put_child(node1, --j, get_child(inode, --k)); put_child(node0, j, get_child(inode, j)); put_child(node1, --j, get_child(inode, --k)); put_child(node0, j, get_child(inode, j)); } /* link new nodes to parent */ NODE_INIT_PARENT(node1, tn); NODE_INIT_PARENT(node0, tn); /* link parent to nodes */ put_child(tn, 2 * i + 1, node1); put_child(tn, 2 * i, node0); } /* setup the parent pointers into and out of this node */ return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); notnode: return NULL; } static struct key_vector *halve(struct trie *t, struct key_vector *oldtnode) { struct key_vector *tn; unsigned long i; pr_debug("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1); if (!tn) goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); /* Assemble all of the pointers in our cluster, in this case that * represents all of the pointers out of our allocated nodes that * point to existing tnodes and the links between our allocated * nodes. */ for (i = child_length(oldtnode); i;) { struct key_vector *node1 = get_child(oldtnode, --i); struct key_vector *node0 = get_child(oldtnode, --i); struct key_vector *inode; /* At least one of the children is empty */ if (!node1 || !node0) { put_child(tn, i / 2, node1 ? : node0); continue; } /* Two nonempty children */ inode = tnode_new(node0->key, oldtnode->pos, 1); if (!inode) goto nomem; tnode_free_append(tn, inode); /* initialize pointers out of node */ put_child(inode, 1, node1); put_child(inode, 0, node0); NODE_INIT_PARENT(inode, tn); /* link parent to node */ put_child(tn, i / 2, inode); } /* setup the parent pointers into and out of this node */ return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); notnode: return NULL; } static struct key_vector *collapse(struct trie *t, struct key_vector *oldtnode) { struct key_vector *n, *tp; unsigned long i; /* scan the tnode looking for that one child that might still exist */ for (n = NULL, i = child_length(oldtnode); !n && i;) n = get_child(oldtnode, --i); /* compress one level */ tp = node_parent(oldtnode); put_child_root(tp, oldtnode->key, n); node_set_parent(n, tp); /* drop dead node */ node_free(oldtnode); return tp; } static unsigned char update_suffix(struct key_vector *tn) { unsigned char slen = tn->pos; unsigned long stride, i; unsigned char slen_max; /* only vector 0 can have a suffix length greater than or equal to * tn->pos + tn->bits, the second highest node will have a suffix * length at most of tn->pos + tn->bits - 1 */ slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen); /* search though the list of children looking for nodes that might * have a suffix greater than the one we currently have. This is * why we start with a stride of 2 since a stride of 1 would * represent the nodes with suffix length equal to tn->pos */ for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) { struct key_vector *n = get_child(tn, i); if (!n || (n->slen <= slen)) continue; /* update stride and slen based on new value */ stride <<= (n->slen - slen); slen = n->slen; i &= ~(stride - 1); /* stop searching if we have hit the maximum possible value */ if (slen >= slen_max) break; } tn->slen = slen; return slen; } /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of * the Helsinki University of Technology and Matti Tikkanen of Nokia * Telecommunications, page 6: * "A node is doubled if the ratio of non-empty children to all * children in the *doubled* node is at least 'high'." * * 'high' in this instance is the variable 'inflate_threshold'. It * is expressed as a percentage, so we multiply it with * child_length() and instead of multiplying by 2 (since the * child array will be doubled by inflate()) and multiplying * the left-hand side by 100 (to handle the percentage thing) we * multiply the left-hand side by 50. * * The left-hand side may look a bit weird: child_length(tn) * - tn->empty_children is of course the number of non-null children * in the current node. tn->full_children is the number of "full" * children, that is non-null tnodes with a skip value of 0. * All of those will be doubled in the resulting inflated tnode, so * we just count them one extra time here. * * A clearer way to write this would be: * * to_be_doubled = tn->full_children; * not_to_be_doubled = child_length(tn) - tn->empty_children - * tn->full_children; * * new_child_length = child_length(tn) * 2; * * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / * new_child_length; * if (new_fill_factor >= inflate_threshold) * * ...and so on, tho it would mess up the while () loop. * * anyway, * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= * inflate_threshold * * avoid a division: * 100 * (not_to_be_doubled + 2*to_be_doubled) >= * inflate_threshold * new_child_length * * expand not_to_be_doubled and to_be_doubled, and shorten: * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= inflate_threshold * new_child_length * * expand new_child_length: * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= * inflate_threshold * child_length(tn) * 2 * * shorten again: * 50 * (tn->full_children + child_length(tn) - * tn->empty_children) >= inflate_threshold * * child_length(tn) * */ static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn) { unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold; used -= tn_info(tn)->empty_children; used += tn_info(tn)->full_children; /* if bits == KEYLENGTH then pos = 0, and will fail below */ return (used > 1) && tn->pos && ((50 * used) >= threshold); } static inline bool should_halve(struct key_vector *tp, struct key_vector *tn) { unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold; used -= tn_info(tn)->empty_children; /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */ return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold); } static inline bool should_collapse(struct key_vector *tn) { unsigned long used = child_length(tn); used -= tn_info(tn)->empty_children; /* account for bits == KEYLENGTH case */ if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children) used -= KEY_MAX; /* One child or none, time to drop us from the trie */ return used < 2; } #define MAX_WORK 10 static struct key_vector *resize(struct trie *t, struct key_vector *tn) { #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif struct key_vector *tp = node_parent(tn); unsigned long cindex = get_index(tn->key, tp); int max_work = MAX_WORK; pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", tn, inflate_threshold, halve_threshold); /* track the tnode via the pointer from the parent instead of * doing it ourselves. This way we can let RCU fully do its * thing without us interfering */ BUG_ON(tn != get_child(tp, cindex)); /* Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. */ while (should_inflate(tp, tn) && max_work) { tp = inflate(t, tn); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; tn = get_child(tp, cindex); } /* update parent in case inflate failed */ tp = node_parent(tn); /* Return if at least one inflate is run */ if (max_work != MAX_WORK) return tp; /* Halve as long as the number of empty children in this * node is above threshold. */ while (should_halve(tp, tn) && max_work) { tp = halve(t, tn); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; tn = get_child(tp, cindex); } /* Only one child remains */ if (should_collapse(tn)) return collapse(t, tn); /* update parent in case halve failed */ return node_parent(tn); } static void node_pull_suffix(struct key_vector *tn, unsigned char slen) { unsigned char node_slen = tn->slen; while ((node_slen > tn->pos) && (node_slen > slen)) { slen = update_suffix(tn); if (node_slen == slen) break; tn = node_parent(tn); node_slen = tn->slen; } } static void node_push_suffix(struct key_vector *tn, unsigned char slen) { while (tn->slen < slen) { tn->slen = slen; tn = node_parent(tn); } } /* rcu_read_lock needs to be hold by caller from readside */ static struct key_vector *fib_find_node(struct trie *t, struct key_vector **tp, u32 key) { struct key_vector *pn, *n = t->kv; unsigned long index = 0; do { pn = n; n = get_child_rcu(n, index); if (!n) break; index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the bits in the cindex. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex * * This check is safe even if bits == KEYLENGTH due to the * fact that we can only allocate a node with 32 bits if a * long is greater than 32 bits. */ if (index >= (1ul << n->bits)) { n = NULL; break; } /* keep searching until we find a perfect match leaf or NULL */ } while (IS_TNODE(n)); *tp = pn; return n; } /* Return the first fib alias matching DSCP with * priority less than or equal to PRIO. * If 'find_first' is set, return the first matching * fib alias, regardless of DSCP and priority. */ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, dscp_t dscp, u32 prio, u32 tb_id, bool find_first) { struct fib_alias *fa; if (!fah) return NULL; hlist_for_each_entry(fa, fah, fa_list) { /* Avoid Sparse warning when using dscp_t in inequalities */ u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp); u8 __dscp = inet_dscp_to_dsfield(dscp); if (fa->fa_slen < slen) continue; if (fa->fa_slen != slen) break; if (fa->tb_id > tb_id) continue; if (fa->tb_id != tb_id) break; if (find_first) return fa; if (__fa_dscp > __dscp) continue; if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp) return fa; } return NULL; } static struct fib_alias * fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) { u8 slen = KEYLENGTH - fri->dst_len; struct key_vector *l, *tp; struct fib_table *tb; struct fib_alias *fa; struct trie *t; tb = fib_get_table(net, fri->tb_id); if (!tb) return NULL; t = (struct trie *)tb->tb_data; l = fib_find_node(t, &tp, be32_to_cpu(fri->dst)); if (!l) return NULL; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { if (fa->fa_slen == slen && fa->tb_id == fri->tb_id && fa->fa_dscp == fri->dscp && fa->fa_info == fri->fi && fa->fa_type == fri->type) return fa; } return NULL; } void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) { u8 fib_notify_on_flag_change; struct fib_alias *fa_match; struct sk_buff *skb; int err; rcu_read_lock(); fa_match = fib_find_matching_alias(net, fri); if (!fa_match) goto out; /* These are paired with the WRITE_ONCE() happening in this function. * The reason is that we are only protected by RCU at this point. */ if (READ_ONCE(fa_match->offload) == fri->offload && READ_ONCE(fa_match->trap) == fri->trap && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload, fri->offload); WRITE_ONCE(fa_match->trap, fri->trap); fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change); /* 2 means send notifications only if offload_failed was changed. */ if (fib_notify_on_flag_change == 2 && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload_failed, fri->offload_failed); if (!fib_notify_on_flag_change) goto out; skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC); if (!skb) { err = -ENOBUFS; goto errout; } err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC); goto out; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err); out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set); static void trie_rebalance(struct trie *t, struct key_vector *tn) { while (!IS_TRIE(tn)) tn = resize(t, tn); } static int fib_insert_node(struct trie *t, struct key_vector *tp, struct fib_alias *new, t_key key) { struct key_vector *n, *l; l = leaf_new(key, new); if (!l) goto noleaf; /* retrieve child from parent node */ n = get_child(tp, get_index(key, tp)); /* Case 2: n is a LEAF or a TNODE and the key doesn't match. * * Add a new tnode here * first tnode need some special handling * leaves us in position for handling as case 3 */ if (n) { struct key_vector *tn; tn = tnode_new(key, __fls(key ^ n->key), 1); if (!tn) goto notnode; /* initialize routes out of node */ NODE_INIT_PARENT(tn, tp); put_child(tn, get_index(key, tn) ^ 1, n); /* start adding routes into the node */ put_child_root(tp, key, tn); node_set_parent(n, tn); /* parent now has a NULL spot where the leaf can go */ tp = tn; } /* Case 3: n is NULL, and will just insert a new leaf */ node_push_suffix(tp, new->fa_slen); NODE_INIT_PARENT(l, tp); put_child_root(tp, key, l); trie_rebalance(t, tp); return 0; notnode: node_free(l); noleaf: return -ENOMEM; } static int fib_insert_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *new, struct fib_alias *fa, t_key key) { if (!l) return fib_insert_node(t, tp, new, key); if (fa) { hlist_add_before_rcu(&new->fa_list, &fa->fa_list); } else { struct fib_alias *last; hlist_for_each_entry(last, &l->leaf, fa_list) { if (new->fa_slen < last->fa_slen) break; if ((new->fa_slen == last->fa_slen) && (new->tb_id > last->tb_id)) break; fa = last; } if (fa) hlist_add_behind_rcu(&new->fa_list, &fa->fa_list); else hlist_add_head_rcu(&new->fa_list, &l->leaf); } /* if we added to the tail node then we need to update slen */ if (l->slen < new->fa_slen) { l->slen = new->fa_slen; node_push_suffix(tp, new->fa_slen); } return 0; } static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) { if (plen > KEYLENGTH) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); return false; } if ((plen < KEYLENGTH) && (key << plen)) { NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); return false; } return true; } static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); /* Caller must hold RTNL. */ int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; u16 nlflags = NLM_F_EXCL; struct fib_info *fi; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; dscp_t dscp; u32 key; int err; key = ntohl(cfg->fc_dst); if (!fib_valid_key_len(key, plen, extack)) return -EINVAL; pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); if (IS_ERR(fi)) { err = PTR_ERR(fi); goto err; } dscp = cfg->fc_dscp; l = fib_find_node(t, &tp, key); fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority, tb->tb_id, false) : NULL; /* Now fa, if non-NULL, points to the first fib alias * with the same keys [prefix,dscp,priority], if such key already * exists or to the node before which we will insert new one. * * If fa is NULL, we will need to allocate a new one and * insert to the tail of the section matching the suffix length * of the new alias. */ if (fa && fa->fa_dscp == dscp && fa->fa_info->fib_priority == fi->fib_priority) { struct fib_alias *fa_first, *fa_match; err = -EEXIST; if (cfg->fc_nlflags & NLM_F_EXCL) goto out; nlflags &= ~NLM_F_EXCL; /* We have 2 goals: * 1. Find exact match for type, scope, fib_info to avoid * duplicate routes * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it */ fa_match = NULL; fa_first = fa; hlist_for_each_entry_from(fa, fa_list) { if ((fa->fa_slen != slen) || (fa->tb_id != tb->tb_id) || (fa->fa_dscp != dscp)) break; if (fa->fa_info->fib_priority != fi->fib_priority) break; if (fa->fa_type == cfg->fc_type && fa->fa_info == fi) { fa_match = fa; break; } } if (cfg->fc_nlflags & NLM_F_REPLACE) { struct fib_info *fi_drop; u8 state; nlflags |= NLM_F_REPLACE; fa = fa_first; if (fa_match) { if (fa == fa_match) err = 0; goto out; } err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; fi_drop = fa->fa_info; new_fa->fa_dscp = fa->fa_dscp; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; new_fa->offload_failed = 0; hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { enum fib_event_type fib_event; fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); if (err) { hlist_replace_rcu(&new_fa->fa_list, &fa->fa_list); goto out_free_new_fa; } } rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); alias_free_mem_rcu(fa); fib_release_info(fi_drop); if (state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); goto succeeded; } /* Error if we find a perfect match which * uses the same scope, type, and nexthop * information. */ if (fa_match) goto out; if (cfg->fc_nlflags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; else fa = fa_first; } err = -ENOENT; if (!(cfg->fc_nlflags & NLM_F_CREATE)) goto out; nlflags |= NLM_F_CREATE; err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; new_fa->fa_info = fi; new_fa->fa_dscp = dscp; new_fa->fa_type = cfg->fc_type; new_fa->fa_state = 0; new_fa->fa_slen = slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; new_fa->offload_failed = 0; /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) goto out_free_new_fa; /* The alias was already inserted, so the node must exist. */ l = l ? l : fib_find_node(t, &tp, key); if (WARN_ON_ONCE(!l)) { err = -ENOENT; goto out_free_new_fa; } if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { enum fib_event_type fib_event; fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); if (err) goto out_remove_new_fa; } if (!plen) tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: return 0; out_remove_new_fa: fib_remove_alias(t, tp, l, new_fa); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: fib_release_info(fi); err: return err; } static inline t_key prefix_mismatch(t_key key, struct key_vector *n) { t_key prefix = n->key; return (key ^ prefix) & (prefix | -prefix); } bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, const struct flowi4 *flp) { if (nhc->nhc_flags & RTNH_F_DEAD) return false; if (ip_ignore_linkdown(nhc->nhc_dev) && nhc->nhc_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) return false; if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif) return false; return true; } /* should be called with rcu_read_lock */ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { struct trie *t = (struct trie *) tb->tb_data; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif const t_key key = ntohl(flp->daddr); struct key_vector *n, *pn; struct fib_alias *fa; unsigned long index; t_key cindex; pn = t->kv; cindex = 0; n = get_child_rcu(pn, cindex); if (!n) { trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN); return -EAGAIN; } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->gets); #endif /* Step 1: Travel to the longest prefix match in the trie */ for (;;) { index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the "bits" in the prefix. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex * * This check is safe even if bits == KEYLENGTH due to the * fact that we can only allocate a node with 32 bits if a * long is greater than 32 bits. */ if (index >= (1ul << n->bits)) break; /* we have found a leaf. Prefixes have already been compared */ if (IS_LEAF(n)) goto found; /* only record pn and cindex if we are going to be chopping * bits later. Otherwise we are just wasting cycles. */ if (n->slen > n->pos) { pn = n; cindex = index; } n = get_child_rcu(n, index); if (unlikely(!n)) goto backtrace; } /* Step 2: Sort out leaves and begin backtracing for longest prefix */ for (;;) { /* record the pointer where our next node pointer is stored */ struct key_vector __rcu **cptr = n->tnode; /* This test verifies that none of the bits that differ * between the key and the prefix exist in the region of * the lsb and higher in the prefix. */ if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos)) goto backtrace; /* exit out and process leaf */ if (unlikely(IS_LEAF(n))) break; /* Don't bother recording parent info. Since we are in * prefix match mode we will have to come back to wherever * we started this traversal anyway */ while ((n = rcu_dereference(*cptr)) == NULL) { backtrace: #ifdef CONFIG_IP_FIB_TRIE_STATS if (!n) this_cpu_inc(stats->null_node_hit); #endif /* If we are at cindex 0 there are no more bits for * us to strip at this level so we must ascend back * up one level to see if there are any more bits to * be stripped there. */ while (!cindex) { t_key pkey = pn->key; /* If we don't have a parent then there is * nothing for us to do as we do not have any * further nodes to parse. */ if (IS_TRIE(pn)) { trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN); return -EAGAIN; } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->backtrack); #endif /* Get Child's index */ pn = node_parent_rcu(pn); cindex = get_index(pkey, pn); } /* strip the least significant bit from the cindex */ cindex &= cindex - 1; /* grab pointer for next child node */ cptr = &pn->tnode[cindex]; } } found: /* this line carries forward the xor from earlier in the function */ index = key ^ n->key; /* Step 3: Process the leaf, if that fails fall back to backtracing */ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; struct fib_nh_common *nhc; int nhsel, err; if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) { if (index >= (1ul << fa->fa_slen)) continue; } if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp)) continue; /* Paired with WRITE_ONCE() in fib_release_info() */ if (READ_ONCE(fi->fib_dead)) continue; if (fa->fa_info->fib_scope < flp->flowi4_scope) continue; fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (unlikely(err < 0)) { out_reject: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif trace_fib_table_lookup(tb->tb_id, flp, NULL, err); return err; } if (fi->fib_flags & RTNH_F_DEAD) continue; if (unlikely(fi->nh)) { if (nexthop_is_blackhole(fi->nh)) { err = fib_props[RTN_BLACKHOLE].error; goto out_reject; } nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp, &nhsel); if (nhc) goto set_result; goto miss; } for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { nhc = fib_info_nhc(fi, nhsel); if (!fib_lookup_good_nhc(nhc, fib_flags, flp)) continue; set_result: if (!(fib_flags & FIB_LOOKUP_NOREF)) refcount_inc(&fi->fib_clntref); res->prefix = htonl(n->key); res->prefixlen = KEYLENGTH - fa->fa_slen; res->nh_sel = nhsel; res->nhc = nhc; res->type = fa->fa_type; res->scope = fi->fib_scope; res->dscp = fa->fa_dscp; res->fi = fi; res->table = tb; res->fa_head = &n->leaf; #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif trace_fib_table_lookup(tb->tb_id, flp, nhc, err); return err; } } miss: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_miss); #endif goto backtrace; } EXPORT_SYMBOL_GPL(fib_table_lookup); static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old) { /* record the location of the previous list_info entry */ struct hlist_node **pprev = old->fa_list.pprev; struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next); /* remove the fib_alias from the list */ hlist_del_rcu(&old->fa_list); /* if we emptied the list this leaf will be freed and we can sort * out parent suffix lengths as a part of trie_rebalance */ if (hlist_empty(&l->leaf)) { if (tp->slen == l->slen) node_pull_suffix(tp, tp->pos); put_child_root(tp, l->key, NULL); node_free(l); trie_rebalance(t, tp); return; } /* only access fa if it is pointing at the last valid hlist_node */ if (*pprev) return; /* update the trie with the latest suffix length */ l->slen = fa->fa_slen; node_pull_suffix(tp, fa->fa_slen); } static void fib_notify_alias_delete(struct net *net, u32 key, struct hlist_head *fah, struct fib_alias *fa_to_delete, struct netlink_ext_ack *extack) { struct fib_alias *fa_next, *fa_to_notify; u32 tb_id = fa_to_delete->tb_id; u8 slen = fa_to_delete->fa_slen; enum fib_event_type fib_event; /* Do not notify if we do not care about the route. */ if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete) return; /* Determine if the route should be replaced by the next route in the * list. */ fa_next = hlist_entry_safe(fa_to_delete->fa_list.next, struct fib_alias, fa_list); if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) { fib_event = FIB_EVENT_ENTRY_REPLACE; fa_to_notify = fa_next; } else { fib_event = FIB_EVENT_ENTRY_DEL; fa_to_notify = fa_to_delete; } call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen, fa_to_notify, extack); } /* Caller must hold RTNL. */ int fib_table_delete(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *fa_to_delete; struct key_vector *l, *tp; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; dscp_t dscp; u32 key; key = ntohl(cfg->fc_dst); if (!fib_valid_key_len(key, plen, extack)) return -EINVAL; l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; dscp = cfg->fc_dscp; fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false); if (!fa) return -ESRCH; pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen, inet_dscp_to_dsfield(dscp), t); fa_to_delete = NULL; hlist_for_each_entry_from(fa, fa_list) { struct fib_info *fi = fa->fa_info; if ((fa->fa_slen != slen) || (fa->tb_id != tb->tb_id) || (fa->fa_dscp != dscp)) break; if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && (cfg->fc_scope == RT_SCOPE_NOWHERE || fa->fa_info->fib_scope == cfg->fc_scope) && (!cfg->fc_prefsrc || fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && fib_nh_match(net, cfg, fi, extack) == 0 && fib_metrics_match(cfg, fi)) { fa_to_delete = fa; break; } } if (!fa_to_delete) return -ESRCH; fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); if (!plen) tb->tb_num_default--; fib_remove_alias(t, tp, l, fa_to_delete); if (fa_to_delete->fa_state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); fib_release_info(fa_to_delete->fa_info); alias_free_mem_rcu(fa_to_delete); return 0; } /* Scan for the next leaf starting at the provided key value */ static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key) { struct key_vector *pn, *n = *tn; unsigned long cindex; /* this loop is meant to try and find the key in the trie */ do { /* record parent and next child index */ pn = n; cindex = (key > pn->key) ? get_index(key, pn) : 0; if (cindex >> pn->bits) break; /* descend into the next child */ n = get_child_rcu(pn, cindex++); if (!n) break; /* guarantee forward progress on the keys */ if (IS_LEAF(n) && (n->key >= key)) goto found; } while (IS_TNODE(n)); /* this loop will search for the next leaf with a greater key */ while (!IS_TRIE(pn)) { /* if we exhausted the parent node we will need to climb */ if (cindex >= (1ul << pn->bits)) { t_key pkey = pn->key; pn = node_parent_rcu(pn); cindex = get_index(pkey, pn) + 1; continue; } /* grab the next available node */ n = get_child_rcu(pn, cindex++); if (!n) continue; /* no need to compare keys since we bumped the index */ if (IS_LEAF(n)) goto found; /* Rescan start scanning in new node */ pn = n; cindex = 0; } *tn = pn; return NULL; /* Root of trie */ found: /* if we are at the limit for keys just return NULL for the tnode */ *tn = pn; return n; } static void fib_trie_free(struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; /* walk trie in reverse order and free everything */ for (;;) { struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; if (IS_TRIE(pn)) break; n = pn; pn = node_parent(pn); /* drop emptied tnode */ put_child_root(pn, n->key, NULL); node_free(n); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { hlist_del_rcu(&fa->fa_list); alias_free_mem_rcu(fa); } put_child_root(pn, n->key, NULL); node_free(n); } #ifdef CONFIG_IP_FIB_TRIE_STATS free_percpu(t->stats); #endif kfree(tb); } struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) { struct trie *ot = (struct trie *)oldtb->tb_data; struct key_vector *l, *tp = ot->kv; struct fib_table *local_tb; struct fib_alias *fa; struct trie *lt; t_key key = 0; if (oldtb->tb_data == oldtb->__data) return oldtb; local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL); if (!local_tb) return NULL; lt = (struct trie *)local_tb->tb_data; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { struct key_vector *local_l = NULL, *local_tp; hlist_for_each_entry(fa, &l->leaf, fa_list) { struct fib_alias *new_fa; if (local_tb->tb_id != fa->tb_id) continue; /* clone fa for new local table */ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; memcpy(new_fa, fa, sizeof(*fa)); /* insert clone into table */ if (!local_l) local_l = fib_find_node(lt, &local_tp, l->key); if (fib_insert_alias(lt, local_tp, local_l, new_fa, NULL, l->key)) { kmem_cache_free(fn_alias_kmem, new_fa); goto out; } } /* stop loop if key wrapped back to 0 */ key = l->key + 1; if (key < l->key) break; } return local_tb; out: fib_trie_free(local_tb); return NULL; } /* Caller must hold RTNL */ void fib_table_flush_external(struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; /* walk trie in reverse order */ for (;;) { unsigned char slen = 0; struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; /* cannot resize the trie vector */ if (IS_TRIE(pn)) break; /* update the suffix to address pulled leaves */ if (pn->slen > pn->pos) update_suffix(pn); /* resize completed node */ pn = resize(t, pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { /* if alias was cloned to local then we just * need to remove the local copy from main */ if (tb->tb_id != fa->tb_id) { hlist_del_rcu(&fa->fa_list); alias_free_mem_rcu(fa); continue; } /* record local slen */ slen = fa->fa_slen; } /* update leaf slen */ n->slen = slen; if (hlist_empty(&n->leaf)) { put_child_root(pn, n->key, NULL); node_free(n); } } } /* Caller must hold RTNL. */ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) { struct trie *t = (struct trie *)tb->tb_data; struct nl_info info = { .nl_net = net }; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; int found = 0; /* walk trie in reverse order */ for (;;) { unsigned char slen = 0; struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; /* cannot resize the trie vector */ if (IS_TRIE(pn)) break; /* update the suffix to address pulled leaves */ if (pn->slen > pn->pos) update_suffix(pn); /* resize completed node */ pn = resize(t, pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi || tb->tb_id != fa->tb_id || (!(fi->fib_flags & RTNH_F_DEAD) && !fib_props[fa->fa_type].error)) { slen = fa->fa_slen; continue; } /* Do not flush error routes if network namespace is * not being dismantled */ if (!flush_all && fib_props[fa->fa_type].error) { slen = fa->fa_slen; continue; } fib_notify_alias_delete(net, n->key, &n->leaf, fa, NULL); if (fi->pfsrc_removed) rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); found++; } /* update leaf slen */ n->slen = slen; if (hlist_empty(&n->leaf)) { put_child_root(pn, n->key, NULL); node_free(n); } } pr_debug("trie_flush found=%d\n", found); return found; } /* derived from fib_trie_free */ static void __fib_info_notify_update(struct net *net, struct fib_table *tb, struct nl_info *info) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct fib_alias *fa; for (;;) { struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; if (IS_TRIE(pn)) break; pn = node_parent(pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id) continue; rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, info, NLM_F_REPLACE); } } } void fib_info_notify_update(struct net *net, struct nl_info *info) { unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) __fib_info_notify_update(net, tb, info); } } static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib_alias *fa; int last_slen = -1; int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi) continue; /* local and main table can share the same trie, * so don't notify twice for the same entry. */ if (tb->tb_id != fa->tb_id) continue; if (fa->fa_slen == last_slen) continue; last_slen = fa->fa_slen; err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE, l->key, KEYLENGTH - fa->fa_slen, fa, extack); if (err) return err; } return 0; } static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { err = fib_leaf_notify(l, tb, nb, extack); if (err) return err; key = l->key + 1; /* stop in case of wrap around */ if (key < l->key) break; } return 0; } int fib_notify(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { unsigned int h; int err; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { err = fib_table_notify(tb, nb, extack); if (err) return err; } } return 0; } static void __trie_free_rcu(struct rcu_head *head) { struct fib_table *tb = container_of(head, struct fib_table, rcu); #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie *t = (struct trie *)tb->tb_data; if (tb->tb_data == tb->__data) free_percpu(t->stats); #endif /* CONFIG_IP_FIB_TRIE_STATS */ kfree(tb); } void fib_free_table(struct fib_table *tb) { call_rcu(&tb->rcu, __trie_free_rcu); } static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter) { unsigned int flags = NLM_F_MULTI; __be32 xkey = htonl(l->key); int i, s_i, i_fa, s_fa, err; struct fib_alias *fa; if (filter->filter_set || !filter->dump_exceptions || !filter->dump_routes) flags |= NLM_F_DUMP_FILTERED; s_i = cb->args[4]; s_fa = cb->args[5]; i = 0; /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (i < s_i) goto next; i_fa = 0; if (tb->tb_id != fa->tb_id) goto next; if (filter->filter_set) { if (filter->rt_type && fa->fa_type != filter->rt_type) goto next; if ((filter->protocol && fi->fib_protocol != filter->protocol)) goto next; if (filter->dev && !fib_info_nh_uses_dev(fi, filter->dev)) goto next; } if (filter->dump_routes) { if (!s_fa) { struct fib_rt_info fri; fri.fi = fi; fri.tb_id = tb->tb_id; fri.dst = xkey; fri.dst_len = KEYLENGTH - fa->fa_slen; fri.dscp = fa->fa_dscp; fri.type = fa->fa_type; fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, &fri, flags); if (err < 0) goto stop; } i_fa++; } if (filter->dump_exceptions) { err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi, &i_fa, s_fa, flags); if (err < 0) goto stop; } next: i++; } cb->args[4] = i; return skb->len; stop: cb->args[4] = i; cb->args[5] = i_fa; return err; } /* rcu_read_lock needs to be hold by caller from readside */ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; /* Dump starting at last key. * Note: 0.0.0.0/0 (ie default) is first key. */ int count = cb->args[2]; t_key key = cb->args[3]; /* First time here, count and key are both always 0. Count > 0 * and key == 0 means the dump has wrapped around and we are done. */ if (count && !key) return 0; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { int err; err = fn_trie_dump_leaf(l, tb, skb, cb, filter); if (err < 0) { cb->args[3] = key; cb->args[2] = count; return err; } ++count; key = l->key + 1; memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0])); /* stop loop if key wrapped back to 0 */ if (key < l->key) break; } cb->args[3] = key; cb->args[2] = count; return 0; } void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", LEAF_SIZE, 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); } struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) { struct fib_table *tb; struct trie *t; size_t sz = sizeof(*tb); if (!alias) sz += sizeof(struct trie); tb = kzalloc(sz, GFP_KERNEL); if (!tb) return NULL; tb->tb_id = id; tb->tb_num_default = 0; tb->tb_data = (alias ? alias->__data : tb->__data); if (alias) return tb; t = (struct trie *) tb->tb_data; t->kv[0].pos = KEYLENGTH; t->kv[0].slen = KEYLENGTH; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats = alloc_percpu(struct trie_use_stats); if (!t->stats) { kfree(tb); tb = NULL; } #endif return tb; } #ifdef CONFIG_PROC_FS /* Depth first Trie walk iterator */ struct fib_trie_iter { struct seq_net_private p; struct fib_table *tb; struct key_vector *tnode; unsigned int index; unsigned int depth; }; static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter) { unsigned long cindex = iter->index; struct key_vector *pn = iter->tnode; t_key pkey; pr_debug("get_next iter={node=%p index=%d depth=%d}\n", iter->tnode, iter->index, iter->depth); while (!IS_TRIE(pn)) { while (cindex < child_length(pn)) { struct key_vector *n = get_child_rcu(pn, cindex++); if (!n) continue; if (IS_LEAF(n)) { iter->tnode = pn; iter->index = cindex; } else { /* push down one level */ iter->tnode = n; iter->index = 0; ++iter->depth; } return n; } /* Current node exhausted, pop back up */ pkey = pn->key; pn = node_parent_rcu(pn); cindex = get_index(pkey, pn) + 1; --iter->depth; } /* record root node so further searches know we are done */ iter->tnode = pn; iter->index = 0; return NULL; } static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { struct key_vector *n, *pn; if (!t) return NULL; pn = t->kv; n = rcu_dereference(pn->tnode[0]); if (!n) return NULL; if (IS_TNODE(n)) { iter->tnode = n; iter->index = 0; iter->depth = 1; } else { iter->tnode = pn; iter->index = 0; iter->depth = 0; } return n; } static void trie_collect_stats(struct trie *t, struct trie_stat *s) { struct key_vector *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); rcu_read_lock(); for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { if (IS_LEAF(n)) { struct fib_alias *fa; s->leaves++; s->totdepth += iter.depth; if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) ++s->prefixes; } else { s->tnodes++; if (n->bits < MAX_STAT_DEPTH) s->nodesizes[n->bits]++; s->nullpointers += tn_info(n)->empty_children; } } rcu_read_unlock(); } /* * This outputs /proc/net/fib_triestats */ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) { unsigned int i, max, pointers, bytes, avdepth; if (stat->leaves) avdepth = stat->totdepth*100 / stat->leaves; else avdepth = 0; seq_printf(seq, "\tAver depth: %u.%02d\n", avdepth / 100, avdepth % 100); seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); bytes = LEAF_SIZE * stat->leaves; seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); bytes += sizeof(struct fib_alias) * stat->prefixes; seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); bytes += TNODE_SIZE(0) * stat->tnodes; max = MAX_STAT_DEPTH; while (max > 0 && stat->nodesizes[max-1] == 0) max--; pointers = 0; for (i = 1; i < max; i++) if (stat->nodesizes[i] != 0) { seq_printf(seq, " %u: %u", i, stat->nodesizes[i]); pointers += (1<<i) * stat->nodesizes[i]; } seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); bytes += sizeof(struct key_vector *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } #ifdef CONFIG_IP_FIB_TRIE_STATS static void trie_show_usage(struct seq_file *seq, const struct trie_use_stats __percpu *stats) { struct trie_use_stats s = { 0 }; int cpu; /* loop through all of the CPUs and gather up the stats */ for_each_possible_cpu(cpu) { const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu); s.gets += pcpu->gets; s.backtrack += pcpu->backtrack; s.semantic_match_passed += pcpu->semantic_match_passed; s.semantic_match_miss += pcpu->semantic_match_miss; s.null_node_hit += pcpu->null_node_hit; s.resize_node_skipped += pcpu->resize_node_skipped; } seq_printf(seq, "\nCounters:\n---------\n"); seq_printf(seq, "gets = %u\n", s.gets); seq_printf(seq, "backtracks = %u\n", s.backtrack); seq_printf(seq, "semantic match passed = %u\n", s.semantic_match_passed); seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss); seq_printf(seq, "null node hit= %u\n", s.null_node_hit); seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped); } #endif /* CONFIG_IP_FIB_TRIE_STATS */ static void fib_table_print(struct seq_file *seq, struct fib_table *tb) { if (tb->tb_id == RT_TABLE_LOCAL) seq_puts(seq, "Local:\n"); else if (tb->tb_id == RT_TABLE_MAIN) seq_puts(seq, "Main:\n"); else seq_printf(seq, "Id %d:\n", tb->tb_id); } static int fib_triestat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; unsigned int h; seq_printf(seq, "Basic info: size of leaf:" " %zd bytes, size of tnode: %zd bytes.\n", LEAF_SIZE, TNODE_SIZE(0)); rcu_read_lock(); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct trie *t = (struct trie *) tb->tb_data; struct trie_stat stat; if (!t) continue; fib_table_print(seq, tb); trie_collect_stats(t, &stat); trie_show_stats(seq, &stat); #ifdef CONFIG_IP_FIB_TRIE_STATS trie_show_usage(seq, t->stats); #endif } cond_resched_rcu(); } rcu_read_unlock(); return 0; } static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); loff_t idx = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct key_vector *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); n; n = fib_trie_get_next(iter)) if (pos == idx++) { iter->tb = tb; return n; } } } return NULL; } static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return fib_trie_get_idx(seq, *pos); } static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; struct key_vector *n; ++*pos; /* next node in same table */ n = fib_trie_get_next(iter); if (n) return n; /* walk rest of this hash chain */ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) { tb = hlist_entry(tb_node, struct fib_table, tb_hlist); n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; } /* new hash chain */ while (++h < FIB_TABLE_HASHSZ) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; } } return NULL; found: iter->tb = tb; return n; } static void fib_trie_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void seq_indent(struct seq_file *seq, int n) { while (n-- > 0) seq_puts(seq, " "); } static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) { switch (s) { case RT_SCOPE_UNIVERSE: return "universe"; case RT_SCOPE_SITE: return "site"; case RT_SCOPE_LINK: return "link"; case RT_SCOPE_HOST: return "host"; case RT_SCOPE_NOWHERE: return "nowhere"; default: snprintf(buf, len, "scope=%d", s); return buf; } } static const char *const rtn_type_names[__RTN_MAX] = { [RTN_UNSPEC] = "UNSPEC", [RTN_UNICAST] = "UNICAST", [RTN_LOCAL] = "LOCAL", [RTN_BROADCAST] = "BROADCAST", [RTN_ANYCAST] = "ANYCAST", [RTN_MULTICAST] = "MULTICAST", [RTN_BLACKHOLE] = "BLACKHOLE", [RTN_UNREACHABLE] = "UNREACHABLE", [RTN_PROHIBIT] = "PROHIBIT", [RTN_THROW] = "THROW", [RTN_NAT] = "NAT", [RTN_XRESOLVE] = "XRESOLVE", }; static inline const char *rtn_type(char *buf, size_t len, unsigned int t) { if (t < __RTN_MAX && rtn_type_names[t]) return rtn_type_names[t]; snprintf(buf, len, "type %u", t); return buf; } /* Pretty print the trie */ static int fib_trie_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; struct key_vector *n = v; if (IS_TRIE(node_parent_rcu(n))) fib_table_print(seq, iter->tb); if (IS_TNODE(n)) { __be32 prf = htonl(n->key); seq_indent(seq, iter->depth-1); seq_printf(seq, " +-- %pI4/%zu %u %u %u\n", &prf, KEYLENGTH - n->pos - n->bits, n->bits, tn_info(n)->full_children, tn_info(n)->empty_children); } else { __be32 val = htonl(n->key); struct fib_alias *fa; seq_indent(seq, iter->depth); seq_printf(seq, " |-- %pI4\n", &val); hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { char buf1[32], buf2[32]; seq_indent(seq, iter->depth + 1); seq_printf(seq, " /%zu %s %s", KEYLENGTH - fa->fa_slen, rtn_scope(buf1, sizeof(buf1), fa->fa_info->fib_scope), rtn_type(buf2, sizeof(buf2), fa->fa_type)); if (fa->fa_dscp) seq_printf(seq, " tos=%d", inet_dscp_to_dsfield(fa->fa_dscp)); seq_putc(seq, '\n'); } } return 0; } static const struct seq_operations fib_trie_seq_ops = { .start = fib_trie_seq_start, .next = fib_trie_seq_next, .stop = fib_trie_seq_stop, .show = fib_trie_seq_show, }; struct fib_route_iter { struct seq_net_private p; struct fib_table *main_tb; struct key_vector *tnode; loff_t pos; t_key key; }; static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) { struct key_vector *l, **tp = &iter->tnode; t_key key; /* use cached location of previously found key */ if (iter->pos > 0 && pos >= iter->pos) { key = iter->key; } else { iter->pos = 1; key = 0; } pos -= iter->pos; while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) { key = l->key + 1; iter->pos++; l = NULL; /* handle unlikely case of a key wrap */ if (!key) break; } if (l) iter->key = l->key; /* remember it */ else iter->pos = 0; /* forget it */ return l; } static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct fib_route_iter *iter = seq->private; struct fib_table *tb; struct trie *t; rcu_read_lock(); tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); if (!tb) return NULL; iter->main_tb = tb; t = (struct trie *)tb->tb_data; iter->tnode = t->kv; if (*pos != 0) return fib_route_get_idx(iter, *pos); iter->pos = 0; iter->key = KEY_MAX; return SEQ_START_TOKEN; } static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_route_iter *iter = seq->private; struct key_vector *l = NULL; t_key key = iter->key + 1; ++*pos; /* only allow key of 0 for start of sequence */ if ((v == SEQ_START_TOKEN) || key) l = leaf_walk_rcu(&iter->tnode, key); if (l) { iter->key = l->key; iter->pos++; } else { iter->pos = 0; } return l; } static void fib_route_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi) { unsigned int flags = 0; if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) flags = RTF_REJECT; if (fi) { const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); if (nhc->nhc_gw.ipv4) flags |= RTF_GATEWAY; } if (mask == htonl(0xFFFFFFFF)) flags |= RTF_HOST; flags |= RTF_UP; return flags; } /* * This outputs /proc/net/route. * The format of the file is not supposed to be changed * and needs to be same as fib_hash output to avoid breaking * legacy utilities */ static int fib_route_seq_show(struct seq_file *seq, void *v) { struct fib_route_iter *iter = seq->private; struct fib_table *tb = iter->main_tb; struct fib_alias *fa; struct key_vector *l = v; __be32 prefix; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" "\tWindow\tIRTT"); return 0; } prefix = htonl(l->key); hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); if ((fa->fa_type == RTN_BROADCAST) || (fa->fa_type == RTN_MULTICAST)) continue; if (fa->tb_id != tb->tb_id) continue; seq_setwidth(seq, 127); if (fi) { struct fib_nh_common *nhc = fib_info_nhc(fi, 0); __be32 gw = 0; if (nhc->nhc_gw_family == AF_INET) gw = nhc->nhc_gw.ipv4; seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" "%d\t%08X\t%d\t%u\t%u", nhc->nhc_dev ? nhc->nhc_dev->name : "*", prefix, gw, flags, 0, 0, fi->fib_priority, mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" "%d\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); } seq_pad(seq, '\n'); } return 0; } static const struct seq_operations fib_route_seq_ops = { .start = fib_route_seq_start, .next = fib_route_seq_next, .stop = fib_route_seq_stop, .show = fib_route_seq_show, }; int __net_init fib_proc_init(struct net *net) { if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops, sizeof(struct fib_trie_iter))) goto out1; if (!proc_create_net_single("fib_triestat", 0444, net->proc_net, fib_triestat_seq_show, NULL)) goto out2; if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops, sizeof(struct fib_route_iter))) goto out3; return 0; out3: remove_proc_entry("fib_triestat", net->proc_net); out2: remove_proc_entry("fib_trie", net->proc_net); out1: return -ENOMEM; } void __net_exit fib_proc_exit(struct net *net) { remove_proc_entry("fib_trie", net->proc_net); remove_proc_entry("fib_triestat", net->proc_net); remove_proc_entry("route", net->proc_net); } #endif /* CONFIG_PROC_FS */
1089 1090 3 1 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 // SPDX-License-Identifier: GPL-2.0-only /* * * Authors: * (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de> */ #include <linux/if_arp.h> #include <linux/module.h> #include <net/6lowpan.h> #include <net/addrconf.h> #include "6lowpan_i.h" int lowpan_register_netdevice(struct net_device *dev, enum lowpan_lltypes lltype) { int i, ret; switch (lltype) { case LOWPAN_LLTYPE_IEEE802154: dev->addr_len = EUI64_ADDR_LEN; break; case LOWPAN_LLTYPE_BTLE: dev->addr_len = ETH_ALEN; break; } dev->type = ARPHRD_6LOWPAN; dev->mtu = IPV6_MIN_MTU; lowpan_dev(dev)->lltype = lltype; spin_lock_init(&lowpan_dev(dev)->ctx.lock); for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) lowpan_dev(dev)->ctx.table[i].id = i; dev->ndisc_ops = &lowpan_ndisc_ops; ret = register_netdevice(dev); if (ret < 0) return ret; lowpan_dev_debugfs_init(dev); return ret; } EXPORT_SYMBOL(lowpan_register_netdevice); int lowpan_register_netdev(struct net_device *dev, enum lowpan_lltypes lltype) { int ret; rtnl_lock(); ret = lowpan_register_netdevice(dev, lltype); rtnl_unlock(); return ret; } EXPORT_SYMBOL(lowpan_register_netdev); void lowpan_unregister_netdevice(struct net_device *dev) { unregister_netdevice(dev); lowpan_dev_debugfs_exit(dev); } EXPORT_SYMBOL(lowpan_unregister_netdevice); void lowpan_unregister_netdev(struct net_device *dev) { rtnl_lock(); lowpan_unregister_netdevice(dev); rtnl_unlock(); } EXPORT_SYMBOL(lowpan_unregister_netdev); int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev) { struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; /* Set short_addr autoconfiguration if short_addr is present only */ if (!lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) return -1; /* For either address format, all zero addresses MUST NOT be used */ if (wpan_dev->pan_id == cpu_to_le16(0x0000) && wpan_dev->short_addr == cpu_to_le16(0x0000)) return -1; /* Alternatively, if no PAN ID is known, 16 zero bits may be used */ if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) memset(eui, 0, 2); else ieee802154_le16_to_be16(eui, &wpan_dev->pan_id); /* The "Universal/Local" (U/L) bit shall be set to zero */ eui[0] &= ~2; eui[2] = 0; eui[3] = 0xFF; eui[4] = 0xFE; eui[5] = 0; ieee802154_le16_to_be16(&eui[6], &wpan_dev->short_addr); return 0; } static int lowpan_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct inet6_dev *idev; struct in6_addr addr; int i; if (dev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; idev = __in6_dev_get(dev); if (!idev) return NOTIFY_DONE; switch (event) { case NETDEV_UP: case NETDEV_CHANGE: /* (802.15.4 6LoWPAN short address slaac handling */ if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && addrconf_ifid_802154_6lowpan(addr.s6_addr + 8, dev) == 0) { __ipv6_addr_set_half(&addr.s6_addr32[0], htonl(0xFE800000), 0); addrconf_add_linklocal(idev, &addr, 0); } break; case NETDEV_DOWN: for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &lowpan_dev(dev)->ctx.table[i].flags); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static struct notifier_block lowpan_notifier = { .notifier_call = lowpan_event, }; static int __init lowpan_module_init(void) { int ret; lowpan_debugfs_init(); ret = register_netdevice_notifier(&lowpan_notifier); if (ret < 0) { lowpan_debugfs_exit(); return ret; } request_module_nowait("nhc_dest"); request_module_nowait("nhc_fragment"); request_module_nowait("nhc_hop"); request_module_nowait("nhc_ipv6"); request_module_nowait("nhc_mobility"); request_module_nowait("nhc_routing"); request_module_nowait("nhc_udp"); return 0; } static void __exit lowpan_module_exit(void) { lowpan_debugfs_exit(); unregister_netdevice_notifier(&lowpan_notifier); } module_init(lowpan_module_init); module_exit(lowpan_module_exit); MODULE_DESCRIPTION("IPv6 over Low-Power Wireless Personal Area Network core module"); MODULE_LICENSE("GPL");
34 34 1631 217 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sched #if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H #include <linux/kthread.h> #include <linux/sched/numa_balancing.h> #include <linux/tracepoint.h> #include <linux/binfmts.h> /* * Tracepoint for calling kthread_stop, performed to end a kthread: */ TRACE_EVENT(sched_kthread_stop, TP_PROTO(struct task_struct *t), TP_ARGS(t), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) ), TP_fast_assign( memcpy(__entry->comm, t->comm, TASK_COMM_LEN); __entry->pid = t->pid; ), TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid) ); /* * Tracepoint for the return value of the kthread stopping: */ TRACE_EVENT(sched_kthread_stop_ret, TP_PROTO(int ret), TP_ARGS(ret), TP_STRUCT__entry( __field( int, ret ) ), TP_fast_assign( __entry->ret = ret; ), TP_printk("ret=%d", __entry->ret) ); /** * sched_kthread_work_queue_work - called when a work gets queued * @worker: pointer to the kthread_worker * @work: pointer to struct kthread_work * * This event occurs when a work is queued immediately or once a * delayed work is actually queued (ie: once the delay has been * reached). */ TRACE_EVENT(sched_kthread_work_queue_work, TP_PROTO(struct kthread_worker *worker, struct kthread_work *work), TP_ARGS(worker, work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) __field( void *, worker) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; __entry->worker = worker; ), TP_printk("work struct=%p function=%ps worker=%p", __entry->work, __entry->function, __entry->worker) ); /** * sched_kthread_work_execute_start - called immediately before the work callback * @work: pointer to struct kthread_work * * Allows to track kthread work execution. */ TRACE_EVENT(sched_kthread_work_execute_start, TP_PROTO(struct kthread_work *work), TP_ARGS(work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /** * sched_kthread_work_execute_end - called immediately after the work callback * @work: pointer to struct work_struct * @function: pointer to worker function * * Allows to track workqueue execution. */ TRACE_EVENT(sched_kthread_work_execute_end, TP_PROTO(struct kthread_work *work, kthread_work_func_t function), TP_ARGS(work, function), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = function; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /* * Tracepoint for waking up a task: */ DECLARE_EVENT_CLASS(sched_wakeup_template, TP_PROTO(struct task_struct *p), TP_ARGS(__perf_task(p)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( int, target_cpu ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->target_cpu = task_cpu(p); ), TP_printk("comm=%s pid=%d prio=%d target_cpu=%03d", __entry->comm, __entry->pid, __entry->prio, __entry->target_cpu) ); /* * Tracepoint called when waking a task; this tracepoint is guaranteed to be * called from the waking context. */ DEFINE_EVENT(sched_wakeup_template, sched_waking, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING. * It is not always called from the waking context. */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for waking up a new task: */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, TP_PROTO(struct task_struct *p), TP_ARGS(p)); #ifdef CREATE_TRACE_POINTS static inline long __trace_sched_switch_state(bool preempt, unsigned int prev_state, struct task_struct *p) { unsigned int state; #ifdef CONFIG_SCHED_DEBUG BUG_ON(p != current); #endif /* CONFIG_SCHED_DEBUG */ /* * Preemption ignores task state, therefore preempted tasks are always * RUNNING (we will not have dequeued if state != RUNNING). */ if (preempt) return TASK_REPORT_MAX; /* * task_state_index() uses fls() and returns a value from 0-8 range. * Decrement it by 1 (except TASK_RUNNING state i.e 0) before using * it for left shift operation to get the correct task->state * mapping. */ state = __task_state_index(prev_state, p->exit_state); return state ? (1 << (state - 1)) : state; } #endif /* CREATE_TRACE_POINTS */ /* * Tracepoint for task switches, performed by the scheduler: */ TRACE_EVENT(sched_switch, TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state), TP_ARGS(preempt, prev, next, prev_state), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) __field( pid_t, prev_pid ) __field( int, prev_prio ) __field( long, prev_state ) __array( char, next_comm, TASK_COMM_LEN ) __field( pid_t, next_pid ) __field( int, next_prio ) ), TP_fast_assign( memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", { TASK_INTERRUPTIBLE, "S" }, { TASK_UNINTERRUPTIBLE, "D" }, { __TASK_STOPPED, "T" }, { __TASK_TRACED, "t" }, { EXIT_DEAD, "X" }, { EXIT_ZOMBIE, "Z" }, { TASK_PARKED, "P" }, { TASK_DEAD, "I" }) : "R", __entry->prev_state & TASK_REPORT_MAX ? "+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); /* * Tracepoint for a task being migrated: */ TRACE_EVENT(sched_migrate_task, TP_PROTO(struct task_struct *p, int dest_cpu), TP_ARGS(p, dest_cpu), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( int, orig_cpu ) __field( int, dest_cpu ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; ), TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d", __entry->comm, __entry->pid, __entry->prio, __entry->orig_cpu, __entry->dest_cpu) ); DECLARE_EVENT_CLASS(sched_process_template, TP_PROTO(struct task_struct *p), TP_ARGS(p), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", __entry->comm, __entry->pid, __entry->prio) ); /* * Tracepoint for freeing a task: */ DEFINE_EVENT(sched_process_template, sched_process_free, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for a task exiting: */ DEFINE_EVENT(sched_process_template, sched_process_exit, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for waiting on task to unschedule: */ DEFINE_EVENT(sched_process_template, sched_wait_task, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for a waiting task: */ TRACE_EVENT(sched_process_wait, TP_PROTO(struct pid *pid), TP_ARGS(pid), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) ), TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); __entry->pid = pid_nr(pid); __entry->prio = current->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", __entry->comm, __entry->pid, __entry->prio) ); /* * Tracepoint for kernel_clone: */ TRACE_EVENT(sched_process_fork, TP_PROTO(struct task_struct *parent, struct task_struct *child), TP_ARGS(parent, child), TP_STRUCT__entry( __array( char, parent_comm, TASK_COMM_LEN ) __field( pid_t, parent_pid ) __array( char, child_comm, TASK_COMM_LEN ) __field( pid_t, child_pid ) ), TP_fast_assign( memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); __entry->parent_pid = parent->pid; memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); __entry->child_pid = child->pid; ), TP_printk("comm=%s pid=%d child_comm=%s child_pid=%d", __entry->parent_comm, __entry->parent_pid, __entry->child_comm, __entry->child_pid) ); /* * Tracepoint for exec: */ TRACE_EVENT(sched_process_exec, TP_PROTO(struct task_struct *p, pid_t old_pid, struct linux_binprm *bprm), TP_ARGS(p, old_pid, bprm), TP_STRUCT__entry( __string( filename, bprm->filename ) __field( pid_t, pid ) __field( pid_t, old_pid ) ), TP_fast_assign( __assign_str(filename); __entry->pid = p->pid; __entry->old_pid = old_pid; ), TP_printk("filename=%s pid=%d old_pid=%d", __get_str(filename), __entry->pid, __entry->old_pid) ); /** * sched_prepare_exec - called before setting up new exec * @task: pointer to the current task * @bprm: pointer to linux_binprm used for new exec * * Called before flushing the old exec, where @task is still unchanged, but at * the point of no return during switching to the new exec. At the point it is * called the exec will either succeed, or on failure terminate the task. Also * see the "sched_process_exec" tracepoint, which is called right after @task * has successfully switched to the new exec. */ TRACE_EVENT(sched_prepare_exec, TP_PROTO(struct task_struct *task, struct linux_binprm *bprm), TP_ARGS(task, bprm), TP_STRUCT__entry( __string( interp, bprm->interp ) __string( filename, bprm->filename ) __field( pid_t, pid ) __string( comm, task->comm ) ), TP_fast_assign( __assign_str(interp); __assign_str(filename); __entry->pid = task->pid; __assign_str(comm); ), TP_printk("interp=%s filename=%s pid=%d comm=%s", __get_str(interp), __get_str(filename), __entry->pid, __get_str(comm)) ); #ifdef CONFIG_SCHEDSTATS #define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT #define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS #else #define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT_NOP #define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS_NOP #endif /* * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE * adding sched_stat support to SCHED_FIFO/RR would be welcome. */ DECLARE_EVENT_CLASS_SCHEDSTAT(sched_stat_template, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(__perf_task(tsk), __perf_count(delay)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( u64, delay ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->delay = delay; ), TP_printk("comm=%s pid=%d delay=%Lu [ns]", __entry->comm, __entry->pid, (unsigned long long)__entry->delay) ); /* * Tracepoint for accounting wait time (time the task is runnable * but not actually running due to scheduler contention). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_wait, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting sleep time (time the task is not runnable, * including iowait, see below). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_sleep, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting iowait time (time the task is not runnable * due to waiting on IO to complete). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_iowait, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting blocked time (time the task is in uninterruptible). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting runtime (time the task is executing * on a CPU). */ DECLARE_EVENT_CLASS(sched_stat_runtime, TP_PROTO(struct task_struct *tsk, u64 runtime), TP_ARGS(tsk, __perf_count(runtime)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( u64, runtime ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->runtime = runtime; ), TP_printk("comm=%s pid=%d runtime=%Lu [ns]", __entry->comm, __entry->pid, (unsigned long long)__entry->runtime) ); DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime, TP_PROTO(struct task_struct *tsk, u64 runtime), TP_ARGS(tsk, runtime)); /* * Tracepoint for showing priority inheritance modifying a tasks * priority. */ TRACE_EVENT(sched_pi_setprio, TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task), TP_ARGS(tsk, pi_task), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, oldprio ) __field( int, newprio ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->oldprio = tsk->prio; __entry->newprio = pi_task ? min(tsk->normal_prio, pi_task->prio) : tsk->normal_prio; /* XXX SCHED_DEADLINE bits missing */ ), TP_printk("comm=%s pid=%d oldprio=%d newprio=%d", __entry->comm, __entry->pid, __entry->oldprio, __entry->newprio) ); #ifdef CONFIG_DETECT_HUNG_TASK TRACE_EVENT(sched_process_hang, TP_PROTO(struct task_struct *tsk), TP_ARGS(tsk), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; ), TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid) ); #endif /* CONFIG_DETECT_HUNG_TASK */ /* * Tracks migration of tasks from one runqueue to another. Can be used to * detect if automatic NUMA balancing is bouncing between nodes. */ TRACE_EVENT(sched_move_numa, TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), TP_ARGS(tsk, src_cpu, dst_cpu), TP_STRUCT__entry( __field( pid_t, pid ) __field( pid_t, tgid ) __field( pid_t, ngid ) __field( int, src_cpu ) __field( int, src_nid ) __field( int, dst_cpu ) __field( int, dst_nid ) ), TP_fast_assign( __entry->pid = task_pid_nr(tsk); __entry->tgid = task_tgid_nr(tsk); __entry->ngid = task_numa_group_id(tsk); __entry->src_cpu = src_cpu; __entry->src_nid = cpu_to_node(src_cpu); __entry->dst_cpu = dst_cpu; __entry->dst_nid = cpu_to_node(dst_cpu); ), TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d", __entry->pid, __entry->tgid, __entry->ngid, __entry->src_cpu, __entry->src_nid, __entry->dst_cpu, __entry->dst_nid) ); DECLARE_EVENT_CLASS(sched_numa_pair_template, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu), TP_STRUCT__entry( __field( pid_t, src_pid ) __field( pid_t, src_tgid ) __field( pid_t, src_ngid ) __field( int, src_cpu ) __field( int, src_nid ) __field( pid_t, dst_pid ) __field( pid_t, dst_tgid ) __field( pid_t, dst_ngid ) __field( int, dst_cpu ) __field( int, dst_nid ) ), TP_fast_assign( __entry->src_pid = task_pid_nr(src_tsk); __entry->src_tgid = task_tgid_nr(src_tsk); __entry->src_ngid = task_numa_group_id(src_tsk); __entry->src_cpu = src_cpu; __entry->src_nid = cpu_to_node(src_cpu); __entry->dst_pid = dst_tsk ? task_pid_nr(dst_tsk) : 0; __entry->dst_tgid = dst_tsk ? task_tgid_nr(dst_tsk) : 0; __entry->dst_ngid = dst_tsk ? task_numa_group_id(dst_tsk) : 0; __entry->dst_cpu = dst_cpu; __entry->dst_nid = dst_cpu >= 0 ? cpu_to_node(dst_cpu) : -1; ), TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d", __entry->src_pid, __entry->src_tgid, __entry->src_ngid, __entry->src_cpu, __entry->src_nid, __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, __entry->dst_cpu, __entry->dst_nid) ); DEFINE_EVENT(sched_numa_pair_template, sched_stick_numa, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) ); DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) ); #ifdef CONFIG_NUMA_BALANCING #define NUMAB_SKIP_REASON \ EM( NUMAB_SKIP_UNSUITABLE, "unsuitable" ) \ EM( NUMAB_SKIP_SHARED_RO, "shared_ro" ) \ EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \ EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \ EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \ EM( NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) \ EMe(NUMAB_SKIP_SEQ_COMPLETED, "seq_completed" ) /* Redefine for export. */ #undef EM #undef EMe #define EM(a, b) TRACE_DEFINE_ENUM(a); #define EMe(a, b) TRACE_DEFINE_ENUM(a); NUMAB_SKIP_REASON /* Redefine for symbolic printing. */ #undef EM #undef EMe #define EM(a, b) { a, b }, #define EMe(a, b) { a, b } TRACE_EVENT(sched_skip_vma_numa, TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma, enum numa_vmaskip_reason reason), TP_ARGS(mm, vma, reason), TP_STRUCT__entry( __field(unsigned long, numa_scan_offset) __field(unsigned long, vm_start) __field(unsigned long, vm_end) __field(enum numa_vmaskip_reason, reason) ), TP_fast_assign( __entry->numa_scan_offset = mm->numa_scan_offset; __entry->vm_start = vma->vm_start; __entry->vm_end = vma->vm_end; __entry->reason = reason; ), TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s", __entry->numa_scan_offset, __entry->vm_start, __entry->vm_end, __print_symbolic(__entry->reason, NUMAB_SKIP_REASON)) ); #endif /* CONFIG_NUMA_BALANCING */ /* * Tracepoint for waking a polling cpu without an IPI. */ TRACE_EVENT(sched_wake_idle_without_ipi, TP_PROTO(int cpu), TP_ARGS(cpu), TP_STRUCT__entry( __field( int, cpu ) ), TP_fast_assign( __entry->cpu = cpu; ), TP_printk("cpu=%d", __entry->cpu) ); /* * Following tracepoints are not exported in tracefs and provide hooking * mechanisms only for testing and debugging purposes. * * Postfixed with _tp to make them easily identifiable in the code. */ DECLARE_TRACE(pelt_cfs_tp, TP_PROTO(struct cfs_rq *cfs_rq), TP_ARGS(cfs_rq)); DECLARE_TRACE(pelt_rt_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_dl_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_hw_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_irq_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_se_tp, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); DECLARE_TRACE(sched_cpu_capacity_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(sched_overutilized_tp, TP_PROTO(struct root_domain *rd, bool overutilized), TP_ARGS(rd, overutilized)); DECLARE_TRACE(sched_util_est_cfs_tp, TP_PROTO(struct cfs_rq *cfs_rq), TP_ARGS(cfs_rq)); DECLARE_TRACE(sched_util_est_se_tp, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); DECLARE_TRACE(sched_update_nr_running_tp, TP_PROTO(struct rq *rq, int change), TP_ARGS(rq, change)); DECLARE_TRACE(sched_compute_energy_tp, TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy, unsigned long max_util, unsigned long busy_time), TP_ARGS(p, dst_cpu, energy, max_util, busy_time)); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
294 292 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 // SPDX-License-Identifier: GPL-2.0-or-later /* auditsc.c -- System-call auditing support * Handles all system-call specific auditing features. * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright (C) 2005, 2006 IBM Corporation * All Rights Reserved. * * Written by Rickard E. (Rik) Faith <faith@redhat.com> * * Many of the ideas implemented here are from Stephen C. Tweedie, * especially the idea of avoiding a copy by using getname. * * The method for actual interception of syscall entry and exit (not in * this file -- see entry.S) is based on a GPL'd patch written by * okir@suse.de and Copyright 2003 SuSE Linux AG. * * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>, * 2006. * * The support of additional filter rules compares (>, <, >=, <=) was * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. * * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional * filesystem information. * * Subject and object context labeling support added by <danjones@us.ibm.com> * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/init.h> #include <asm/types.h> #include <linux/atomic.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/mount.h> #include <linux/socket.h> #include <linux/mqueue.h> #include <linux/audit.h> #include <linux/personality.h> #include <linux/time.h> #include <linux/netlink.h> #include <linux/compiler.h> #include <asm/unistd.h> #include <linux/security.h> #include <linux/list.h> #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/syscalls.h> #include <asm/syscall.h> #include <linux/capability.h> #include <linux/fs_struct.h> #include <linux/compat.h> #include <linux/ctype.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/fsnotify_backend.h> #include <uapi/linux/limits.h> #include <uapi/linux/netfilter/nf_tables.h> #include <uapi/linux/openat2.h> // struct open_how #include <uapi/linux/fanotify.h> #include "audit.h" /* flags stating the success for a syscall */ #define AUDITSC_INVALID 0 #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 /* no execve audit message should be longer than this (userspace limits), * see the note near the top of audit_log_execve_info() about this value */ #define MAX_EXECVE_AUDIT_LEN 7500 /* max length to print of cmdline/proctitle value during audit */ #define MAX_PROCTITLE_AUDIT_LEN 128 /* number of audit rules */ int audit_n_rules; /* determines whether we collect data for signals sent */ int audit_signals; struct audit_aux_data { struct audit_aux_data *next; int type; }; /* Number of target pids per aux struct. */ #define AUDIT_AUX_PIDS 16 struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; kuid_t target_auid[AUDIT_AUX_PIDS]; kuid_t target_uid[AUDIT_AUX_PIDS]; unsigned int target_sessionid[AUDIT_AUX_PIDS]; struct lsm_prop target_ref[AUDIT_AUX_PIDS]; char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; int pid_count; }; struct audit_aux_data_bprm_fcaps { struct audit_aux_data d; struct audit_cap_data fcap; unsigned int fcap_ver; struct audit_cap_data old_pcap; struct audit_cap_data new_pcap; }; struct audit_tree_refs { struct audit_tree_refs *next; struct audit_chunk *c[31]; }; struct audit_nfcfgop_tab { enum audit_nfcfgop op; const char *s; }; static const struct audit_nfcfgop_tab audit_nfcfgs[] = { { AUDIT_XT_OP_REGISTER, "xt_register" }, { AUDIT_XT_OP_REPLACE, "xt_replace" }, { AUDIT_XT_OP_UNREGISTER, "xt_unregister" }, { AUDIT_NFT_OP_TABLE_REGISTER, "nft_register_table" }, { AUDIT_NFT_OP_TABLE_UNREGISTER, "nft_unregister_table" }, { AUDIT_NFT_OP_CHAIN_REGISTER, "nft_register_chain" }, { AUDIT_NFT_OP_CHAIN_UNREGISTER, "nft_unregister_chain" }, { AUDIT_NFT_OP_RULE_REGISTER, "nft_register_rule" }, { AUDIT_NFT_OP_RULE_UNREGISTER, "nft_unregister_rule" }, { AUDIT_NFT_OP_SET_REGISTER, "nft_register_set" }, { AUDIT_NFT_OP_SET_UNREGISTER, "nft_unregister_set" }, { AUDIT_NFT_OP_SETELEM_REGISTER, "nft_register_setelem" }, { AUDIT_NFT_OP_SETELEM_UNREGISTER, "nft_unregister_setelem" }, { AUDIT_NFT_OP_GEN_REGISTER, "nft_register_gen" }, { AUDIT_NFT_OP_OBJ_REGISTER, "nft_register_obj" }, { AUDIT_NFT_OP_OBJ_UNREGISTER, "nft_unregister_obj" }, { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" }, { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" }, { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" }, { AUDIT_NFT_OP_SETELEM_RESET, "nft_reset_setelem" }, { AUDIT_NFT_OP_RULE_RESET, "nft_reset_rule" }, { AUDIT_NFT_OP_INVALID, "nft_invalid" }, }; static int audit_match_perm(struct audit_context *ctx, int mask) { unsigned n; if (unlikely(!ctx)) return 0; n = ctx->major; switch (audit_classify_syscall(ctx->arch, n)) { case AUDITSC_NATIVE: if ((mask & AUDIT_PERM_WRITE) && audit_match_class(AUDIT_CLASS_WRITE, n)) return 1; if ((mask & AUDIT_PERM_READ) && audit_match_class(AUDIT_CLASS_READ, n)) return 1; if ((mask & AUDIT_PERM_ATTR) && audit_match_class(AUDIT_CLASS_CHATTR, n)) return 1; return 0; case AUDITSC_COMPAT: /* 32bit on biarch */ if ((mask & AUDIT_PERM_WRITE) && audit_match_class(AUDIT_CLASS_WRITE_32, n)) return 1; if ((mask & AUDIT_PERM_READ) && audit_match_class(AUDIT_CLASS_READ_32, n)) return 1; if ((mask & AUDIT_PERM_ATTR) && audit_match_class(AUDIT_CLASS_CHATTR_32, n)) return 1; return 0; case AUDITSC_OPEN: return mask & ACC_MODE(ctx->argv[1]); case AUDITSC_OPENAT: return mask & ACC_MODE(ctx->argv[2]); case AUDITSC_SOCKETCALL: return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND); case AUDITSC_EXECVE: return mask & AUDIT_PERM_EXEC; case AUDITSC_OPENAT2: return mask & ACC_MODE((u32)ctx->openat2.flags); default: return 0; } } static int audit_match_filetype(struct audit_context *ctx, int val) { struct audit_names *n; umode_t mode = (umode_t)val; if (unlikely(!ctx)) return 0; list_for_each_entry(n, &ctx->names_list, list) { if ((n->ino != AUDIT_INO_UNSET) && ((n->mode & S_IFMT) == mode)) return 1; } return 0; } /* * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *; * ->first_trees points to its beginning, ->trees - to the current end of data. * ->tree_count is the number of free entries in array pointed to by ->trees. * Original condition is (NULL, NULL, 0); as soon as it grows we never revert to NULL, * "empty" becomes (p, p, 31) afterwards. We don't shrink the list (and seriously, * it's going to remain 1-element for almost any setup) until we free context itself. * References in it _are_ dropped - at the same time we free/drop aux stuff. */ static void audit_set_auditable(struct audit_context *ctx) { if (!ctx->prio) { ctx->prio = 1; ctx->current_state = AUDIT_STATE_RECORD; } } static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk) { struct audit_tree_refs *p = ctx->trees; int left = ctx->tree_count; if (likely(left)) { p->c[--left] = chunk; ctx->tree_count = left; return 1; } if (!p) return 0; p = p->next; if (p) { p->c[30] = chunk; ctx->trees = p; ctx->tree_count = 30; return 1; } return 0; } static int grow_tree_refs(struct audit_context *ctx) { struct audit_tree_refs *p = ctx->trees; ctx->trees = kzalloc(sizeof(struct audit_tree_refs), GFP_KERNEL); if (!ctx->trees) { ctx->trees = p; return 0; } if (p) p->next = ctx->trees; else ctx->first_trees = ctx->trees; ctx->tree_count = 31; return 1; } static void unroll_tree_refs(struct audit_context *ctx, struct audit_tree_refs *p, int count) { struct audit_tree_refs *q; int n; if (!p) { /* we started with empty chain */ p = ctx->first_trees; count = 31; /* if the very first allocation has failed, nothing to do */ if (!p) return; } n = count; for (q = p; q != ctx->trees; q = q->next, n = 31) { while (n--) { audit_put_chunk(q->c[n]); q->c[n] = NULL; } } while (n-- > ctx->tree_count) { audit_put_chunk(q->c[n]); q->c[n] = NULL; } ctx->trees = p; ctx->tree_count = count; } static void free_tree_refs(struct audit_context *ctx) { struct audit_tree_refs *p, *q; for (p = ctx->first_trees; p; p = q) { q = p->next; kfree(p); } } static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) { struct audit_tree_refs *p; int n; if (!tree) return 0; /* full ones */ for (p = ctx->first_trees; p != ctx->trees; p = p->next) { for (n = 0; n < 31; n++) if (audit_tree_match(p->c[n], tree)) return 1; } /* partial */ if (p) { for (n = ctx->tree_count; n < 31; n++) if (audit_tree_match(p->c[n], tree)) return 1; } return 0; } static int audit_compare_uid(kuid_t uid, struct audit_names *name, struct audit_field *f, struct audit_context *ctx) { struct audit_names *n; int rc; if (name) { rc = audit_uid_comparator(uid, f->op, name->uid); if (rc) return rc; } if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { rc = audit_uid_comparator(uid, f->op, n->uid); if (rc) return rc; } } return 0; } static int audit_compare_gid(kgid_t gid, struct audit_names *name, struct audit_field *f, struct audit_context *ctx) { struct audit_names *n; int rc; if (name) { rc = audit_gid_comparator(gid, f->op, name->gid); if (rc) return rc; } if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { rc = audit_gid_comparator(gid, f->op, n->gid); if (rc) return rc; } } return 0; } static int audit_field_compare(struct task_struct *tsk, const struct cred *cred, struct audit_field *f, struct audit_context *ctx, struct audit_names *name) { switch (f->val) { /* process to file object comparisons */ case AUDIT_COMPARE_UID_TO_OBJ_UID: return audit_compare_uid(cred->uid, name, f, ctx); case AUDIT_COMPARE_GID_TO_OBJ_GID: return audit_compare_gid(cred->gid, name, f, ctx); case AUDIT_COMPARE_EUID_TO_OBJ_UID: return audit_compare_uid(cred->euid, name, f, ctx); case AUDIT_COMPARE_EGID_TO_OBJ_GID: return audit_compare_gid(cred->egid, name, f, ctx); case AUDIT_COMPARE_AUID_TO_OBJ_UID: return audit_compare_uid(audit_get_loginuid(tsk), name, f, ctx); case AUDIT_COMPARE_SUID_TO_OBJ_UID: return audit_compare_uid(cred->suid, name, f, ctx); case AUDIT_COMPARE_SGID_TO_OBJ_GID: return audit_compare_gid(cred->sgid, name, f, ctx); case AUDIT_COMPARE_FSUID_TO_OBJ_UID: return audit_compare_uid(cred->fsuid, name, f, ctx); case AUDIT_COMPARE_FSGID_TO_OBJ_GID: return audit_compare_gid(cred->fsgid, name, f, ctx); /* uid comparisons */ case AUDIT_COMPARE_UID_TO_AUID: return audit_uid_comparator(cred->uid, f->op, audit_get_loginuid(tsk)); case AUDIT_COMPARE_UID_TO_EUID: return audit_uid_comparator(cred->uid, f->op, cred->euid); case AUDIT_COMPARE_UID_TO_SUID: return audit_uid_comparator(cred->uid, f->op, cred->suid); case AUDIT_COMPARE_UID_TO_FSUID: return audit_uid_comparator(cred->uid, f->op, cred->fsuid); /* auid comparisons */ case AUDIT_COMPARE_AUID_TO_EUID: return audit_uid_comparator(audit_get_loginuid(tsk), f->op, cred->euid); case AUDIT_COMPARE_AUID_TO_SUID: return audit_uid_comparator(audit_get_loginuid(tsk), f->op, cred->suid); case AUDIT_COMPARE_AUID_TO_FSUID: return audit_uid_comparator(audit_get_loginuid(tsk), f->op, cred->fsuid); /* euid comparisons */ case AUDIT_COMPARE_EUID_TO_SUID: return audit_uid_comparator(cred->euid, f->op, cred->suid); case AUDIT_COMPARE_EUID_TO_FSUID: return audit_uid_comparator(cred->euid, f->op, cred->fsuid); /* suid comparisons */ case AUDIT_COMPARE_SUID_TO_FSUID: return audit_uid_comparator(cred->suid, f->op, cred->fsuid); /* gid comparisons */ case AUDIT_COMPARE_GID_TO_EGID: return audit_gid_comparator(cred->gid, f->op, cred->egid); case AUDIT_COMPARE_GID_TO_SGID: return audit_gid_comparator(cred->gid, f->op, cred->sgid); case AUDIT_COMPARE_GID_TO_FSGID: return audit_gid_comparator(cred->gid, f->op, cred->fsgid); /* egid comparisons */ case AUDIT_COMPARE_EGID_TO_SGID: return audit_gid_comparator(cred->egid, f->op, cred->sgid); case AUDIT_COMPARE_EGID_TO_FSGID: return audit_gid_comparator(cred->egid, f->op, cred->fsgid); /* sgid comparison */ case AUDIT_COMPARE_SGID_TO_FSGID: return audit_gid_comparator(cred->sgid, f->op, cred->fsgid); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; } return 0; } /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. * * If task_creation is true, this is an explicit indication that we are * filtering a task rule at task creation time. This and tsk == current are * the only situations where tsk->cred may be accessed without an rcu read lock. */ static int audit_filter_rules(struct task_struct *tsk, struct audit_krule *rule, struct audit_context *ctx, struct audit_names *name, enum audit_state *state, bool task_creation) { const struct cred *cred; int i, need_sid = 1; struct lsm_prop prop = { }; unsigned int sessionid; if (ctx && rule->prio <= ctx->prio) return 0; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; struct audit_names *n; int result = 0; pid_t pid; switch (f->type) { case AUDIT_PID: pid = task_tgid_nr(tsk); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: if (ctx) { if (!ctx->ppid) ctx->ppid = task_ppid_nr(tsk); result = audit_comparator(ctx->ppid, f->op, f->val); } break; case AUDIT_EXE: result = audit_exe_compare(tsk, rule->exe); if (f->op == Audit_not_equal) result = !result; break; case AUDIT_UID: result = audit_uid_comparator(cred->uid, f->op, f->uid); break; case AUDIT_EUID: result = audit_uid_comparator(cred->euid, f->op, f->uid); break; case AUDIT_SUID: result = audit_uid_comparator(cred->suid, f->op, f->uid); break; case AUDIT_FSUID: result = audit_uid_comparator(cred->fsuid, f->op, f->uid); break; case AUDIT_GID: result = audit_gid_comparator(cred->gid, f->op, f->gid); if (f->op == Audit_equal) { if (!result) result = groups_search(cred->group_info, f->gid); } else if (f->op == Audit_not_equal) { if (result) result = !groups_search(cred->group_info, f->gid); } break; case AUDIT_EGID: result = audit_gid_comparator(cred->egid, f->op, f->gid); if (f->op == Audit_equal) { if (!result) result = groups_search(cred->group_info, f->gid); } else if (f->op == Audit_not_equal) { if (result) result = !groups_search(cred->group_info, f->gid); } break; case AUDIT_SGID: result = audit_gid_comparator(cred->sgid, f->op, f->gid); break; case AUDIT_FSGID: result = audit_gid_comparator(cred->fsgid, f->op, f->gid); break; case AUDIT_SESSIONID: sessionid = audit_get_sessionid(tsk); result = audit_comparator(sessionid, f->op, f->val); break; case AUDIT_PERS: result = audit_comparator(tsk->personality, f->op, f->val); break; case AUDIT_ARCH: if (ctx) result = audit_comparator(ctx->arch, f->op, f->val); break; case AUDIT_EXIT: if (ctx && ctx->return_valid != AUDITSC_INVALID) result = audit_comparator(ctx->return_code, f->op, f->val); break; case AUDIT_SUCCESS: if (ctx && ctx->return_valid != AUDITSC_INVALID) { if (f->val) result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS); else result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE); } break; case AUDIT_DEVMAJOR: if (name) { if (audit_comparator(MAJOR(name->dev), f->op, f->val) || audit_comparator(MAJOR(name->rdev), f->op, f->val)) ++result; } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_comparator(MAJOR(n->dev), f->op, f->val) || audit_comparator(MAJOR(n->rdev), f->op, f->val)) { ++result; break; } } } break; case AUDIT_DEVMINOR: if (name) { if (audit_comparator(MINOR(name->dev), f->op, f->val) || audit_comparator(MINOR(name->rdev), f->op, f->val)) ++result; } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_comparator(MINOR(n->dev), f->op, f->val) || audit_comparator(MINOR(n->rdev), f->op, f->val)) { ++result; break; } } } break; case AUDIT_INODE: if (name) result = audit_comparator(name->ino, f->op, f->val); else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_comparator(n->ino, f->op, f->val)) { ++result; break; } } } break; case AUDIT_OBJ_UID: if (name) { result = audit_uid_comparator(name->uid, f->op, f->uid); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_uid_comparator(n->uid, f->op, f->uid)) { ++result; break; } } } break; case AUDIT_OBJ_GID: if (name) { result = audit_gid_comparator(name->gid, f->op, f->gid); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_gid_comparator(n->gid, f->op, f->gid)) { ++result; break; } } } break; case AUDIT_WATCH: if (name) { result = audit_watch_compare(rule->watch, name->ino, name->dev); if (f->op == Audit_not_equal) result = !result; } break; case AUDIT_DIR: if (ctx) { result = match_tree_refs(ctx, rule->tree); if (f->op == Audit_not_equal) result = !result; } break; case AUDIT_LOGINUID: result = audit_uid_comparator(audit_get_loginuid(tsk), f->op, f->uid); break; case AUDIT_LOGINUID_SET: result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); break; case AUDIT_SADDR_FAM: if (ctx && ctx->sockaddr) result = audit_comparator(ctx->sockaddr->ss_family, f->op, f->val); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: /* NOTE: this may return negative values indicating a temporary error. We simply treat this as a match for now to avoid losing information that may be wanted. An error message will also be logged upon error */ if (f->lsm_rule) { if (need_sid) { /* @tsk should always be equal to * @current with the exception of * fork()/copy_process() in which case * the new @tsk creds are still a dup * of @current's creds so we can still * use * security_current_getlsmprop_subj() * here even though it always refs * @current's creds */ security_current_getlsmprop_subj(&prop); need_sid = 0; } result = security_audit_rule_match(&prop, f->type, f->op, f->lsm_rule); } break; case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR also applies here */ if (f->lsm_rule) { /* Find files that match */ if (name) { result = security_audit_rule_match( &name->oprop, f->type, f->op, f->lsm_rule); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (security_audit_rule_match( &n->oprop, f->type, f->op, f->lsm_rule)) { ++result; break; } } } /* Find ipc objects that match */ if (!ctx || ctx->type != AUDIT_IPC) break; if (security_audit_rule_match(&ctx->ipc.oprop, f->type, f->op, f->lsm_rule)) ++result; } break; case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: if (ctx) result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); break; case AUDIT_FILTERKEY: /* ignore this field for filtering */ result = 1; break; case AUDIT_PERM: result = audit_match_perm(ctx, f->val); if (f->op == Audit_not_equal) result = !result; break; case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); if (f->op == Audit_not_equal) result = !result; break; case AUDIT_FIELD_COMPARE: result = audit_field_compare(tsk, cred, f, ctx, name); break; } if (!result) return 0; } if (ctx) { if (rule->filterkey) { kfree(ctx->filterkey); ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); } ctx->prio = rule->prio; } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_STATE_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_STATE_RECORD; break; } return 1; } /* At process creation time, we can determine if system-call auditing is * completely disabled for this task. Since we only have the task * structure at this point, we can only check uid and gid. */ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) { struct audit_entry *e; enum audit_state state; rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state, true)) { if (state == AUDIT_STATE_RECORD) *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); rcu_read_unlock(); return state; } } rcu_read_unlock(); return AUDIT_STATE_BUILD; } static int audit_in_mask(const struct audit_krule *rule, unsigned long val) { int word, bit; if (val > 0xffffffff) return false; word = AUDIT_WORD(val); if (word >= AUDIT_BITMASK_SIZE) return false; bit = AUDIT_BIT(val); return rule->mask[word] & bit; } /** * __audit_filter_op - common filter helper for operations (syscall/uring/etc) * @tsk: associated task * @ctx: audit context * @list: audit filter list * @name: audit_name (can be NULL) * @op: current syscall/uring_op * * Run the udit filters specified in @list against @tsk using @ctx, * @name, and @op, as necessary; the caller is responsible for ensuring * that the call is made while the RCU read lock is held. The @name * parameter can be NULL, but all others must be specified. * Returns 1/true if the filter finds a match, 0/false if none are found. */ static int __audit_filter_op(struct task_struct *tsk, struct audit_context *ctx, struct list_head *list, struct audit_names *name, unsigned long op) { struct audit_entry *e; enum audit_state state; list_for_each_entry_rcu(e, list, list) { if (audit_in_mask(&e->rule, op) && audit_filter_rules(tsk, &e->rule, ctx, name, &state, false)) { ctx->current_state = state; return 1; } } return 0; } /** * audit_filter_uring - apply filters to an io_uring operation * @tsk: associated task * @ctx: audit context */ static void audit_filter_uring(struct task_struct *tsk, struct audit_context *ctx) { if (auditd_test_task(tsk)) return; rcu_read_lock(); __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_URING_EXIT], NULL, ctx->uring_op); rcu_read_unlock(); } /* At syscall exit time, this filter is called if the audit_state is * not low enough that auditing cannot take place, but is also not * high enough that we already know we have to write an audit record * (i.e., the state is AUDIT_STATE_BUILD). */ static void audit_filter_syscall(struct task_struct *tsk, struct audit_context *ctx) { if (auditd_test_task(tsk)) return; rcu_read_lock(); __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_EXIT], NULL, ctx->major); rcu_read_unlock(); } /* * Given an audit_name check the inode hash table to see if they match. * Called holding the rcu read lock to protect the use of audit_inode_hash */ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_names *n, struct audit_context *ctx) { int h = audit_hash_ino((u32)n->ino); struct list_head *list = &audit_inode_hash[h]; return __audit_filter_op(tsk, ctx, list, n, ctx->major); } /* At syscall exit time, this filter is called if any audit_names have been * collected during syscall processing. We only check rules in sublists at hash * buckets applicable to the inode numbers in audit_names. * Regarding audit_state, same rules apply as for audit_filter_syscall(). */ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { struct audit_names *n; if (auditd_test_task(tsk)) return; rcu_read_lock(); list_for_each_entry(n, &ctx->names_list, list) { if (audit_filter_inode_name(tsk, n, ctx)) break; } rcu_read_unlock(); } static inline void audit_proctitle_free(struct audit_context *context) { kfree(context->proctitle.value); context->proctitle.value = NULL; context->proctitle.len = 0; } static inline void audit_free_module(struct audit_context *context) { if (context->type == AUDIT_KERN_MODULE) { kfree(context->module.name); context->module.name = NULL; } } static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; list_for_each_entry_safe(n, next, &context->names_list, list) { list_del(&n->list); if (n->name) putname(n->name); if (n->should_free) kfree(n); } context->name_count = 0; path_put(&context->pwd); context->pwd.dentry = NULL; context->pwd.mnt = NULL; } static inline void audit_free_aux(struct audit_context *context) { struct audit_aux_data *aux; while ((aux = context->aux)) { context->aux = aux->next; kfree(aux); } context->aux = NULL; while ((aux = context->aux_pids)) { context->aux_pids = aux->next; kfree(aux); } context->aux_pids = NULL; } /** * audit_reset_context - reset a audit_context structure * @ctx: the audit_context to reset * * All fields in the audit_context will be reset to an initial state, all * references held by fields will be dropped, and private memory will be * released. When this function returns the audit_context will be suitable * for reuse, so long as the passed context is not NULL or a dummy context. */ static void audit_reset_context(struct audit_context *ctx) { if (!ctx) return; /* if ctx is non-null, reset the "ctx->context" regardless */ ctx->context = AUDIT_CTX_UNUSED; if (ctx->dummy) return; /* * NOTE: It shouldn't matter in what order we release the fields, so * release them in the order in which they appear in the struct; * this gives us some hope of quickly making sure we are * resetting the audit_context properly. * * Other things worth mentioning: * - we don't reset "dummy" * - we don't reset "state", we do reset "current_state" * - we preserve "filterkey" if "state" is AUDIT_STATE_RECORD * - much of this is likely overkill, but play it safe for now * - we really need to work on improving the audit_context struct */ ctx->current_state = ctx->state; ctx->serial = 0; ctx->major = 0; ctx->uring_op = 0; ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 }; memset(ctx->argv, 0, sizeof(ctx->argv)); ctx->return_code = 0; ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0); ctx->return_valid = AUDITSC_INVALID; audit_free_names(ctx); if (ctx->state != AUDIT_STATE_RECORD) { kfree(ctx->filterkey); ctx->filterkey = NULL; } audit_free_aux(ctx); kfree(ctx->sockaddr); ctx->sockaddr = NULL; ctx->sockaddr_len = 0; ctx->ppid = 0; ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0); ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0); ctx->personality = 0; ctx->arch = 0; ctx->target_pid = 0; ctx->target_auid = ctx->target_uid = KUIDT_INIT(0); ctx->target_sessionid = 0; lsmprop_init(&ctx->target_ref); ctx->target_comm[0] = '\0'; unroll_tree_refs(ctx, NULL, 0); WARN_ON(!list_empty(&ctx->killed_trees)); audit_free_module(ctx); ctx->fds[0] = -1; ctx->type = 0; /* reset last for audit_free_*() */ } static inline struct audit_context *audit_alloc_context(enum audit_state state) { struct audit_context *context; context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return NULL; context->context = AUDIT_CTX_UNUSED; context->state = state; context->prio = state == AUDIT_STATE_RECORD ? ~0ULL : 0; INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); context->fds[0] = -1; context->return_valid = AUDITSC_INVALID; return context; } /** * audit_alloc - allocate an audit context block for a task * @tsk: task * * Filter on the task information and allocate a per-task audit context * if necessary. Doing so turns on system call auditing for the * specified task. This is called from copy_process, so no lock is * needed. */ int audit_alloc(struct task_struct *tsk) { struct audit_context *context; enum audit_state state; char *key = NULL; if (likely(!audit_ever_enabled)) return 0; state = audit_filter_task(tsk, &key); if (state == AUDIT_STATE_DISABLED) { clear_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } context = audit_alloc_context(state); if (!context) { kfree(key); audit_log_lost("out of memory in audit_alloc"); return -ENOMEM; } context->filterkey = key; audit_set_context(tsk, context); set_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } static inline void audit_free_context(struct audit_context *context) { /* resetting is extra work, but it is likely just noise */ audit_reset_context(context); audit_proctitle_free(context); free_tree_refs(context); kfree(context->filterkey); kfree(context); } static int audit_log_pid_context(struct audit_context *context, pid_t pid, kuid_t auid, kuid_t uid, unsigned int sessionid, struct lsm_prop *prop, char *comm) { struct audit_buffer *ab; char *ctx = NULL; u32 len; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); if (!ab) return rc; audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); if (lsmprop_is_set(prop)) { if (security_lsmprop_to_secctx(prop, &ctx, &len)) { audit_log_format(ab, " obj=(none)"); rc = 1; } else { audit_log_format(ab, " obj=%s", ctx); security_release_secctx(ctx, len); } } audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab); return rc; } static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab) { long len_max; long len_rem; long len_full; long len_buf; long len_abuf = 0; long len_tmp; bool require_data; bool encode; unsigned int iter; unsigned int arg; char *buf_head; char *buf; const char __user *p = (const char __user *)current->mm->arg_start; /* NOTE: this buffer needs to be large enough to hold all the non-arg * data we put in the audit record for this argument (see the * code below) ... at this point in time 96 is plenty */ char abuf[96]; /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the * current value of 7500 is not as important as the fact that it * is less than 8k, a setting of 7500 gives us plenty of wiggle * room if we go over a little bit in the logging below */ WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); len_max = MAX_EXECVE_AUDIT_LEN; /* scratch buffer to hold the userspace args */ buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); if (!buf_head) { audit_panic("out of memory for argv string"); return; } buf = buf_head; audit_log_format(*ab, "argc=%d", context->execve.argc); len_rem = len_max; len_buf = 0; len_full = 0; require_data = true; encode = false; iter = 0; arg = 0; do { /* NOTE: we don't ever want to trust this value for anything * serious, but the audit record format insists we * provide an argument length for really long arguments, * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but * to use strncpy_from_user() to obtain this value for * recording in the log, although we don't use it * anywhere here to avoid a double-fetch problem */ if (len_full == 0) len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; /* read more data from userspace */ if (require_data) { /* can we make more room in the buffer? */ if (buf != buf_head) { memmove(buf_head, buf, len_buf); buf = buf_head; } /* fetch as much as we can of the argument */ len_tmp = strncpy_from_user(&buf_head[len_buf], p, len_max - len_buf); if (len_tmp == -EFAULT) { /* unable to copy from userspace */ send_sig(SIGKILL, current, 0); goto out; } else if (len_tmp == (len_max - len_buf)) { /* buffer is not large enough */ require_data = true; /* NOTE: if we are going to span multiple * buffers force the encoding so we stand * a chance at a sane len_full value and * consistent record encoding */ encode = true; len_full = len_full * 2; p += len_tmp; } else { require_data = false; if (!encode) encode = audit_string_contains_control( buf, len_tmp); /* try to use a trusted value for len_full */ if (len_full < len_max) len_full = (encode ? len_tmp * 2 : len_tmp); p += len_tmp + 1; } len_buf += len_tmp; buf_head[len_buf] = '\0'; /* length of the buffer in the audit record? */ len_abuf = (encode ? len_buf * 2 : len_buf + 2); } /* write as much as we can to the audit log */ if (len_buf >= 0) { /* NOTE: some magic numbers here - basically if we * can't fit a reasonable amount of data into the * existing audit buffer, flush it and start with * a new buffer */ if ((sizeof(abuf) + 8) > len_rem) { len_rem = len_max; audit_log_end(*ab); *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); if (!*ab) goto out; } /* create the non-arg portion of the arg record */ len_tmp = 0; if (require_data || (iter > 0) || ((len_abuf + sizeof(abuf)) > len_rem)) { if (iter == 0) { len_tmp += snprintf(&abuf[len_tmp], sizeof(abuf) - len_tmp, " a%d_len=%lu", arg, len_full); } len_tmp += snprintf(&abuf[len_tmp], sizeof(abuf) - len_tmp, " a%d[%d]=", arg, iter++); } else len_tmp += snprintf(&abuf[len_tmp], sizeof(abuf) - len_tmp, " a%d=", arg); WARN_ON(len_tmp >= sizeof(abuf)); abuf[sizeof(abuf) - 1] = '\0'; /* log the arg in the audit record */ audit_log_format(*ab, "%s", abuf); len_rem -= len_tmp; len_tmp = len_buf; if (encode) { if (len_abuf > len_rem) len_tmp = len_rem / 2; /* encoding */ audit_log_n_hex(*ab, buf, len_tmp); len_rem -= len_tmp * 2; len_abuf -= len_tmp * 2; } else { if (len_abuf > len_rem) len_tmp = len_rem - 2; /* quotes */ audit_log_n_string(*ab, buf, len_tmp); len_rem -= len_tmp + 2; /* don't subtract the "2" because we still need * to add quotes to the remaining string */ len_abuf -= len_tmp; } len_buf -= len_tmp; buf += len_tmp; } /* ready to move to the next argument? */ if ((len_buf == 0) && !require_data) { arg++; iter = 0; len_full = 0; require_data = true; encode = false; } } while (arg < context->execve.argc); /* NOTE: the caller handles the final audit_log_end() call */ out: kfree(buf_head); } static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) { if (cap_isclear(*cap)) { audit_log_format(ab, " %s=0", prefix); return; } audit_log_format(ab, " %s=%016llx", prefix, cap->val); } static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) { if (name->fcap_ver == -1) { audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? cap_fi=?"); return; } audit_log_cap(ab, "cap_fp", &name->fcap.permitted); audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d", name->fcap.fE, name->fcap_ver, from_kuid(&init_user_ns, name->fcap.rootid)); } static void audit_log_time(struct audit_context *context, struct audit_buffer **ab) { const struct audit_ntp_data *ntp = &context->time.ntp_data; const struct timespec64 *tk = &context->time.tk_injoffset; static const char * const ntp_name[] = { "offset", "freq", "status", "tai", "tick", "adjust", }; int type; if (context->type == AUDIT_TIME_ADJNTPVAL) { for (type = 0; type < AUDIT_NTP_NVALS; type++) { if (ntp->vals[type].newval != ntp->vals[type].oldval) { if (!*ab) { *ab = audit_log_start(context, GFP_KERNEL, AUDIT_TIME_ADJNTPVAL); if (!*ab) return; } audit_log_format(*ab, "op=%s old=%lli new=%lli", ntp_name[type], ntp->vals[type].oldval, ntp->vals[type].newval); audit_log_end(*ab); *ab = NULL; } } } if (tk->tv_sec != 0 || tk->tv_nsec != 0) { if (!*ab) { *ab = audit_log_start(context, GFP_KERNEL, AUDIT_TIME_INJOFFSET); if (!*ab) return; } audit_log_format(*ab, "sec=%lli nsec=%li", (long long)tk->tv_sec, tk->tv_nsec); audit_log_end(*ab); *ab = NULL; } } static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; int i; ab = audit_log_start(context, GFP_KERNEL, context->type); if (!ab) return; switch (context->type) { case AUDIT_SOCKETCALL: { int nargs = context->socketcall.nargs; audit_log_format(ab, "nargs=%d", nargs); for (i = 0; i < nargs; i++) audit_log_format(ab, " a%d=%lx", i, context->socketcall.args[i]); break; } case AUDIT_IPC: audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", from_kuid(&init_user_ns, context->ipc.uid), from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); if (lsmprop_is_set(&context->ipc.oprop)) { char *ctx = NULL; u32 len; if (security_lsmprop_to_secctx(&context->ipc.oprop, &ctx, &len)) { *call_panic = 1; } else { audit_log_format(ab, " obj=%s", ctx); security_release_secctx(ctx, len); } } if (context->ipc.has_perm) { audit_log_end(ab); ab = audit_log_start(context, GFP_KERNEL, AUDIT_IPC_SET_PERM); if (unlikely(!ab)) return; audit_log_format(ab, "qbytes=%lx ouid=%u ogid=%u mode=%#ho", context->ipc.qbytes, context->ipc.perm_uid, context->ipc.perm_gid, context->ipc.perm_mode); } break; case AUDIT_MQ_OPEN: audit_log_format(ab, "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " "mq_msgsize=%ld mq_curmsgs=%ld", context->mq_open.oflag, context->mq_open.mode, context->mq_open.attr.mq_flags, context->mq_open.attr.mq_maxmsg, context->mq_open.attr.mq_msgsize, context->mq_open.attr.mq_curmsgs); break; case AUDIT_MQ_SENDRECV: audit_log_format(ab, "mqdes=%d msg_len=%zd msg_prio=%u " "abs_timeout_sec=%lld abs_timeout_nsec=%ld", context->mq_sendrecv.mqdes, context->mq_sendrecv.msg_len, context->mq_sendrecv.msg_prio, (long long) context->mq_sendrecv.abs_timeout.tv_sec, context->mq_sendrecv.abs_timeout.tv_nsec); break; case AUDIT_MQ_NOTIFY: audit_log_format(ab, "mqdes=%d sigev_signo=%d", context->mq_notify.mqdes, context->mq_notify.sigev_signo); break; case AUDIT_MQ_GETSETATTR: { struct mq_attr *attr = &context->mq_getsetattr.mqstat; audit_log_format(ab, "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " "mq_curmsgs=%ld ", context->mq_getsetattr.mqdes, attr->mq_flags, attr->mq_maxmsg, attr->mq_msgsize, attr->mq_curmsgs); break; } case AUDIT_CAPSET: audit_log_format(ab, "pid=%d", context->capset.pid); audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient); break; case AUDIT_MMAP: audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, context->mmap.flags); break; case AUDIT_OPENAT2: audit_log_format(ab, "oflag=0%llo mode=0%llo resolve=0x%llx", context->openat2.flags, context->openat2.mode, context->openat2.resolve); break; case AUDIT_EXECVE: audit_log_execve_info(context, &ab); break; case AUDIT_KERN_MODULE: audit_log_format(ab, "name="); if (context->module.name) { audit_log_untrustedstring(ab, context->module.name); } else audit_log_format(ab, "(null)"); break; case AUDIT_TIME_ADJNTPVAL: case AUDIT_TIME_INJOFFSET: /* this call deviates from the rest, eating the buffer */ audit_log_time(context, &ab); break; } audit_log_end(ab); } static inline int audit_proctitle_rtrim(char *proctitle, int len) { char *end = proctitle + len - 1; while (end > proctitle && !isprint(*end)) end--; /* catch the case where proctitle is only 1 non-print character */ len = end - proctitle + 1; len -= isprint(proctitle[len-1]) == 0; return len; } /* * audit_log_name - produce AUDIT_PATH record from struct audit_names * @context: audit_context for the task * @n: audit_names structure with reportable details * @path: optional path to report instead of audit_names->name * @record_num: record number to report when handling a list of names * @call_panic: optional pointer to int that will be updated if secid fails */ static void audit_log_name(struct audit_context *context, struct audit_names *n, const struct path *path, int record_num, int *call_panic) { struct audit_buffer *ab; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); if (!ab) return; audit_log_format(ab, "item=%d", record_num); if (path) audit_log_d_path(ab, " name=", path); else if (n->name) { switch (n->name_len) { case AUDIT_NAME_FULL: /* log the full path */ audit_log_format(ab, " name="); audit_log_untrustedstring(ab, n->name->name); break; case 0: /* name was specified as a relative path and the * directory component is the cwd */ if (context->pwd.dentry && context->pwd.mnt) audit_log_d_path(ab, " name=", &context->pwd); else audit_log_format(ab, " name=(null)"); break; default: /* log the name's directory component */ audit_log_format(ab, " name="); audit_log_n_untrustedstring(ab, n->name->name, n->name_len); } } else audit_log_format(ab, " name=(null)"); if (n->ino != AUDIT_INO_UNSET) audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x", n->ino, MAJOR(n->dev), MINOR(n->dev), n->mode, from_kuid(&init_user_ns, n->uid), from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); if (lsmprop_is_set(&n->oprop)) { char *ctx = NULL; u32 len; if (security_lsmprop_to_secctx(&n->oprop, &ctx, &len)) { if (call_panic) *call_panic = 2; } else { audit_log_format(ab, " obj=%s", ctx); security_release_secctx(ctx, len); } } /* log the audit_names record type */ switch (n->type) { case AUDIT_TYPE_NORMAL: audit_log_format(ab, " nametype=NORMAL"); break; case AUDIT_TYPE_PARENT: audit_log_format(ab, " nametype=PARENT"); break; case AUDIT_TYPE_CHILD_DELETE: audit_log_format(ab, " nametype=DELETE"); break; case AUDIT_TYPE_CHILD_CREATE: audit_log_format(ab, " nametype=CREATE"); break; default: audit_log_format(ab, " nametype=UNKNOWN"); break; } audit_log_fcaps(ab, n); audit_log_end(ab); } static void audit_log_proctitle(void) { int res; char *buf; char *msg = "(null)"; int len = strlen(msg); struct audit_context *context = audit_context(); struct audit_buffer *ab; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); if (!ab) return; /* audit_panic or being filtered */ audit_log_format(ab, "proctitle="); /* Not cached */ if (!context->proctitle.value) { buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL); if (!buf) goto out; /* Historically called this from procfs naming */ res = get_cmdline(current, buf, MAX_PROCTITLE_AUDIT_LEN); if (res == 0) { kfree(buf); goto out; } res = audit_proctitle_rtrim(buf, res); if (res == 0) { kfree(buf); goto out; } context->proctitle.value = buf; context->proctitle.len = res; } msg = context->proctitle.value; len = context->proctitle.len; out: audit_log_n_untrustedstring(ab, msg, len); audit_log_end(ab); } /** * audit_log_uring - generate a AUDIT_URINGOP record * @ctx: the audit context */ static void audit_log_uring(struct audit_context *ctx) { struct audit_buffer *ab; const struct cred *cred; ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_URINGOP); if (!ab) return; cred = current_cred(); audit_log_format(ab, "uring_op=%d", ctx->uring_op); if (ctx->return_valid != AUDITSC_INVALID) audit_log_format(ab, " success=%s exit=%ld", str_yes_no(ctx->return_valid == AUDITSC_SUCCESS), ctx->return_code); audit_log_format(ab, " items=%d" " ppid=%d pid=%d uid=%u gid=%u euid=%u suid=%u" " fsuid=%u egid=%u sgid=%u fsgid=%u", ctx->name_count, task_ppid_nr(current), task_tgid_nr(current), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), from_kuid(&init_user_ns, cred->euid), from_kuid(&init_user_ns, cred->suid), from_kuid(&init_user_ns, cred->fsuid), from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid)); audit_log_task_context(ab); audit_log_key(ab, ctx->filterkey); audit_log_end(ab); } static void audit_log_exit(void) { int i, call_panic = 0; struct audit_context *context = audit_context(); struct audit_buffer *ab; struct audit_aux_data *aux; struct audit_names *n; context->personality = current->personality; switch (context->context) { case AUDIT_CTX_SYSCALL: ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); if (!ab) return; audit_log_format(ab, "arch=%x syscall=%d", context->arch, context->major); if (context->personality != PER_LINUX) audit_log_format(ab, " per=%lx", context->personality); if (context->return_valid != AUDITSC_INVALID) audit_log_format(ab, " success=%s exit=%ld", str_yes_no(context->return_valid == AUDITSC_SUCCESS), context->return_code); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d", context->argv[0], context->argv[1], context->argv[2], context->argv[3], context->name_count); audit_log_task_info(ab); audit_log_key(ab, context->filterkey); audit_log_end(ab); break; case AUDIT_CTX_URING: audit_log_uring(context); break; default: BUG(); break; } for (aux = context->aux; aux; aux = aux->next) { ab = audit_log_start(context, GFP_KERNEL, aux->type); if (!ab) continue; /* audit_panic has been called */ switch (aux->type) { case AUDIT_BPRM_FCAPS: { struct audit_aux_data_bprm_fcaps *axs = (void *)aux; audit_log_format(ab, "fver=%x", axs->fcap_ver); audit_log_cap(ab, "fp", &axs->fcap.permitted); audit_log_cap(ab, "fi", &axs->fcap.inheritable); audit_log_format(ab, " fe=%d", axs->fcap.fE); audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient); audit_log_cap(ab, "pp", &axs->new_pcap.permitted); audit_log_cap(ab, "pi", &axs->new_pcap.inheritable); audit_log_cap(ab, "pe", &axs->new_pcap.effective); audit_log_cap(ab, "pa", &axs->new_pcap.ambient); audit_log_format(ab, " frootid=%d", from_kuid(&init_user_ns, axs->fcap.rootid)); break; } } audit_log_end(ab); } if (context->type) show_special(context, &call_panic); if (context->fds[0] >= 0) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR); if (ab) { audit_log_format(ab, "fd0=%d fd1=%d", context->fds[0], context->fds[1]); audit_log_end(ab); } } if (context->sockaddr_len) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); if (ab) { audit_log_format(ab, "saddr="); audit_log_n_hex(ab, (void *)context->sockaddr, context->sockaddr_len); audit_log_end(ab); } } for (aux = context->aux_pids; aux; aux = aux->next) { struct audit_aux_data_pids *axs = (void *)aux; for (i = 0; i < axs->pid_count; i++) if (audit_log_pid_context(context, axs->target_pid[i], axs->target_auid[i], axs->target_uid[i], axs->target_sessionid[i], &axs->target_ref[i], axs->target_comm[i])) call_panic = 1; } if (context->target_pid && audit_log_pid_context(context, context->target_pid, context->target_auid, context->target_uid, context->target_sessionid, &context->target_ref, context->target_comm)) call_panic = 1; if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { audit_log_d_path(ab, "cwd=", &context->pwd); audit_log_end(ab); } } i = 0; list_for_each_entry(n, &context->names_list, list) { if (n->hidden) continue; audit_log_name(context, n, NULL, i++, &call_panic); } if (context->context == AUDIT_CTX_SYSCALL) audit_log_proctitle(); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); if (ab) audit_log_end(ab); if (call_panic) audit_panic("error in audit_log_exit()"); } /** * __audit_free - free a per-task audit context * @tsk: task whose audit context block to free * * Called from copy_process, do_exit, and the io_uring code */ void __audit_free(struct task_struct *tsk) { struct audit_context *context = tsk->audit_context; if (!context) return; /* this may generate CONFIG_CHANGE records */ if (!list_empty(&context->killed_trees)) audit_kill_trees(context); /* We are called either by do_exit() or the fork() error handling code; * in the former case tsk == current and in the latter tsk is a * random task_struct that doesn't have any meaningful data we * need to log via audit_log_exit(). */ if (tsk == current && !context->dummy) { context->return_valid = AUDITSC_INVALID; context->return_code = 0; if (context->context == AUDIT_CTX_SYSCALL) { audit_filter_syscall(tsk, context); audit_filter_inodes(tsk, context); if (context->current_state == AUDIT_STATE_RECORD) audit_log_exit(); } else if (context->context == AUDIT_CTX_URING) { /* TODO: verify this case is real and valid */ audit_filter_uring(tsk, context); audit_filter_inodes(tsk, context); if (context->current_state == AUDIT_STATE_RECORD) audit_log_uring(context); } } audit_set_context(tsk, NULL); audit_free_context(context); } /** * audit_return_fixup - fixup the return codes in the audit_context * @ctx: the audit_context * @success: true/false value to indicate if the operation succeeded or not * @code: operation return code * * We need to fixup the return code in the audit logs if the actual return * codes are later going to be fixed by the arch specific signal handlers. */ static void audit_return_fixup(struct audit_context *ctx, int success, long code) { /* * This is actually a test for: * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) || * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK) * * but is faster than a bunch of || */ if (unlikely(code <= -ERESTARTSYS) && (code >= -ERESTART_RESTARTBLOCK) && (code != -ENOIOCTLCMD)) ctx->return_code = -EINTR; else ctx->return_code = code; ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE); } /** * __audit_uring_entry - prepare the kernel task's audit context for io_uring * @op: the io_uring opcode * * This is similar to audit_syscall_entry() but is intended for use by io_uring * operations. This function should only ever be called from * audit_uring_entry() as we rely on the audit context checking present in that * function. */ void __audit_uring_entry(u8 op) { struct audit_context *ctx = audit_context(); if (ctx->state == AUDIT_STATE_DISABLED) return; /* * NOTE: It's possible that we can be called from the process' context * before it returns to userspace, and before audit_syscall_exit() * is called. In this case there is not much to do, just record * the io_uring details and return. */ ctx->uring_op = op; if (ctx->context == AUDIT_CTX_SYSCALL) return; ctx->dummy = !audit_n_rules; if (!ctx->dummy && ctx->state == AUDIT_STATE_BUILD) ctx->prio = 0; ctx->context = AUDIT_CTX_URING; ctx->current_state = ctx->state; ktime_get_coarse_real_ts64(&ctx->ctime); } /** * __audit_uring_exit - wrap up the kernel task's audit context after io_uring * @success: true/false value to indicate if the operation succeeded or not * @code: operation return code * * This is similar to audit_syscall_exit() but is intended for use by io_uring * operations. This function should only ever be called from * audit_uring_exit() as we rely on the audit context checking present in that * function. */ void __audit_uring_exit(int success, long code) { struct audit_context *ctx = audit_context(); if (ctx->dummy) { if (ctx->context != AUDIT_CTX_URING) return; goto out; } audit_return_fixup(ctx, success, code); if (ctx->context == AUDIT_CTX_SYSCALL) { /* * NOTE: See the note in __audit_uring_entry() about the case * where we may be called from process context before we * return to userspace via audit_syscall_exit(). In this * case we simply emit a URINGOP record and bail, the * normal syscall exit handling will take care of * everything else. * It is also worth mentioning that when we are called, * the current process creds may differ from the creds * used during the normal syscall processing; keep that * in mind if/when we move the record generation code. */ /* * We need to filter on the syscall info here to decide if we * should emit a URINGOP record. I know it seems odd but this * solves the problem where users have a filter to block *all* * syscall records in the "exit" filter; we want to preserve * the behavior here. */ audit_filter_syscall(current, ctx); if (ctx->current_state != AUDIT_STATE_RECORD) audit_filter_uring(current, ctx); audit_filter_inodes(current, ctx); if (ctx->current_state != AUDIT_STATE_RECORD) return; audit_log_uring(ctx); return; } /* this may generate CONFIG_CHANGE records */ if (!list_empty(&ctx->killed_trees)) audit_kill_trees(ctx); /* run through both filters to ensure we set the filterkey properly */ audit_filter_uring(current, ctx); audit_filter_inodes(current, ctx); if (ctx->current_state != AUDIT_STATE_RECORD) goto out; audit_log_exit(); out: audit_reset_context(ctx); } /** * __audit_syscall_entry - fill in an audit record at syscall entry * @major: major syscall type (function) * @a1: additional syscall register 1 * @a2: additional syscall register 2 * @a3: additional syscall register 3 * @a4: additional syscall register 4 * * Fill in audit context at syscall entry. This only happens if the * audit context was created when the task was created and the state or * filters demand the audit context be built. If the state from the * per-task filter or from the per-syscall filter is AUDIT_STATE_RECORD, * then the record will be written at syscall exit time (otherwise, it * will only be written if another part of the kernel requests that it * be written). */ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { struct audit_context *context = audit_context(); enum audit_state state; if (!audit_enabled || !context) return; WARN_ON(context->context != AUDIT_CTX_UNUSED); WARN_ON(context->name_count); if (context->context != AUDIT_CTX_UNUSED || context->name_count) { audit_panic("unrecoverable error in audit_syscall_entry()"); return; } state = context->state; if (state == AUDIT_STATE_DISABLED) return; context->dummy = !audit_n_rules; if (!context->dummy && state == AUDIT_STATE_BUILD) { context->prio = 0; if (auditd_test_task(current)) return; } context->arch = syscall_get_arch(current); context->major = major; context->argv[0] = a1; context->argv[1] = a2; context->argv[2] = a3; context->argv[3] = a4; context->context = AUDIT_CTX_SYSCALL; context->current_state = state; ktime_get_coarse_real_ts64(&context->ctime); } /** * __audit_syscall_exit - deallocate audit context after a system call * @success: success value of the syscall * @return_code: return value of the syscall * * Tear down after system call. If the audit context has been marked as * auditable (either because of the AUDIT_STATE_RECORD state from * filtering, or because some other part of the kernel wrote an audit * message), then write out the syscall information. In call cases, * free the names stored from getname(). */ void __audit_syscall_exit(int success, long return_code) { struct audit_context *context = audit_context(); if (!context || context->dummy || context->context != AUDIT_CTX_SYSCALL) goto out; /* this may generate CONFIG_CHANGE records */ if (!list_empty(&context->killed_trees)) audit_kill_trees(context); audit_return_fixup(context, success, return_code); /* run through both filters to ensure we set the filterkey properly */ audit_filter_syscall(current, context); audit_filter_inodes(current, context); if (context->current_state != AUDIT_STATE_RECORD) goto out; audit_log_exit(); out: audit_reset_context(context); } static inline void handle_one(const struct inode *inode) { struct audit_context *context; struct audit_tree_refs *p; struct audit_chunk *chunk; int count; if (likely(!inode->i_fsnotify_marks)) return; context = audit_context(); p = context->trees; count = context->tree_count; rcu_read_lock(); chunk = audit_tree_lookup(inode); rcu_read_unlock(); if (!chunk) return; if (likely(put_tree_ref(context, chunk))) return; if (unlikely(!grow_tree_refs(context))) { pr_warn("out of memory, audit has lost a tree reference\n"); audit_set_auditable(context); audit_put_chunk(chunk); unroll_tree_refs(context, p, count); return; } put_tree_ref(context, chunk); } static void handle_path(const struct dentry *dentry) { struct audit_context *context; struct audit_tree_refs *p; const struct dentry *d, *parent; struct audit_chunk *drop; unsigned long seq; int count; context = audit_context(); p = context->trees; count = context->tree_count; retry: drop = NULL; d = dentry; rcu_read_lock(); seq = read_seqbegin(&rename_lock); for (;;) { struct inode *inode = d_backing_inode(d); if (inode && unlikely(inode->i_fsnotify_marks)) { struct audit_chunk *chunk; chunk = audit_tree_lookup(inode); if (chunk) { if (unlikely(!put_tree_ref(context, chunk))) { drop = chunk; break; } } } parent = d->d_parent; if (parent == d) break; d = parent; } if (unlikely(read_seqretry(&rename_lock, seq) || drop)) { /* in this order */ rcu_read_unlock(); if (!drop) { /* just a race with rename */ unroll_tree_refs(context, p, count); goto retry; } audit_put_chunk(drop); if (grow_tree_refs(context)) { /* OK, got more space */ unroll_tree_refs(context, p, count); goto retry; } /* too bad */ pr_warn("out of memory, audit has lost a tree reference\n"); unroll_tree_refs(context, p, count); audit_set_auditable(context); return; } rcu_read_unlock(); } static struct audit_names *audit_alloc_name(struct audit_context *context, unsigned char type) { struct audit_names *aname; if (context->name_count < AUDIT_NAMES) { aname = &context->preallocated_names[context->name_count]; memset(aname, 0, sizeof(*aname)); } else { aname = kzalloc(sizeof(*aname), GFP_NOFS); if (!aname) return NULL; aname->should_free = true; } aname->ino = AUDIT_INO_UNSET; aname->type = type; list_add_tail(&aname->list, &context->names_list); context->name_count++; if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); return aname; } /** * __audit_reusename - fill out filename with info from existing entry * @uptr: userland ptr to pathname * * Search the audit_names list for the current audit context. If there is an * existing entry with a matching "uptr" then return the filename * associated with that audit_name. If not, return NULL. */ struct filename * __audit_reusename(const __user char *uptr) { struct audit_context *context = audit_context(); struct audit_names *n; list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; if (n->name->uptr == uptr) { atomic_inc(&n->name->refcnt); return n->name; } } return NULL; } /** * __audit_getname - add a name to the list * @name: name to add * * Add a name to the list of audit names for this context. * Called from fs/namei.c:getname(). */ void __audit_getname(struct filename *name) { struct audit_context *context = audit_context(); struct audit_names *n; if (context->context == AUDIT_CTX_UNUSED) return; n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; atomic_inc(&name->refcnt); } static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) { struct cpu_vfs_cap_data caps; int rc; if (!dentry) return 0; rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps); if (rc) return rc; name->fcap.permitted = caps.permitted; name->fcap.inheritable = caps.inheritable; name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); name->fcap.rootid = caps.rootid; name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; return 0; } /* Copy inode data into an audit_names. */ static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, struct inode *inode, unsigned int flags) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; name->mode = inode->i_mode; name->uid = inode->i_uid; name->gid = inode->i_gid; name->rdev = inode->i_rdev; security_inode_getlsmprop(inode, &name->oprop); if (flags & AUDIT_INODE_NOEVAL) { name->fcap_ver = -1; return; } audit_copy_fcaps(name, dentry); } /** * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited * @flags: attributes for this particular entry */ void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) { struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; struct audit_entry *e; struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; int i; if (context->context == AUDIT_CTX_UNUSED) return; rcu_read_lock(); list_for_each_entry_rcu(e, list, list) { for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; if (f->type == AUDIT_FSTYPE && audit_comparator(inode->i_sb->s_magic, f->op, f->val) && e->rule.action == AUDIT_NEVER) { rcu_read_unlock(); return; } } } rcu_read_unlock(); if (!name) goto out_alloc; /* * If we have a pointer to an audit_names entry already, then we can * just use it directly if the type is correct. */ n = name->aname; if (n) { if (parent) { if (n->type == AUDIT_TYPE_PARENT || n->type == AUDIT_TYPE_UNKNOWN) goto out; } else { if (n->type != AUDIT_TYPE_PARENT) goto out; } } list_for_each_entry_reverse(n, &context->names_list, list) { if (n->ino) { /* valid inode number, use that for the comparison */ if (n->ino != inode->i_ino || n->dev != inode->i_sb->s_dev) continue; } else if (n->name) { /* inode number has not been set, check the name */ if (strcmp(n->name->name, name->name)) continue; } else /* no inode and no name (?!) ... this is odd ... */ continue; /* match the correct record type */ if (parent) { if (n->type == AUDIT_TYPE_PARENT || n->type == AUDIT_TYPE_UNKNOWN) goto out; } else { if (n->type != AUDIT_TYPE_PARENT) goto out; } } out_alloc: /* unable to find an entry with both a matching name and type */ n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; if (name) { n->name = name; atomic_inc(&name->refcnt); } out: if (parent) { n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; n->type = AUDIT_TYPE_PARENT; if (flags & AUDIT_INODE_HIDDEN) n->hidden = true; } else { n->name_len = AUDIT_NAME_FULL; n->type = AUDIT_TYPE_NORMAL; } handle_path(dentry); audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL); } void __audit_file(const struct file *file) { __audit_inode(NULL, file->f_path.dentry, 0); } /** * __audit_inode_child - collect inode info for created/removed objects * @parent: inode of dentry parent * @dentry: dentry being audited * @type: AUDIT_TYPE_* value that we're looking for * * For syscalls that create or remove filesystem objects, audit_inode * can only collect information for the filesystem object's parent. * This call updates the audit context with the child's information. * Syscalls that create a new filesystem object must be hooked after * the object is created. Syscalls that remove a filesystem object * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. */ void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); const struct qstr *dname = &dentry->d_name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; struct audit_entry *e; struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; int i; if (context->context == AUDIT_CTX_UNUSED) return; rcu_read_lock(); list_for_each_entry_rcu(e, list, list) { for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; if (f->type == AUDIT_FSTYPE && audit_comparator(parent->i_sb->s_magic, f->op, f->val) && e->rule.action == AUDIT_NEVER) { rcu_read_unlock(); return; } } } rcu_read_unlock(); if (inode) handle_one(inode); /* look for a parent entry first */ list_for_each_entry(n, &context->names_list, list) { if (!n->name || (n->type != AUDIT_TYPE_PARENT && n->type != AUDIT_TYPE_UNKNOWN)) continue; if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev && !audit_compare_dname_path(dname, n->name->name, n->name_len)) { if (n->type == AUDIT_TYPE_UNKNOWN) n->type = AUDIT_TYPE_PARENT; found_parent = n; break; } } cond_resched(); /* is there a matching child entry? */ list_for_each_entry(n, &context->names_list, list) { /* can only match entries that have a name */ if (!n->name || (n->type != type && n->type != AUDIT_TYPE_UNKNOWN)) continue; if (!strcmp(dname->name, n->name->name) || !audit_compare_dname_path(dname, n->name->name, found_parent ? found_parent->name_len : AUDIT_NAME_FULL)) { if (n->type == AUDIT_TYPE_UNKNOWN) n->type = type; found_child = n; break; } } if (!found_parent) { /* create a new, "anonymous" parent record */ n = audit_alloc_name(context, AUDIT_TYPE_PARENT); if (!n) return; audit_copy_inode(n, NULL, parent, 0); } if (!found_child) { found_child = audit_alloc_name(context, type); if (!found_child) return; /* Re-use the name belonging to the slot for a matching parent * directory. All names for this context are relinquished in * audit_free_names() */ if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; atomic_inc(&found_child->name->refcnt); } } if (inode) audit_copy_inode(found_child, dentry, inode, 0); else found_child->ino = AUDIT_INO_UNSET; } EXPORT_SYMBOL_GPL(__audit_inode_child); /** * auditsc_get_stamp - get local copies of audit_context values * @ctx: audit_context for the task * @t: timespec64 to store time recorded in the audit_context * @serial: serial value that is recorded in the audit_context * * Also sets the context as auditable. */ int auditsc_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (ctx->context == AUDIT_CTX_UNUSED) return 0; if (!ctx->serial) ctx->serial = audit_serial(); t->tv_sec = ctx->ctime.tv_sec; t->tv_nsec = ctx->ctime.tv_nsec; *serial = ctx->serial; if (!ctx->prio) { ctx->prio = 1; ctx->current_state = AUDIT_STATE_RECORD; } return 1; } /** * __audit_mq_open - record audit data for a POSIX MQ open * @oflag: open flag * @mode: mode bits * @attr: queue attributes * */ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { struct audit_context *context = audit_context(); if (attr) memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr)); else memset(&context->mq_open.attr, 0, sizeof(struct mq_attr)); context->mq_open.oflag = oflag; context->mq_open.mode = mode; context->type = AUDIT_MQ_OPEN; } /** * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive * @mqdes: MQ descriptor * @msg_len: Message length * @msg_prio: Message priority * @abs_timeout: Message timeout in absolute time * */ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { struct audit_context *context = audit_context(); struct timespec64 *p = &context->mq_sendrecv.abs_timeout; if (abs_timeout) memcpy(p, abs_timeout, sizeof(*p)); else memset(p, 0, sizeof(*p)); context->mq_sendrecv.mqdes = mqdes; context->mq_sendrecv.msg_len = msg_len; context->mq_sendrecv.msg_prio = msg_prio; context->type = AUDIT_MQ_SENDRECV; } /** * __audit_mq_notify - record audit data for a POSIX MQ notify * @mqdes: MQ descriptor * @notification: Notification event * */ void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { struct audit_context *context = audit_context(); if (notification) context->mq_notify.sigev_signo = notification->sigev_signo; else context->mq_notify.sigev_signo = 0; context->mq_notify.mqdes = mqdes; context->type = AUDIT_MQ_NOTIFY; } /** * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute * @mqdes: MQ descriptor * @mqstat: MQ flags * */ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { struct audit_context *context = audit_context(); context->mq_getsetattr.mqdes = mqdes; context->mq_getsetattr.mqstat = *mqstat; context->type = AUDIT_MQ_GETSETATTR; } /** * __audit_ipc_obj - record audit data for ipc object * @ipcp: ipc permissions * */ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) { struct audit_context *context = audit_context(); context->ipc.uid = ipcp->uid; context->ipc.gid = ipcp->gid; context->ipc.mode = ipcp->mode; context->ipc.has_perm = 0; security_ipc_getlsmprop(ipcp, &context->ipc.oprop); context->type = AUDIT_IPC; } /** * __audit_ipc_set_perm - record audit data for new ipc permissions * @qbytes: msgq bytes * @uid: msgq user id * @gid: msgq group id * @mode: msgq mode (permissions) * * Called only after audit_ipc_obj(). */ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { struct audit_context *context = audit_context(); context->ipc.qbytes = qbytes; context->ipc.perm_uid = uid; context->ipc.perm_gid = gid; context->ipc.perm_mode = mode; context->ipc.has_perm = 1; } void __audit_bprm(struct linux_binprm *bprm) { struct audit_context *context = audit_context(); context->type = AUDIT_EXECVE; context->execve.argc = bprm->argc; } /** * __audit_socketcall - record audit data for sys_socketcall * @nargs: number of args, which should not be more than AUDITSC_ARGS. * @args: args array * */ int __audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = audit_context(); if (nargs <= 0 || nargs > AUDITSC_ARGS || !args) return -EINVAL; context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); return 0; } /** * __audit_fd_pair - record audit data for pipe and socketpair * @fd1: the first file descriptor * @fd2: the second file descriptor * */ void __audit_fd_pair(int fd1, int fd2) { struct audit_context *context = audit_context(); context->fds[0] = fd1; context->fds[1] = fd2; } /** * __audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto * @len: data length in user space * @a: data address in kernel space * * Returns 0 for success or NULL context or < 0 on error. */ int __audit_sockaddr(int len, void *a) { struct audit_context *context = audit_context(); if (!context->sockaddr) { void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); if (!p) return -ENOMEM; context->sockaddr = p; } context->sockaddr_len = len; memcpy(context->sockaddr, a, len); return 0; } void __audit_ptrace(struct task_struct *t) { struct audit_context *context = audit_context(); context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); strscpy(context->target_comm, t->comm); security_task_getlsmprop_obj(t, &context->target_ref); } /** * audit_signal_info_syscall - record signal info for syscalls * @t: task being signaled * * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ int audit_signal_info_syscall(struct task_struct *t) { struct audit_aux_data_pids *axp; struct audit_context *ctx = audit_context(); kuid_t t_uid = task_uid(t); if (!audit_signals || audit_dummy_context()) return 0; /* optimize the common case by putting first signal recipient directly * in audit_context */ if (!ctx->target_pid) { ctx->target_pid = task_tgid_nr(t); ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); strscpy(ctx->target_comm, t->comm); security_task_getlsmprop_obj(t, &ctx->target_ref); return 0; } axp = (void *)ctx->aux_pids; if (!axp || axp->pid_count == AUDIT_AUX_PIDS) { axp = kzalloc(sizeof(*axp), GFP_ATOMIC); if (!axp) return -ENOMEM; axp->d.type = AUDIT_OBJ_PID; axp->d.next = ctx->aux_pids; ctx->aux_pids = (void *)axp; } BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); axp->target_pid[axp->pid_count] = task_tgid_nr(t); axp->target_auid[axp->pid_count] = audit_get_loginuid(t); axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); security_task_getlsmprop_obj(t, &axp->target_ref[axp->pid_count]); strscpy(axp->target_comm[axp->pid_count], t->comm); axp->pid_count++; return 0; } /** * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps * @bprm: pointer to the bprm being processed * @new: the proposed new credentials * @old: the old credentials * * Simply check if the proc already has the caps given by the file and if not * store the priv escalation info for later auditing at the end of the syscall * * -Eric */ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { struct audit_aux_data_bprm_fcaps *ax; struct audit_context *context = audit_context(); struct cpu_vfs_cap_data vcaps; ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; ax->d.type = AUDIT_BPRM_FCAPS; ax->d.next = context->aux; context->aux = (void *)ax; get_vfs_caps_from_disk(&nop_mnt_idmap, bprm->file->f_path.dentry, &vcaps); ax->fcap.permitted = vcaps.permitted; ax->fcap.inheritable = vcaps.inheritable; ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); ax->fcap.rootid = vcaps.rootid; ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; ax->old_pcap.permitted = old->cap_permitted; ax->old_pcap.inheritable = old->cap_inheritable; ax->old_pcap.effective = old->cap_effective; ax->old_pcap.ambient = old->cap_ambient; ax->new_pcap.permitted = new->cap_permitted; ax->new_pcap.inheritable = new->cap_inheritable; ax->new_pcap.effective = new->cap_effective; ax->new_pcap.ambient = new->cap_ambient; return 0; } /** * __audit_log_capset - store information about the arguments to the capset syscall * @new: the new credentials * @old: the old (current) credentials * * Record the arguments userspace sent to sys_capset for later printing by the * audit system if applicable */ void __audit_log_capset(const struct cred *new, const struct cred *old) { struct audit_context *context = audit_context(); context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; context->capset.cap.ambient = new->cap_ambient; context->type = AUDIT_CAPSET; } void __audit_mmap_fd(int fd, int flags) { struct audit_context *context = audit_context(); context->mmap.fd = fd; context->mmap.flags = flags; context->type = AUDIT_MMAP; } void __audit_openat2_how(struct open_how *how) { struct audit_context *context = audit_context(); context->openat2.flags = how->flags; context->openat2.mode = how->mode; context->openat2.resolve = how->resolve; context->type = AUDIT_OPENAT2; } void __audit_log_kern_module(char *name) { struct audit_context *context = audit_context(); context->module.name = kstrdup(name, GFP_KERNEL); if (!context->module.name) audit_log_lost("out of memory in __audit_log_kern_module"); context->type = AUDIT_KERN_MODULE; } void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */ switch (friar->hdr.type) { case FAN_RESPONSE_INFO_NONE: audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2", response, FAN_RESPONSE_INFO_NONE); break; case FAN_RESPONSE_INFO_AUDIT_RULE: audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u", response, friar->hdr.type, friar->rule_number, friar->subj_trust, friar->obj_trust); } } void __audit_tk_injoffset(struct timespec64 offset) { struct audit_context *context = audit_context(); /* only set type if not already set by NTP */ if (!context->type) context->type = AUDIT_TIME_INJOFFSET; memcpy(&context->time.tk_injoffset, &offset, sizeof(offset)); } void __audit_ntp_log(const struct audit_ntp_data *ad) { struct audit_context *context = audit_context(); int type; for (type = 0; type < AUDIT_NTP_NVALS; type++) if (ad->vals[type].newval != ad->vals[type].oldval) { /* unconditionally set type, overwriting TK */ context->type = AUDIT_TIME_ADJNTPVAL; memcpy(&context->time.ntp_data, ad, sizeof(*ad)); break; } } void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { struct audit_buffer *ab; char comm[sizeof(current->comm)]; ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG); if (!ab) return; audit_log_format(ab, "table=%s family=%u entries=%u op=%s", name, af, nentries, audit_nfcfgs[op].s); audit_log_format(ab, " pid=%u", task_tgid_nr(current)); audit_log_task_context(ab); /* subj= */ audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_end(ab); } EXPORT_SYMBOL_GPL(__audit_log_nfcfg); static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; kgid_t gid; unsigned int sessionid; char comm[sizeof(current->comm)]; auid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); current_uid_gid(&uid, &gid); audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); } /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value * * If a process ends with a core dump, something fishy is going on and we * should record the event for investigation. */ void audit_core_dumps(long signr) { struct audit_buffer *ab; if (!audit_enabled) return; if (signr == SIGQUIT) /* don't care for those */ return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_ABEND); if (unlikely(!ab)) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld res=1", signr); audit_log_end(ab); } /** * audit_seccomp - record information about a seccomp action * @syscall: syscall number * @signr: signal value * @code: the seccomp action * * Record the information associated with a seccomp action. Event filtering for * seccomp actions that are not to be logged is done in seccomp_log(). * Therefore, this function forces auditing independent of the audit_enabled * and dummy context state because seccomp actions should be logged even when * audit is not in use. */ void audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SECCOMP); if (unlikely(!ab)) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", signr, syscall_get_arch(current), syscall, in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } void audit_seccomp_actions_logged(const char *names, const char *old_names, int res) { struct audit_buffer *ab; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; audit_log_format(ab, "op=seccomp-logging actions=%s old-actions=%s res=%d", names, old_names, res); audit_log_end(ab); } struct list_head *audit_killed_trees(void) { struct audit_context *ctx = audit_context(); if (likely(!ctx || ctx->context == AUDIT_CTX_UNUSED)) return NULL; return &ctx->killed_trees; }
52 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 // SPDX-License-Identifier: GPL-2.0 /* * sysctl_net_ipv6.c: sysctl interface to net IPV6 subsystem. * * Changes: * YOSHIFUJI Hideaki @USAGI: added icmp sysctl table. */ #include <linux/mm.h> #include <linux/sysctl.h> #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/slab.h> #include <linux/export.h> #include <net/ndisc.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/inet_frag.h> #include <net/netevent.h> #include <net/ip_fib.h> #ifdef CONFIG_NETLABEL #include <net/calipso.h> #endif #include <linux/ioam6.h> static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static u32 rt6_multipath_hash_fields_all_mask = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static u32 ioam6_id_max = IOAM6_DEFAULT_ID; static u64 ioam6_id_wide_max = IOAM6_DEFAULT_ID_WIDE; static int proc_rt6_multipath_hash_policy(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; net = container_of(table->data, struct net, ipv6.sysctl.multipath_hash_policy); ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); return ret; } static int proc_rt6_multipath_hash_fields(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; net = container_of(table->data, struct net, ipv6.sysctl.multipath_hash_fields); ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); return ret; } static struct ctl_table ipv6_table_template[] = { { .procname = "bindv6only", .data = &init_net.ipv6.sysctl.bindv6only, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "anycast_src_echo_reply", .data = &init_net.ipv6.sysctl.anycast_src_echo_reply, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "flowlabel_consistency", .data = &init_net.ipv6.sysctl.flowlabel_consistency, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "auto_flowlabels", .data = &init_net.ipv6.sysctl.auto_flowlabels, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra2 = &auto_flowlabels_max }, { .procname = "fwmark_reflect", .data = &init_net.ipv6.sysctl.fwmark_reflect, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "idgen_retries", .data = &init_net.ipv6.sysctl.idgen_retries, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "idgen_delay", .data = &init_net.ipv6.sysctl.idgen_delay, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "flowlabel_state_ranges", .data = &init_net.ipv6.sysctl.flowlabel_state_ranges, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_nonlocal_bind", .data = &init_net.ipv6.sysctl.ip_nonlocal_bind, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "flowlabel_reflect", .data = &init_net.ipv6.sysctl.flowlabel_reflect, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &flowlabel_reflect_max, }, { .procname = "max_dst_opts_number", .data = &init_net.ipv6.sysctl.max_dst_opts_cnt, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_hbh_opts_number", .data = &init_net.ipv6.sysctl.max_hbh_opts_cnt, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_dst_opts_length", .data = &init_net.ipv6.sysctl.max_dst_opts_len, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_hbh_length", .data = &init_net.ipv6.sysctl.max_hbh_opts_len, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "fib_multipath_hash_policy", .data = &init_net.ipv6.sysctl.multipath_hash_policy, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_rt6_multipath_hash_policy, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "fib_multipath_hash_fields", .data = &init_net.ipv6.sysctl.multipath_hash_fields, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_rt6_multipath_hash_fields, .extra1 = SYSCTL_ONE, .extra2 = &rt6_multipath_hash_fields_all_mask, }, { .procname = "seg6_flowlabel", .data = &init_net.ipv6.sysctl.seg6_flowlabel, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "fib_notify_on_flag_change", .data = &init_net.ipv6.sysctl.fib_notify_on_flag_change, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "ioam6_id", .data = &init_net.ipv6.sysctl.ioam6_id, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra2 = &ioam6_id_max, }, { .procname = "ioam6_id_wide", .data = &init_net.ipv6.sysctl.ioam6_id_wide, .maxlen = sizeof(u64), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra2 = &ioam6_id_wide_max, }, }; static struct ctl_table ipv6_rotable[] = { { .procname = "mld_max_msf", .data = &sysctl_mld_max_msf, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "mld_qrv", .data = &sysctl_mld_qrv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, #ifdef CONFIG_NETLABEL { .procname = "calipso_cache_enable", .data = &calipso_cache_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "calipso_cache_bucket_size", .data = &calipso_cache_bucketsize, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif /* CONFIG_NETLABEL */ }; static int __net_init ipv6_sysctl_net_init(struct net *net) { size_t table_size = ARRAY_SIZE(ipv6_table_template); struct ctl_table *ipv6_table; struct ctl_table *ipv6_route_table; struct ctl_table *ipv6_icmp_table; int err, i; err = -ENOMEM; ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template), GFP_KERNEL); if (!ipv6_table) goto out; /* Update the variables to point into the current struct net */ for (i = 0; i < table_size; i++) ipv6_table[i].data += (void *)net - (void *)&init_net; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) goto out_ipv6_table; ipv6_icmp_table = ipv6_icmp_sysctl_init(net); if (!ipv6_icmp_table) goto out_ipv6_route_table; net->ipv6.sysctl.hdr = register_net_sysctl_sz(net, "net/ipv6", ipv6_table, table_size); if (!net->ipv6.sysctl.hdr) goto out_ipv6_icmp_table; net->ipv6.sysctl.route_hdr = register_net_sysctl_sz(net, "net/ipv6/route", ipv6_route_table, ipv6_route_sysctl_table_size(net)); if (!net->ipv6.sysctl.route_hdr) goto out_unregister_ipv6_table; net->ipv6.sysctl.icmp_hdr = register_net_sysctl_sz(net, "net/ipv6/icmp", ipv6_icmp_table, ipv6_icmp_sysctl_table_size()); if (!net->ipv6.sysctl.icmp_hdr) goto out_unregister_route_table; err = 0; out: return err; out_unregister_route_table: unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr); out_unregister_ipv6_table: unregister_net_sysctl_table(net->ipv6.sysctl.hdr); out_ipv6_icmp_table: kfree(ipv6_icmp_table); out_ipv6_route_table: kfree(ipv6_route_table); out_ipv6_table: kfree(ipv6_table); goto out; } static void __net_exit ipv6_sysctl_net_exit(struct net *net) { const struct ctl_table *ipv6_table; const struct ctl_table *ipv6_route_table; const struct ctl_table *ipv6_icmp_table; ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg; ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg; ipv6_icmp_table = net->ipv6.sysctl.icmp_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.icmp_hdr); unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr); unregister_net_sysctl_table(net->ipv6.sysctl.hdr); kfree(ipv6_table); kfree(ipv6_route_table); kfree(ipv6_icmp_table); } static struct pernet_operations ipv6_sysctl_net_ops = { .init = ipv6_sysctl_net_init, .exit = ipv6_sysctl_net_exit, }; static struct ctl_table_header *ip6_header; int ipv6_sysctl_register(void) { int err = -ENOMEM; ip6_header = register_net_sysctl(&init_net, "net/ipv6", ipv6_rotable); if (!ip6_header) goto out; err = register_pernet_subsys(&ipv6_sysctl_net_ops); if (err) goto err_pernet; out: return err; err_pernet: unregister_net_sysctl_table(ip6_header); goto out; } void ipv6_sysctl_unregister(void) { unregister_net_sysctl_table(ip6_header); unregister_pernet_subsys(&ipv6_sysctl_net_ops); }
17 17 387 1309 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/writeback.h */ #ifndef WRITEBACK_H #define WRITEBACK_H #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/fs.h> #include <linux/flex_proportions.h> #include <linux/backing-dev-defs.h> #include <linux/blk_types.h> #include <linux/pagevec.h> struct bio; DECLARE_PER_CPU(int, dirty_throttle_leaks); /* * The global dirty threshold is normally equal to the global dirty limit, * except when the system suddenly allocates a lot of anonymous memory and * knocks down the global dirty threshold quickly, in which case the global * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. */ #define DIRTY_SCOPE 8 struct backing_dev_info; /* * fs/fs-writeback.c */ enum writeback_sync_modes { WB_SYNC_NONE, /* Don't wait on anything */ WB_SYNC_ALL, /* Wait on every mapping */ }; /* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. They are always initialised * in a manner such that unspecified fields are set to zero. */ struct writeback_control { /* public fields that can be set and/or consumed by the caller: */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ /* * For a_ops->writepages(): if start or end are non-zero then this is * a hint that the filesystem need only write out the pages inside that * byterange. The byte at `end' is included in the writeout request. */ loff_t range_start; loff_t range_end; enum writeback_sync_modes sync_mode; unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_background:1; /* A background writeback */ unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned unpinned_netfs_wb:1; /* Cleared I_PINNING_NETFS_WB */ /* * When writeback IOs are bounced through async layers, only the * initial synchronous phase should be accounted towards inode * cgroup ownership arbitration to avoid confusion. Later stages * can set the following flag to disable the accounting. */ unsigned no_cgroup_owner:1; /* To enable batching of swap writes to non-block-device backends, * "plug" can be set point to a 'struct swap_iocb *'. When all swap * writes have been submitted, if with swap_iocb is not NULL, * swap_write_unplug() should be called. */ struct swap_iocb **swap_plug; /* Target list for splitting a large folio */ struct list_head *list; /* internal fields used by the ->writepages implementation: */ struct folio_batch fbatch; pgoff_t index; int saved_err; #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ /* foreign inode detection, see wbc_detach_inode() */ int wb_id; /* current wb id */ int wb_lcand_id; /* last foreign candidate wb id */ int wb_tcand_id; /* this foreign candidate wb id */ size_t wb_bytes; /* bytes written by current wb */ size_t wb_lcand_bytes; /* bytes written by last candidate */ size_t wb_tcand_bytes; /* bytes written by this candidate */ #endif }; static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc) { blk_opf_t flags = 0; if (wbc->sync_mode == WB_SYNC_ALL) flags |= REQ_SYNC; else if (wbc->for_kupdate || wbc->for_background) flags |= REQ_BACKGROUND; return flags; } #ifdef CONFIG_CGROUP_WRITEBACK #define wbc_blkcg_css(wbc) \ ((wbc)->wb ? (wbc)->wb->blkcg_css : blkcg_root_css) #else #define wbc_blkcg_css(wbc) (blkcg_root_css) #endif /* CONFIG_CGROUP_WRITEBACK */ /* * A wb_domain represents a domain that wb's (bdi_writeback's) belong to * and are measured against each other in. There always is one global * domain, global_wb_domain, that every wb in the system is a member of. * This allows measuring the relative bandwidth of each wb to distribute * dirtyable memory accordingly. */ struct wb_domain { spinlock_t lock; /* * Scale the writeback cache size proportional to the relative * writeout speed. * * We do this by keeping a floating proportion between BDIs, based * on page writeback completions [end_page_writeback()]. Those * devices that write out pages fastest will get the larger share, * while the slower will get a smaller share. * * We use page writeout completions because we are interested in * getting rid of dirty pages. Having them written out is the * primary goal. * * We introduce a concept of time, a period over which we measure * these events, because demand can/will vary over time. The length * of this period itself is measured in page writeback completions. */ struct fprop_global completions; struct timer_list period_timer; /* timer for aging of completions */ unsigned long period_time; /* * The dirtyable memory and dirty threshold could be suddenly * knocked down by a large amount (eg. on the startup of KVM in a * swapless system). This may throw the system into deep dirty * exceeded state and throttle heavy/light dirtiers alike. To * retain good responsiveness, maintain global_dirty_limit for * tracking slowly down to the knocked down dirty threshold. * * Both fields are protected by ->lock. */ unsigned long dirty_limit_tstamp; unsigned long dirty_limit; }; /** * wb_domain_size_changed - memory available to a wb_domain has changed * @dom: wb_domain of interest * * This function should be called when the amount of memory available to * @dom has changed. It resets @dom's dirty limit parameters to prevent * the past values which don't match the current configuration from skewing * dirty throttling. Without this, when memory size of a wb_domain is * greatly reduced, the dirty throttling logic may allow too many pages to * be dirtied leading to consecutive unnecessary OOMs and may get stuck in * that situation. */ static inline void wb_domain_size_changed(struct wb_domain *dom) { spin_lock(&dom->lock); dom->dirty_limit_tstamp = jiffies; dom->dirty_limit = 0; spin_unlock(&dom->lock); } /* * fs/fs-writeback.c */ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) { wait_var_event(inode_state_wait_address(inode, __I_NEW), !(READ_ONCE(inode->i_state) & I_NEW)); } #ifdef CONFIG_CGROUP_WRITEBACK #include <linux/cgroup.h> #include <linux/bio.h> void __inode_attach_wb(struct inode *inode, struct folio *folio); void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes); int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(struct super_block *sb); bool cleanup_offline_cgwb(struct bdi_writeback *wb); /** * inode_attach_wb - associate an inode with its wb * @inode: inode of interest * @folio: folio being dirtied (may be NULL) * * If @inode doesn't have its wb, associate it with the wb matching the * memcg of @folio or, if @folio is NULL, %current. May be called w/ or w/o * @inode->i_lock. */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { if (!inode->i_wb) __inode_attach_wb(inode, folio); } /** * inode_detach_wb - disassociate an inode from its wb * @inode: inode of interest * * @inode is being freed. Detach from its wb. */ static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } } void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode); /** * wbc_init_bio - writeback specific initializtion of bio * @wbc: writeback_control for the writeback in progress * @bio: bio to be initialized * * @bio is a part of the writeback in progress controlled by @wbc. Perform * writeback specific initialization. This is used to apply the cgroup * writeback context. Must be called after the bio has been associated with * a device. */ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { /* * pageout() path doesn't attach @wbc to the inode being written * out. This is intentional as we don't want the function to block * behind a slow cgroup. Ultimately, we want pageout() to kick off * regular writeback instead of writing things out itself. */ if (wbc->wb) bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { } static inline void inode_detach_wb(struct inode *inode) { } static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) { } static inline void wbc_detach_inode(struct writeback_control *wbc) { } static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { } static inline void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes) { } static inline void cgroup_writeback_umount(struct super_block *sb) { } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * mm/page-writeback.c */ void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); void laptop_mode_timer_fn(struct timer_list *t); bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom); #endif extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; extern unsigned int dirtytime_expire_interval; extern int laptop_mode; int dirtytime_interval_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); unsigned long cgwb_calc_thresh(struct bdi_writeback *wb); void wb_update_bandwidth(struct bdi_writeback *wb); /* Invoke balance dirty pages in async mode. */ #define BDP_ASYNC 0x0001 void balance_dirty_pages_ratelimited(struct address_space *mapping); int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, unsigned int flags); bool wb_over_bg_thresh(struct bdi_writeback *wb); struct folio *writeback_iter(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio, int *error); typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, void *data); int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio); bool folio_redirty_for_writepage(struct writeback_control *, struct folio *); bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); #endif /* WRITEBACK_H */
2 35 34 2 4 3 30 3 2 2 3 3 27 35 34 1 84 63 1 63 63 63 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 // SPDX-License-Identifier: GPL-2.0 #include <linux/anon_inodes.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/cgroup.h> #include <linux/magic.h> #include <linux/mount.h> #include <linux/pid.h> #include <linux/pidfs.h> #include <linux/pid_namespace.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/pseudo_fs.h> #include <linux/ptrace.h> #include <linux/seq_file.h> #include <uapi/linux/pidfd.h> #include <linux/ipc_namespace.h> #include <linux/time_namespace.h> #include <linux/utsname.h> #include <net/net_namespace.h> #include "internal.h" #include "mount.h" #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd * @m: proc fdinfo file * @f: file referencing a pidfd * * Pid: * This function will print the pid that a given pidfd refers to in the * pid namespace of the procfs instance. * If the pid namespace of the process is not a descendant of the pid * namespace of the procfs instance 0 will be shown as its pid. This is * similar to calling getppid() on a process whose parent is outside of * its pid namespace. * * NSpid: * If pid namespaces are supported then this function will also print * the pid of a given pidfd refers to for all descendant pid namespaces * starting from the current pid namespace of the instance, i.e. the * Pid field and the first entry in the NSpid field will be identical. * If the pid namespace of the process is not a descendant of the pid * namespace of the procfs instance 0 will be shown as its first NSpid * entry and no others will be shown. * Note that this differs from the Pid and NSpid fields in * /proc/<pid>/status where Pid and NSpid are always shown relative to * the pid namespace of the procfs instance. The difference becomes * obvious when sending around a pidfd between pid namespaces from a * different branch of the tree, i.e. where no ancestral relation is * present between the pid namespaces: * - create two new pid namespaces ns1 and ns2 in the initial pid * namespace (also take care to create new mount namespaces in the * new pid namespace and mount procfs) * - create a process with a pidfd in ns1 * - send pidfd from ns1 to ns2 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid * have exactly one entry, which is 0 */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { struct pid *pid = pidfd_pid(f); struct pid_namespace *ns; pid_t nr = -1; if (likely(pid_has_task(pid, PIDTYPE_PID))) { ns = proc_pid_ns(file_inode(m->file)->i_sb); nr = pid_nr_ns(pid, ns); } seq_put_decimal_ll(m, "Pid:\t", nr); #ifdef CONFIG_PID_NS seq_put_decimal_ll(m, "\nNSpid:\t", nr); if (nr > 0) { int i; /* If nr is non-zero it means that 'pid' is valid and that * ns, i.e. the pid namespace associated with the procfs * instance, is in the pid namespace hierarchy of pid. * Start at one below the already printed level. */ for (i = ns->level + 1; i <= pid->level; i++) seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); } #endif seq_putc(m, '\n'); } #endif /* * Poll support for process exit notification. */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct pid *pid = pidfd_pid(file); bool thread = file->f_flags & PIDFD_THREAD; struct task_struct *task; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); /* * Depending on PIDFD_THREAD, inform pollers when the thread * or the whole thread-group exits. */ guard(rcu)(); task = pid_task(pid, PIDTYPE_PID); if (!task) poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; else if (task->exit_state && (thread || thread_group_empty(task))) poll_flags = EPOLLIN | EPOLLRDNORM; return poll_flags; } static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; struct user_namespace *user_ns; const struct cred *c; __u64 mask; #ifdef CONFIG_CGROUPS struct cgroup *cgrp; #endif if (!uinfo) return -EINVAL; if (usize < PIDFD_INFO_SIZE_VER0) return -EINVAL; /* First version, no smaller struct possible */ if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) return -EFAULT; c = get_task_cred(task); if (!c) return -ESRCH; /* Unconditionally return identifiers and credentials, the rest only on request */ user_ns = current_user_ns(); kinfo.ruid = from_kuid_munged(user_ns, c->uid); kinfo.rgid = from_kgid_munged(user_ns, c->gid); kinfo.euid = from_kuid_munged(user_ns, c->euid); kinfo.egid = from_kgid_munged(user_ns, c->egid); kinfo.suid = from_kuid_munged(user_ns, c->suid); kinfo.sgid = from_kgid_munged(user_ns, c->sgid); kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid); kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid); kinfo.mask |= PIDFD_INFO_CREDS; put_cred(c); #ifdef CONFIG_CGROUPS rcu_read_lock(); cgrp = task_dfl_cgroup(task); kinfo.cgroupid = cgroup_id(cgrp); kinfo.mask |= PIDFD_INFO_CGROUPID; rcu_read_unlock(); #endif /* * Copy pid/tgid last, to reduce the chances the information might be * stale. Note that it is not possible to ensure it will be valid as the * task might return as soon as the copy_to_user finishes, but that's ok * and userspace expects that might happen and can act accordingly, so * this is just best-effort. What we can do however is checking that all * the fields are set correctly, or return ESRCH to avoid providing * incomplete information. */ kinfo.ppid = task_ppid_nr_ns(task, NULL); kinfo.tgid = task_tgid_vnr(task); kinfo.pid = task_pid_vnr(task); kinfo.mask |= PIDFD_INFO_PID; if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) return -ESRCH; /* * If userspace and the kernel have the same struct size it can just * be copied. If userspace provides an older struct, only the bits that * userspace knows about will be copied. If userspace provides a new * struct, only the bits that the kernel knows about will be copied. */ if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo)))) return -EFAULT; return 0; } static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct task_struct *task __free(put_task) = NULL; struct nsproxy *nsp __free(put_nsproxy) = NULL; struct pid *pid = pidfd_pid(file); struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) return pidfd_info(task, cmd, arg); if (arg) return -EINVAL; scoped_guard(task_lock, task) { nsp = task->nsproxy; if (nsp) get_nsproxy(nsp); } if (!nsp) return -ESRCH; /* just pretend it didn't exist */ /* * We're trying to open a file descriptor to the namespace so perform a * filesystem cred ptrace check. Also, we mirror nsfs behavior. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) return -EACCES; switch (cmd) { /* Namespaces that hang of nsproxy. */ case PIDFD_GET_CGROUP_NAMESPACE: if (IS_ENABLED(CONFIG_CGROUPS)) { get_cgroup_ns(nsp->cgroup_ns); ns_common = to_ns_common(nsp->cgroup_ns); } break; case PIDFD_GET_IPC_NAMESPACE: if (IS_ENABLED(CONFIG_IPC_NS)) { get_ipc_ns(nsp->ipc_ns); ns_common = to_ns_common(nsp->ipc_ns); } break; case PIDFD_GET_MNT_NAMESPACE: get_mnt_ns(nsp->mnt_ns); ns_common = to_ns_common(nsp->mnt_ns); break; case PIDFD_GET_NET_NAMESPACE: if (IS_ENABLED(CONFIG_NET_NS)) { ns_common = to_ns_common(nsp->net_ns); get_net_ns(ns_common); } break; case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: if (IS_ENABLED(CONFIG_PID_NS)) { get_pid_ns(nsp->pid_ns_for_children); ns_common = to_ns_common(nsp->pid_ns_for_children); } break; case PIDFD_GET_TIME_NAMESPACE: if (IS_ENABLED(CONFIG_TIME_NS)) { get_time_ns(nsp->time_ns); ns_common = to_ns_common(nsp->time_ns); } break; case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: if (IS_ENABLED(CONFIG_TIME_NS)) { get_time_ns(nsp->time_ns_for_children); ns_common = to_ns_common(nsp->time_ns_for_children); } break; case PIDFD_GET_UTS_NAMESPACE: if (IS_ENABLED(CONFIG_UTS_NS)) { get_uts_ns(nsp->uts_ns); ns_common = to_ns_common(nsp->uts_ns); } break; /* Namespaces that don't hang of nsproxy. */ case PIDFD_GET_USER_NAMESPACE: if (IS_ENABLED(CONFIG_USER_NS)) { rcu_read_lock(); ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); rcu_read_unlock(); } break; case PIDFD_GET_PID_NAMESPACE: if (IS_ENABLED(CONFIG_PID_NS)) { rcu_read_lock(); pid_ns = task_active_pid_ns(task); if (pid_ns) ns_common = to_ns_common(get_pid_ns(pid_ns)); rcu_read_unlock(); } break; default: return -ENOIOCTLCMD; } if (!ns_common) return -EOPNOTSUPP; /* open_namespace() unconditionally consumes the reference */ return open_namespace(ns_common); } static const struct file_operations pidfs_file_operations = { .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, #endif .unlocked_ioctl = pidfd_ioctl, .compat_ioctl = compat_ptr_ioctl, }; struct pid *pidfd_pid(const struct file *file) { if (file->f_op != &pidfs_file_operations) return ERR_PTR(-EBADF); return file_inode(file)->i_private; } static struct vfsmount *pidfs_mnt __ro_after_init; #if BITS_PER_LONG == 32 /* * Provide a fallback mechanism for 32-bit systems so processes remain * reliably comparable by inode number even on those systems. */ static DEFINE_IDA(pidfd_inum_ida); static int pidfs_inum(struct pid *pid, unsigned long *ino) { int ret; ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, UINT_MAX, GFP_ATOMIC); if (ret < 0) return -ENOSPC; *ino = ret; return 0; } static inline void pidfs_free_inum(unsigned long ino) { if (ino > 0) ida_free(&pidfd_inum_ida, ino); } #else static inline int pidfs_inum(struct pid *pid, unsigned long *ino) { *ino = pid->ino; return 0; } #define pidfs_free_inum(ino) ((void)(ino)) #endif /* * The vfs falls back to simple_setattr() if i_op->setattr() isn't * implemented. Let's reject it completely until we have a clean * permission concept for pidfds. */ static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } /* * User space expects pidfs inodes to have no file type in st_mode. * * In particular, 'lsof' has this legacy logic: * * type = s->st_mode & S_IFMT; * switch (type) { * ... * case 0: * if (!strcmp(p, "anon_inode")) * Lf->ntype = Ntype = N_ANON_INODE; * * to detect our old anon_inode logic. * * Rather than mess with our internal sane inode data, just fix it * up here in getattr() by masking off the format bits. */ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->mode &= ~S_IFMT; return 0; } static const struct inode_operations pidfs_inode_operations = { .getattr = pidfs_getattr, .setattr = pidfs_setattr, }; static void pidfs_evict_inode(struct inode *inode) { struct pid *pid = inode->i_private; clear_inode(inode); put_pid(pid); pidfs_free_inum(inode->i_ino); } static const struct super_operations pidfs_sops = { .drop_inode = generic_delete_inode, .evict_inode = pidfs_evict_inode, .statfs = simple_statfs, }; /* * 'lsof' has knowledge of out historical anon_inode use, and expects * the pidfs dentry name to start with 'anon_inode'. */ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); } static const struct dentry_operations pidfs_dentry_operations = { .d_delete = always_delete_dentry, .d_dname = pidfs_dname, .d_prune = stashed_dentry_prune, }; static int pidfs_init_inode(struct inode *inode, void *data) { inode->i_private = data; inode->i_flags |= S_PRIVATE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; /* * Inode numbering for pidfs start at RESERVED_PIDS + 1. This * avoids collisions with the root inode which is 1 for pseudo * filesystems. */ return pidfs_inum(data, &inode->i_ino); } static void pidfs_put_data(void *data) { struct pid *pid = data; put_pid(pid); } static const struct stashed_operations pidfs_stashed_ops = { .init_inode = pidfs_init_inode, .put_data = pidfs_put_data, }; static int pidfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx; ctx = init_pseudo(fc, PID_FS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &pidfs_sops; ctx->dops = &pidfs_dentry_operations; fc->s_fs_info = (void *)&pidfs_stashed_ops; return 0; } static struct file_system_type pidfs_type = { .name = "pidfs", .init_fs_context = pidfs_init_fs_context, .kill_sb = kill_anon_super, }; struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) { struct file *pidfd_file; struct path path; int ret; ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); if (ret < 0) return ERR_PTR(ret); pidfd_file = dentry_open(&path, flags, current_cred()); path_put(&path); return pidfd_file; } void __init pidfs_init(void) { pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); }
244 219 254 221 91 91 4 221 250 220 250 253 152 140 139 4 236 5 5 238 243 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 Intel Corp. * * This file is part of the SCTP kernel implementation * * The base lksctp header. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Xingang Guo <xingang.guo@intel.com> * Jon Grimm <jgrimm@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Ryan Layer <rmlayer@us.ibm.com> * Kevin Gao <kevin.gao@intel.com> */ #ifndef __net_sctp_h__ #define __net_sctp_h__ /* Header Strategy. * Start getting some control over the header file dependencies: * includes * constants * structs * prototypes * macros, externs, and inlines * * Move test_frame specific items out of the kernel headers * and into the test frame headers. This is not perfect in any sense * and will continue to evolve. */ #include <linux/types.h> #include <linux/slab.h> #include <linux/in.h> #include <linux/tty.h> #include <linux/proc_fs.h> #include <linux/spinlock.h> #include <linux/jiffies.h> #include <linux/idr.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ip6_route.h> #endif #include <linux/uaccess.h> #include <asm/page.h> #include <net/sock.h> #include <net/snmp.h> #include <net/sctp/structs.h> #include <net/sctp/constants.h> #ifdef CONFIG_IP_SCTP_MODULE #define SCTP_PROTOSW_FLAG 0 #else /* static! */ #define SCTP_PROTOSW_FLAG INET_PROTOSW_PERMANENT #endif /* * Function declarations. */ /* * sctp/protocol.c */ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *addr, enum sctp_scope, gfp_t gfp, int flags); struct sctp_pf *sctp_get_pf_specific(sa_family_t family); int sctp_register_pf(struct sctp_pf *, sa_family_t); void sctp_addr_wq_mgmt(struct net *, struct sctp_sockaddr_entry *, int); int sctp_udp_sock_start(struct net *net); void sctp_udp_sock_stop(struct net *net); /* * sctp/socket.c */ int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb); int sctp_inet_listen(struct socket *sock, int backlog); void sctp_write_space(struct sock *sk); void sctp_data_ready(struct sock *sk); __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait); void sctp_sock_rfree(struct sk_buff *skb); void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct sctp_association *asoc); extern struct percpu_counter sctp_sockets_allocated; int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *); struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int *); typedef int (*sctp_callback_t)(struct sctp_endpoint *, struct sctp_transport *, void *); void sctp_transport_walk_start(struct rhashtable_iter *iter); void sctp_transport_walk_stop(struct rhashtable_iter *iter); struct sctp_transport *sctp_transport_get_next(struct net *net, struct rhashtable_iter *iter); struct sctp_transport *sctp_transport_get_idx(struct net *net, struct rhashtable_iter *iter, int pos); int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, void *p, int dif); int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done, struct net *net, int *pos, void *p); int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p); int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, struct sctp_info *info); /* * sctp/primitive.c */ int sctp_primitive_ASSOCIATE(struct net *, struct sctp_association *, void *arg); int sctp_primitive_SHUTDOWN(struct net *, struct sctp_association *, void *arg); int sctp_primitive_ABORT(struct net *, struct sctp_association *, void *arg); int sctp_primitive_SEND(struct net *, struct sctp_association *, void *arg); int sctp_primitive_REQUESTHEARTBEAT(struct net *, struct sctp_association *, void *arg); int sctp_primitive_ASCONF(struct net *, struct sctp_association *, void *arg); int sctp_primitive_RECONF(struct net *net, struct sctp_association *asoc, void *arg); /* * sctp/input.c */ int sctp_rcv(struct sk_buff *skb); int sctp_v4_err(struct sk_buff *skb, u32 info); int sctp_hash_endpoint(struct sctp_endpoint *ep); void sctp_unhash_endpoint(struct sctp_endpoint *); struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *, struct sctphdr *, struct sctp_association **, struct sctp_transport **); void sctp_err_finish(struct sock *, struct sctp_transport *); int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb); int sctp_udp_v6_err(struct sock *sk, struct sk_buff *skb); void sctp_icmp_frag_needed(struct sock *, struct sctp_association *, struct sctp_transport *t, __u32 pmtu); void sctp_icmp_redirect(struct sock *, struct sctp_transport *, struct sk_buff *); void sctp_icmp_proto_unreachable(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t); int sctp_transport_hashtable_init(void); void sctp_transport_hashtable_destroy(void); int sctp_hash_transport(struct sctp_transport *t); void sctp_unhash_transport(struct sctp_transport *t); struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif); struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr); bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif); /* * sctp/proc.c */ int __net_init sctp_proc_init(struct net *net); /* * sctp/offload.c */ int sctp_offload_init(void); /* * sctp/stream_sched.c */ void sctp_sched_ops_init(void); /* * sctp/stream.c */ int sctp_send_reset_streams(struct sctp_association *asoc, struct sctp_reset_streams *params); int sctp_send_reset_assoc(struct sctp_association *asoc); int sctp_send_add_streams(struct sctp_association *asoc, struct sctp_add_streams *params); /* * Module global variables */ /* * sctp/protocol.c */ extern struct kmem_cache *sctp_chunk_cachep __read_mostly; extern struct kmem_cache *sctp_bucket_cachep __read_mostly; extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; extern int sysctl_sctp_wmem[3]; /* * Section: Macros, externs, and inlines */ /* SCTP SNMP MIB stats handlers */ #define SCTP_INC_STATS(net, field) SNMP_INC_STATS((net)->sctp.sctp_statistics, field) #define __SCTP_INC_STATS(net, field) __SNMP_INC_STATS((net)->sctp.sctp_statistics, field) #define SCTP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->sctp.sctp_statistics, field) /* sctp mib definitions */ enum { SCTP_MIB_NUM = 0, SCTP_MIB_CURRESTAB, /* CurrEstab */ SCTP_MIB_ACTIVEESTABS, /* ActiveEstabs */ SCTP_MIB_PASSIVEESTABS, /* PassiveEstabs */ SCTP_MIB_ABORTEDS, /* Aborteds */ SCTP_MIB_SHUTDOWNS, /* Shutdowns */ SCTP_MIB_OUTOFBLUES, /* OutOfBlues */ SCTP_MIB_CHECKSUMERRORS, /* ChecksumErrors */ SCTP_MIB_OUTCTRLCHUNKS, /* OutCtrlChunks */ SCTP_MIB_OUTORDERCHUNKS, /* OutOrderChunks */ SCTP_MIB_OUTUNORDERCHUNKS, /* OutUnorderChunks */ SCTP_MIB_INCTRLCHUNKS, /* InCtrlChunks */ SCTP_MIB_INORDERCHUNKS, /* InOrderChunks */ SCTP_MIB_INUNORDERCHUNKS, /* InUnorderChunks */ SCTP_MIB_FRAGUSRMSGS, /* FragUsrMsgs */ SCTP_MIB_REASMUSRMSGS, /* ReasmUsrMsgs */ SCTP_MIB_OUTSCTPPACKS, /* OutSCTPPacks */ SCTP_MIB_INSCTPPACKS, /* InSCTPPacks */ SCTP_MIB_T1_INIT_EXPIREDS, SCTP_MIB_T1_COOKIE_EXPIREDS, SCTP_MIB_T2_SHUTDOWN_EXPIREDS, SCTP_MIB_T3_RTX_EXPIREDS, SCTP_MIB_T4_RTO_EXPIREDS, SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS, SCTP_MIB_DELAY_SACK_EXPIREDS, SCTP_MIB_AUTOCLOSE_EXPIREDS, SCTP_MIB_T1_RETRANSMITS, SCTP_MIB_T3_RETRANSMITS, SCTP_MIB_PMTUD_RETRANSMITS, SCTP_MIB_FAST_RETRANSMITS, SCTP_MIB_IN_PKT_SOFTIRQ, SCTP_MIB_IN_PKT_BACKLOG, SCTP_MIB_IN_PKT_DISCARDS, SCTP_MIB_IN_DATA_CHUNK_DISCARDS, __SCTP_MIB_MAX }; #define SCTP_MIB_MAX __SCTP_MIB_MAX struct sctp_mib { unsigned long mibs[SCTP_MIB_MAX]; }; /* helper function to track stats about max rto and related transport */ static inline void sctp_max_rto(struct sctp_association *asoc, struct sctp_transport *trans) { if (asoc->stats.max_obs_rto < (__u64)trans->rto) { asoc->stats.max_obs_rto = trans->rto; memset(&asoc->stats.obs_rto_ipaddr, 0, sizeof(struct sockaddr_storage)); memcpy(&asoc->stats.obs_rto_ipaddr, &trans->ipaddr, trans->af_specific->sockaddr_len); } } /* * Macros for keeping a global reference of object allocations. */ #ifdef CONFIG_SCTP_DBG_OBJCNT extern atomic_t sctp_dbg_objcnt_sock; extern atomic_t sctp_dbg_objcnt_ep; extern atomic_t sctp_dbg_objcnt_assoc; extern atomic_t sctp_dbg_objcnt_transport; extern atomic_t sctp_dbg_objcnt_chunk; extern atomic_t sctp_dbg_objcnt_bind_addr; extern atomic_t sctp_dbg_objcnt_bind_bucket; extern atomic_t sctp_dbg_objcnt_addr; extern atomic_t sctp_dbg_objcnt_datamsg; extern atomic_t sctp_dbg_objcnt_keys; /* Macros to atomically increment/decrement objcnt counters. */ #define SCTP_DBG_OBJCNT_INC(name) \ atomic_inc(&sctp_dbg_objcnt_## name) #define SCTP_DBG_OBJCNT_DEC(name) \ atomic_dec(&sctp_dbg_objcnt_## name) #define SCTP_DBG_OBJCNT(name) \ atomic_t sctp_dbg_objcnt_## name = ATOMIC_INIT(0) /* Macro to help create new entries in the global array of * objcnt counters. */ #define SCTP_DBG_OBJCNT_ENTRY(name) \ {.label= #name, .counter= &sctp_dbg_objcnt_## name} void sctp_dbg_objcnt_init(struct net *); #else #define SCTP_DBG_OBJCNT_INC(name) #define SCTP_DBG_OBJCNT_DEC(name) static inline void sctp_dbg_objcnt_init(struct net *net) { return; } #endif /* CONFIG_SCTP_DBG_OBJCOUNT */ #if defined CONFIG_SYSCTL void sctp_sysctl_register(void); void sctp_sysctl_unregister(void); int sctp_sysctl_net_register(struct net *net); void sctp_sysctl_net_unregister(struct net *net); #else static inline void sctp_sysctl_register(void) { return; } static inline void sctp_sysctl_unregister(void) { return; } static inline int sctp_sysctl_net_register(struct net *net) { return 0; } static inline void sctp_sysctl_net_unregister(struct net *net) { return; } #endif /* Size of Supported Address Parameter for 'x' address types. */ #define SCTP_SAT_LEN(x) (sizeof(struct sctp_paramhdr) + (x) * sizeof(__u16)) #if IS_ENABLED(CONFIG_IPV6) void sctp_v6_pf_init(void); void sctp_v6_pf_exit(void); int sctp_v6_protosw_init(void); void sctp_v6_protosw_exit(void); int sctp_v6_add_protocol(void); void sctp_v6_del_protocol(void); #else /* #ifdef defined(CONFIG_IPV6) */ static inline void sctp_v6_pf_init(void) { return; } static inline void sctp_v6_pf_exit(void) { return; } static inline int sctp_v6_protosw_init(void) { return 0; } static inline void sctp_v6_protosw_exit(void) { return; } static inline int sctp_v6_add_protocol(void) { return 0; } static inline void sctp_v6_del_protocol(void) { return; } #endif /* #if defined(CONFIG_IPV6) */ /* Map an association to an assoc_id. */ static inline sctp_assoc_t sctp_assoc2id(const struct sctp_association *asoc) { return asoc ? asoc->assoc_id : 0; } static inline enum sctp_sstat_state sctp_assoc_to_state(const struct sctp_association *asoc) { /* SCTP's uapi always had SCTP_EMPTY(=0) as a dummy state, but we * got rid of it in kernel space. Therefore SCTP_CLOSED et al * start at =1 in user space, but actually as =0 in kernel space. * Now that we can not break user space and SCTP_EMPTY is exposed * there, we need to fix it up with an ugly offset not to break * applications. :( */ return asoc->state + 1; } /* Look up the association by its id. */ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id); int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp); /* A macro to walk a list of skbs. */ #define sctp_skb_for_each(pos, head, tmp) \ skb_queue_walk_safe(head, pos, tmp) /** * sctp_list_dequeue - remove from the head of the queue * @list: list to dequeue from * * Remove the head of the list. The head item is * returned or %NULL if the list is empty. */ static inline struct list_head *sctp_list_dequeue(struct list_head *list) { struct list_head *result = NULL; if (!list_empty(list)) { result = list->next; list_del_init(result); } return result; } /* SCTP version of skb_set_owner_r. We need this one because * of the way we have to do receive buffer accounting on bundled * chunks. */ static inline void sctp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { struct sctp_ulpevent *event = sctp_skb2event(skb); skb_orphan(skb); skb->sk = sk; skb->destructor = sctp_sock_rfree; atomic_add(event->rmem_len, &sk->sk_rmem_alloc); /* * This mimics the behavior of skb_set_owner_r */ sk_mem_charge(sk, event->rmem_len); } /* Tests if the list has one and only one entry. */ static inline int sctp_list_single_entry(struct list_head *head) { return list_is_singular(head); } static inline bool sctp_chunk_pending(const struct sctp_chunk *chunk) { return !list_empty(&chunk->list); } /* Walk through a list of TLV parameters. Don't trust the * individual parameter lengths and instead depend on * the chunk length to indicate when to stop. Make sure * there is room for a param header too. */ #define sctp_walk_params(pos, chunk)\ _sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length)) #define _sctp_walk_params(pos, chunk, end)\ for (pos.v = (u8 *)(chunk + 1);\ (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <=\ (void *)chunk + end) &&\ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\ ntohs(pos.p->length) >= sizeof(struct sctp_paramhdr);\ pos.v += SCTP_PAD4(ntohs(pos.p->length))) #define sctp_walk_errors(err, chunk_hdr)\ _sctp_walk_errors((err), (chunk_hdr), ntohs((chunk_hdr)->length)) #define _sctp_walk_errors(err, chunk_hdr, end)\ for (err = (struct sctp_errhdr *)((void *)chunk_hdr + \ sizeof(struct sctp_chunkhdr));\ ((void *)err + offsetof(struct sctp_errhdr, length) + sizeof(err->length) <=\ (void *)chunk_hdr + end) &&\ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\ ntohs(err->length) >= sizeof(struct sctp_errhdr); \ err = (struct sctp_errhdr *)((void *)err + SCTP_PAD4(ntohs(err->length)))) #define sctp_walk_fwdtsn(pos, chunk)\ _sctp_walk_fwdtsn((pos), (chunk), ntohs((chunk)->chunk_hdr->length) - sizeof(struct sctp_fwdtsn_chunk)) #define _sctp_walk_fwdtsn(pos, chunk, end)\ for (pos = (void *)(chunk->subh.fwdtsn_hdr + 1);\ (void *)pos <= (void *)(chunk->subh.fwdtsn_hdr + 1) + end - sizeof(struct sctp_fwdtsn_skip);\ pos++) /* External references. */ extern struct proto sctp_prot; extern struct proto sctpv6_prot; void sctp_put_port(struct sock *sk); extern struct idr sctp_assocs_id; extern spinlock_t sctp_assocs_id_lock; /* Static inline functions. */ /* Convert from an IP version number to an Address Family symbol. */ static inline int ipver2af(__u8 ipver) { switch (ipver) { case 4: return AF_INET; case 6: return AF_INET6; default: return 0; } } /* Convert from an address parameter type to an address family. */ static inline int param_type2af(__be16 type) { switch (type) { case SCTP_PARAM_IPV4_ADDRESS: return AF_INET; case SCTP_PARAM_IPV6_ADDRESS: return AF_INET6; default: return 0; } } /* Warning: The following hash functions assume a power of two 'size'. */ /* This is the hash function for the SCTP port hash table. */ static inline int sctp_phashfn(struct net *net, __u16 lport) { return (net_hash_mix(net) + lport) & (sctp_port_hashsize - 1); } /* This is the hash function for the endpoint hash table. */ static inline int sctp_ep_hashfn(struct net *net, __u16 lport) { return (net_hash_mix(net) + lport) & (sctp_ep_hashsize - 1); } #define sctp_for_each_hentry(ep, head) \ hlist_for_each_entry(ep, head, node) /* Is a socket of this style? */ #define sctp_style(sk, style) __sctp_style((sk), (SCTP_SOCKET_##style)) static inline int __sctp_style(const struct sock *sk, enum sctp_socket_type style) { return sctp_sk(sk)->type == style; } /* Is the association in this state? */ #define sctp_state(asoc, state) __sctp_state((asoc), (SCTP_STATE_##state)) static inline int __sctp_state(const struct sctp_association *asoc, enum sctp_state state) { return asoc->state == state; } /* Is the socket in this state? */ #define sctp_sstate(sk, state) __sctp_sstate((sk), (SCTP_SS_##state)) static inline int __sctp_sstate(const struct sock *sk, enum sctp_sock_state state) { return sk->sk_state == state; } /* Map v4-mapped v6 address back to v4 address */ static inline void sctp_v6_map_v4(union sctp_addr *addr) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = addr->v6.sin6_port; addr->v4.sin_addr.s_addr = addr->v6.sin6_addr.s6_addr32[3]; } /* Map v4 address to v4-mapped v6 address */ static inline void sctp_v4_map_v6(union sctp_addr *addr) { __be16 port; port = addr->v4.sin_port; addr->v6.sin6_addr.s6_addr32[3] = addr->v4.sin_addr.s_addr; addr->v6.sin6_port = port; addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_scope_id = 0; addr->v6.sin6_addr.s6_addr32[0] = 0; addr->v6.sin6_addr.s6_addr32[1] = 0; addr->v6.sin6_addr.s6_addr32[2] = htonl(0x0000ffff); } /* The cookie is always 0 since this is how it's used in the * pmtu code. */ static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport *t) { if (t->dst && !dst_check(t->dst, t->dst_cookie)) sctp_transport_dst_release(t); return t->dst; } /* Calculate max payload size given a MTU, or the total overhead if * given MTU is zero */ static inline __u32 __sctp_mtu_payload(const struct sctp_sock *sp, const struct sctp_transport *t, __u32 mtu, __u32 extra) { __u32 overhead = sizeof(struct sctphdr) + extra; if (sp) { overhead += sp->pf->af->net_header_len; if (sp->udp_port && (!t || t->encap_port)) overhead += sizeof(struct udphdr); } else { overhead += sizeof(struct ipv6hdr); } if (WARN_ON_ONCE(mtu && mtu <= overhead)) mtu = overhead; return mtu ? mtu - overhead : overhead; } static inline __u32 sctp_mtu_payload(const struct sctp_sock *sp, __u32 mtu, __u32 extra) { return __sctp_mtu_payload(sp, NULL, mtu, extra); } static inline __u32 sctp_dst_mtu(const struct dst_entry *dst) { return SCTP_TRUNC4(max_t(__u32, dst_mtu(dst), SCTP_DEFAULT_MINSEGMENT)); } static inline bool sctp_transport_pmtu_check(struct sctp_transport *t) { __u32 pmtu = sctp_dst_mtu(t->dst); if (t->pathmtu == pmtu) return true; t->pathmtu = pmtu; return false; } static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize) { return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize); } static inline int sctp_transport_pl_hlen(struct sctp_transport *t) { return __sctp_mtu_payload(sctp_sk(t->asoc->base.sk), t, 0, 0) - sizeof(struct sctphdr); } static inline void sctp_transport_pl_reset(struct sctp_transport *t) { if (t->probe_interval && (t->param_flags & SPP_PMTUD_ENABLE) && (t->state == SCTP_ACTIVE || t->state == SCTP_UNKNOWN)) { if (t->pl.state == SCTP_PL_DISABLED) { t->pl.state = SCTP_PL_BASE; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pl.probe_size = SCTP_BASE_PLPMTU; sctp_transport_reset_probe_timer(t); } } else { if (t->pl.state != SCTP_PL_DISABLED) { if (del_timer(&t->probe_timer)) sctp_transport_put(t); t->pl.state = SCTP_PL_DISABLED; } } } static inline void sctp_transport_pl_update(struct sctp_transport *t) { if (t->pl.state == SCTP_PL_DISABLED) return; t->pl.state = SCTP_PL_BASE; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pl.probe_size = SCTP_BASE_PLPMTU; sctp_transport_reset_probe_timer(t); } static inline bool sctp_transport_pl_enabled(struct sctp_transport *t) { return t->pl.state != SCTP_PL_DISABLED; } static inline bool sctp_newsk_ready(const struct sock *sk) { return sock_flag(sk, SOCK_DEAD) || sk->sk_socket; } static inline void sctp_sock_set_nodelay(struct sock *sk) { lock_sock(sk); sctp_sk(sk)->nodelay = true; release_sock(sk); } #endif /* __net_sctp_h__ */
5 5 5 5 5 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2015 Intel Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #include <linux/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/hci_mon.h> #include <net/bluetooth/mgmt.h> #include "mgmt_util.h" static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie, u16 opcode, u16 len, void *buf) { struct hci_mon_hdr *hdr; struct sk_buff *skb; skb = bt_skb_alloc(6 + len, GFP_ATOMIC); if (!skb) return NULL; put_unaligned_le32(cookie, skb_put(skb, 4)); put_unaligned_le16(opcode, skb_put(skb, 2)); if (buf) skb_put_data(skb, buf, len); __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); hdr->index = index; hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } struct sk_buff *mgmt_alloc_skb(struct hci_dev *hdev, u16 opcode, unsigned int size) { struct sk_buff *skb; skb = alloc_skb(sizeof(struct mgmt_hdr) + size, GFP_KERNEL); if (!skb) return skb; skb_reserve(skb, sizeof(struct mgmt_hdr)); bt_cb(skb)->mgmt.hdev = hdev; bt_cb(skb)->mgmt.opcode = opcode; return skb; } int mgmt_send_event_skb(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk) { struct hci_dev *hdev; struct mgmt_hdr *hdr; int len; if (!skb) return -EINVAL; len = skb->len; hdev = bt_cb(skb)->mgmt.hdev; /* Time stamp */ __net_timestamp(skb); /* Send just the data, without headers, to the monitor */ if (channel == HCI_CHANNEL_CONTROL) hci_send_monitor_ctrl_event(hdev, bt_cb(skb)->mgmt.opcode, skb->data, skb->len, skb_get_ktime(skb), flag, skip_sk); hdr = skb_push(skb, sizeof(*hdr)); hdr->opcode = cpu_to_le16(bt_cb(skb)->mgmt.opcode); if (hdev) hdr->index = cpu_to_le16(hdev->id); else hdr->index = cpu_to_le16(MGMT_INDEX_NONE); hdr->len = cpu_to_le16(len); hci_send_to_channel(channel, skb, flag, skip_sk); kfree_skb(skb); return 0; } int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, void *data, u16 data_len, int flag, struct sock *skip_sk) { struct sk_buff *skb; skb = mgmt_alloc_skb(hdev, event, data_len); if (!skb) return -ENOMEM; if (data) skb_put_data(skb, data, data_len); return mgmt_send_event_skb(channel, skb, flag, skip_sk); } int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) { struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_status *ev; int err; BT_DBG("sock %p, index %u, cmd %u, status %u", sk, index, cmd, status); skb = alloc_skb(sizeof(*hdr) + sizeof(*ev), GFP_KERNEL); if (!skb) return -ENOMEM; hdr = skb_put(skb, sizeof(*hdr)); hdr->opcode = cpu_to_le16(MGMT_EV_CMD_STATUS); hdr->index = cpu_to_le16(index); hdr->len = cpu_to_le16(sizeof(*ev)); ev = skb_put(skb, sizeof(*ev)); ev->status = status; ev->opcode = cpu_to_le16(cmd); mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), MGMT_EV_CMD_STATUS, sizeof(*ev), ev); if (mskb) skb->tstamp = mskb->tstamp; else __net_timestamp(skb); err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); if (mskb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, HCI_SOCK_TRUSTED, NULL); kfree_skb(mskb); } return err; } int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, void *rp, size_t rp_len) { struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_complete *ev; int err; BT_DBG("sock %p", sk); skb = alloc_skb(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = skb_put(skb, sizeof(*hdr)); hdr->opcode = cpu_to_le16(MGMT_EV_CMD_COMPLETE); hdr->index = cpu_to_le16(index); hdr->len = cpu_to_le16(sizeof(*ev) + rp_len); ev = skb_put(skb, sizeof(*ev) + rp_len); ev->opcode = cpu_to_le16(cmd); ev->status = status; if (rp) memcpy(ev->data, rp, rp_len); mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), MGMT_EV_CMD_COMPLETE, sizeof(*ev) + rp_len, ev); if (mskb) skb->tstamp = mskb->tstamp; else __net_timestamp(skb); err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); if (mskb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, HCI_SOCK_TRUSTED, NULL); kfree_skb(mskb); } return err; } struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode, struct hci_dev *hdev) { struct mgmt_pending_cmd *cmd; list_for_each_entry(cmd, &hdev->mgmt_pending, list) { if (hci_sock_get_channel(cmd->sk) != channel) continue; if (cmd->opcode == opcode) return cmd; } return NULL; } struct mgmt_pending_cmd *mgmt_pending_find_data(unsigned short channel, u16 opcode, struct hci_dev *hdev, const void *data) { struct mgmt_pending_cmd *cmd; list_for_each_entry(cmd, &hdev->mgmt_pending, list) { if (cmd->user_data != data) continue; if (cmd->opcode == opcode) return cmd; } return NULL; } void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, void (*cb)(struct mgmt_pending_cmd *cmd, void *data), void *data) { struct mgmt_pending_cmd *cmd, *tmp; list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) { if (opcode > 0 && cmd->opcode != opcode) continue; cb(cmd, data); } } struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_pending_cmd *cmd; cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); if (!cmd) return NULL; cmd->opcode = opcode; cmd->index = hdev->id; cmd->param = kmemdup(data, len, GFP_KERNEL); if (!cmd->param) { kfree(cmd); return NULL; } cmd->param_len = len; cmd->sk = sk; sock_hold(sk); return cmd; } struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_pending_cmd *cmd; cmd = mgmt_pending_new(sk, opcode, hdev, data, len); if (!cmd) return NULL; list_add_tail(&cmd->list, &hdev->mgmt_pending); return cmd; } void mgmt_pending_free(struct mgmt_pending_cmd *cmd) { sock_put(cmd->sk); kfree(cmd->param); kfree(cmd); } void mgmt_pending_remove(struct mgmt_pending_cmd *cmd) { list_del(&cmd->list); mgmt_pending_free(cmd); } void mgmt_mesh_foreach(struct hci_dev *hdev, void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data), void *data, struct sock *sk) { struct mgmt_mesh_tx *mesh_tx, *tmp; list_for_each_entry_safe(mesh_tx, tmp, &hdev->mgmt_pending, list) { if (!sk || mesh_tx->sk == sk) cb(mesh_tx, data); } } struct mgmt_mesh_tx *mgmt_mesh_next(struct hci_dev *hdev, struct sock *sk) { struct mgmt_mesh_tx *mesh_tx; if (list_empty(&hdev->mesh_pending)) return NULL; list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) { if (!sk || mesh_tx->sk == sk) return mesh_tx; } return NULL; } struct mgmt_mesh_tx *mgmt_mesh_find(struct hci_dev *hdev, u8 handle) { struct mgmt_mesh_tx *mesh_tx; if (list_empty(&hdev->mesh_pending)) return NULL; list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) { if (mesh_tx->handle == handle) return mesh_tx; } return NULL; } struct mgmt_mesh_tx *mgmt_mesh_add(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mesh_tx *mesh_tx; mesh_tx = kzalloc(sizeof(*mesh_tx), GFP_KERNEL); if (!mesh_tx) return NULL; hdev->mesh_send_ref++; if (!hdev->mesh_send_ref) hdev->mesh_send_ref++; mesh_tx->handle = hdev->mesh_send_ref; mesh_tx->index = hdev->id; memcpy(mesh_tx->param, data, len); mesh_tx->param_len = len; mesh_tx->sk = sk; sock_hold(sk); list_add_tail(&mesh_tx->list, &hdev->mesh_pending); return mesh_tx; } void mgmt_mesh_remove(struct mgmt_mesh_tx *mesh_tx) { list_del(&mesh_tx->list); sock_put(mesh_tx->sk); kfree(mesh_tx); }
556 5 309 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 /* SPDX-License-Identifier: GPL-2.0 */ /* Freezer declarations */ #ifndef FREEZER_H_INCLUDED #define FREEZER_H_INCLUDED #include <linux/debug_locks.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/atomic.h> #include <linux/jump_label.h> #ifdef CONFIG_FREEZER DECLARE_STATIC_KEY_FALSE(freezer_active); extern bool pm_freezing; /* PM freezing in effect */ extern bool pm_nosig_freezing; /* PM nosig freezing in effect */ /* * Timeout for stopping processes */ extern unsigned int freeze_timeout_msecs; /* * Check if a process has been frozen */ extern bool frozen(struct task_struct *p); extern bool freezing_slow_path(struct task_struct *p); /* * Check if there is a request to freeze a process */ static inline bool freezing(struct task_struct *p) { if (static_branch_unlikely(&freezer_active)) return freezing_slow_path(p); return false; } /* Takes and releases task alloc lock using task_lock() */ extern void __thaw_task(struct task_struct *t); extern bool __refrigerator(bool check_kthr_stop); extern int freeze_processes(void); extern int freeze_kernel_threads(void); extern void thaw_processes(void); extern void thaw_kernel_threads(void); static inline bool try_to_freeze(void) { might_sleep(); if (likely(!freezing(current))) return false; if (!(current->flags & PF_NOFREEZE)) debug_check_no_locks_held(); return __refrigerator(false); } extern bool freeze_task(struct task_struct *p); extern bool set_freezable(void); #ifdef CONFIG_CGROUP_FREEZER extern bool cgroup_freezing(struct task_struct *task); #else /* !CONFIG_CGROUP_FREEZER */ static inline bool cgroup_freezing(struct task_struct *task) { return false; } #endif /* !CONFIG_CGROUP_FREEZER */ #else /* !CONFIG_FREEZER */ static inline bool frozen(struct task_struct *p) { return false; } static inline bool freezing(struct task_struct *p) { return false; } static inline void __thaw_task(struct task_struct *t) {} static inline bool __refrigerator(bool check_kthr_stop) { return false; } static inline int freeze_processes(void) { return -ENOSYS; } static inline int freeze_kernel_threads(void) { return -ENOSYS; } static inline void thaw_processes(void) {} static inline void thaw_kernel_threads(void) {} static inline bool try_to_freeze(void) { return false; } static inline void set_freezable(void) {} #endif /* !CONFIG_FREEZER */ #endif /* FREEZER_H_INCLUDED */
53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 /* * Copyright (c) 2001 The Regents of the University of Michigan. * All rights reserved. * * Kendrick Smith <kmsmith@umich.edu> * Andy Adamson <kandros@umich.edu> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include <linux/file.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/addr.h> #include <linux/jhash.h> #include <linux/string_helpers.h> #include <linux/fsnotify.h> #include <linux/rhashtable.h> #include <linux/nfs_ssc.h> #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" #include "current_stateid.h" #include "netns.h" #include "pnfs.h" #include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PROC #define all_ones {{ ~0, ~0}, ~0} static const stateid_t one_stateid = { .si_generation = ~0, .si_opaque = all_ones, }; static const stateid_t zero_stateid = { /* all fields zero */ }; static const stateid_t currentstateid = { .si_generation = 1, }; static const stateid_t close_stateid = { .si_generation = 0xffffffffU, }; static u64 current_sessionid = 1; #define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) #define CURRENT_STATEID(stateid) (!memcmp((stateid), &currentstateid, sizeof(stateid_t))) #define CLOSE_STATEID(stateid) (!memcmp((stateid), &close_stateid, sizeof(stateid_t))) /* forward declarations */ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner); static void nfs4_free_ol_stateid(struct nfs4_stid *stid); void nfsd4_end_grace(struct nfsd_net *nn); static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps); static void nfsd4_file_hash_remove(struct nfs4_file *fi); static void deleg_reaper(struct nfsd_net *nn); /* Locking: */ /* * Currently used for the del_recall_lru and file hash table. In an * effort to decrease the scope of the client_mutex, this spinlock may * eventually cover more: */ static DEFINE_SPINLOCK(state_lock); enum nfsd4_st_mutex_lock_subclass { OPEN_STATEID_MUTEX = 0, LOCK_STATEID_MUTEX = 1, }; /* * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for * the refcount on the open stateid to drop. */ static DECLARE_WAIT_QUEUE_HEAD(close_wq); /* * A waitqueue where a writer to clients/#/ctl destroying a client can * wait for cl_rpc_users to drop to 0 and then for the client to be * unhashed. */ static DECLARE_WAIT_QUEUE_HEAD(expiry_wq); static struct kmem_cache *client_slab; static struct kmem_cache *openowner_slab; static struct kmem_cache *lockowner_slab; static struct kmem_cache *file_slab; static struct kmem_cache *stateid_slab; static struct kmem_cache *deleg_slab; static struct kmem_cache *odstate_slab; static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops; static struct workqueue_struct *laundry_wq; int nfsd4_create_laundry_wq(void) { int rc = 0; laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); if (laundry_wq == NULL) rc = -ENOMEM; return rc; } void nfsd4_destroy_laundry_wq(void) { destroy_workqueue(laundry_wq); } static bool is_session_dead(struct nfsd4_session *ses) { return ses->se_dead; } static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me) { if (atomic_read(&ses->se_ref) > ref_held_by_me) return nfserr_jukebox; ses->se_dead = true; return nfs_ok; } static bool is_client_expired(struct nfs4_client *clp) { return clp->cl_time == 0; } static void nfsd4_dec_courtesy_client_count(struct nfsd_net *nn, struct nfs4_client *clp) { if (clp->cl_state != NFSD4_ACTIVE) atomic_add_unless(&nn->nfsd_courtesy_clients, -1, 0); } static __be32 get_client_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); if (is_client_expired(clp)) return nfserr_expired; atomic_inc(&clp->cl_rpc_users); nfsd4_dec_courtesy_client_count(nn, clp); clp->cl_state = NFSD4_ACTIVE; return nfs_ok; } /* must be called under the client_lock */ static inline void renew_client_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (is_client_expired(clp)) { WARN_ON(1); printk("%s: client (clientid %08x/%08x) already expired\n", __func__, clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); return; } list_move_tail(&clp->cl_lru, &nn->client_lru); clp->cl_time = ktime_get_boottime_seconds(); nfsd4_dec_courtesy_client_count(nn, clp); clp->cl_state = NFSD4_ACTIVE; } static void put_client_renew_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); if (!atomic_dec_and_test(&clp->cl_rpc_users)) return; if (!is_client_expired(clp)) renew_client_locked(clp); else wake_up_all(&expiry_wq); } static void put_client_renew(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (!atomic_dec_and_lock(&clp->cl_rpc_users, &nn->client_lock)) return; if (!is_client_expired(clp)) renew_client_locked(clp); else wake_up_all(&expiry_wq); spin_unlock(&nn->client_lock); } static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses) { __be32 status; if (is_session_dead(ses)) return nfserr_badsession; status = get_client_locked(ses->se_client); if (status) return status; atomic_inc(&ses->se_ref); return nfs_ok; } static void nfsd4_put_session_locked(struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses)) free_session(ses); put_client_renew_locked(clp); } static void nfsd4_put_session(struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); spin_lock(&nn->client_lock); nfsd4_put_session_locked(ses); spin_unlock(&nn->client_lock); } static struct nfsd4_blocked_lock * find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, struct nfsd_net *nn) { struct nfsd4_blocked_lock *cur, *found = NULL; spin_lock(&nn->blocked_locks_lock); list_for_each_entry(cur, &lo->lo_blocked, nbl_list) { if (fh_match(fh, &cur->nbl_fh)) { list_del_init(&cur->nbl_list); WARN_ON(list_empty(&cur->nbl_lru)); list_del_init(&cur->nbl_lru); found = cur; break; } } spin_unlock(&nn->blocked_locks_lock); if (found) locks_delete_block(&found->nbl_lock); return found; } static struct nfsd4_blocked_lock * find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, struct nfsd_net *nn) { struct nfsd4_blocked_lock *nbl; nbl = find_blocked_lock(lo, fh, nn); if (!nbl) { nbl = kmalloc(sizeof(*nbl), GFP_KERNEL); if (nbl) { INIT_LIST_HEAD(&nbl->nbl_list); INIT_LIST_HEAD(&nbl->nbl_lru); fh_copy_shallow(&nbl->nbl_fh, fh); locks_init_lock(&nbl->nbl_lock); kref_init(&nbl->nbl_kref); nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client, &nfsd4_cb_notify_lock_ops, NFSPROC4_CLNT_CB_NOTIFY_LOCK); } } return nbl; } static void free_nbl(struct kref *kref) { struct nfsd4_blocked_lock *nbl; nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref); locks_release_private(&nbl->nbl_lock); kfree(nbl); } static void free_blocked_lock(struct nfsd4_blocked_lock *nbl) { locks_delete_block(&nbl->nbl_lock); kref_put(&nbl->nbl_kref, free_nbl); } static void remove_blocked_locks(struct nfs4_lockowner *lo) { struct nfs4_client *clp = lo->lo_owner.so_client; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct nfsd4_blocked_lock *nbl; LIST_HEAD(reaplist); /* Dequeue all blocked locks */ spin_lock(&nn->blocked_locks_lock); while (!list_empty(&lo->lo_blocked)) { nbl = list_first_entry(&lo->lo_blocked, struct nfsd4_blocked_lock, nbl_list); list_del_init(&nbl->nbl_list); WARN_ON(list_empty(&nbl->nbl_lru)); list_move(&nbl->nbl_lru, &reaplist); } spin_unlock(&nn->blocked_locks_lock); /* Now free them */ while (!list_empty(&reaplist)) { nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); free_blocked_lock(nbl); } } static void nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb) { struct nfsd4_blocked_lock *nbl = container_of(cb, struct nfsd4_blocked_lock, nbl_cb); locks_delete_block(&nbl->nbl_lock); } static int nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) { trace_nfsd_cb_notify_lock_done(&zero_stateid, task); /* * Since this is just an optimization, we don't try very hard if it * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and * just quit trying on anything else. */ switch (task->tk_status) { case -NFS4ERR_DELAY: rpc_delay(task, 1 * HZ); return 0; default: return 1; } } static void nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb) { struct nfsd4_blocked_lock *nbl = container_of(cb, struct nfsd4_blocked_lock, nbl_cb); free_blocked_lock(nbl); } static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { .prepare = nfsd4_cb_notify_lock_prepare, .done = nfsd4_cb_notify_lock_done, .release = nfsd4_cb_notify_lock_release, .opcode = OP_CB_NOTIFY_LOCK, }; /* * We store the NONE, READ, WRITE, and BOTH bits separately in the * st_{access,deny}_bmap field of the stateid, in order to track not * only what share bits are currently in force, but also what * combinations of share bits previous opens have used. This allows us * to enforce the recommendation in * https://datatracker.ietf.org/doc/html/rfc7530#section-16.19.4 that * the server return an error if the client attempt to downgrade to a * combination of share bits not explicable by closing some of its * previous opens. * * This enforcement is arguably incomplete, since we don't keep * track of access/deny bit combinations; so, e.g., we allow: * * OPEN allow read, deny write * OPEN allow both, deny none * DOWNGRADE allow read, deny none * * which we should reject. * * But you could also argue that our current code is already overkill, * since it only exists to return NFS4ERR_INVAL on incorrect client * behavior. */ static unsigned int bmap_to_share_mode(unsigned long bmap) { int i; unsigned int access = 0; for (i = 1; i < 4; i++) { if (test_bit(i, &bmap)) access |= i; } return access; } /* set share access for a given stateid */ static inline void set_access(u32 access, struct nfs4_ol_stateid *stp) { unsigned char mask = 1 << access; WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); stp->st_access_bmap |= mask; } /* clear share access for a given stateid */ static inline void clear_access(u32 access, struct nfs4_ol_stateid *stp) { unsigned char mask = 1 << access; WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); stp->st_access_bmap &= ~mask; } /* test whether a given stateid has access */ static inline bool test_access(u32 access, struct nfs4_ol_stateid *stp) { unsigned char mask = 1 << access; return (bool)(stp->st_access_bmap & mask); } /* set share deny for a given stateid */ static inline void set_deny(u32 deny, struct nfs4_ol_stateid *stp) { unsigned char mask = 1 << deny; WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); stp->st_deny_bmap |= mask; } /* clear share deny for a given stateid */ static inline void clear_deny(u32 deny, struct nfs4_ol_stateid *stp) { unsigned char mask = 1 << deny; WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); stp->st_deny_bmap &= ~mask; } /* test whether a given stateid is denying specific access */ static inline bool test_deny(u32 deny, struct nfs4_ol_stateid *stp) { unsigned char mask = 1 << deny; return (bool)(stp->st_deny_bmap & mask); } static int nfs4_access_to_omode(u32 access) { switch (access & NFS4_SHARE_ACCESS_BOTH) { case NFS4_SHARE_ACCESS_READ: return O_RDONLY; case NFS4_SHARE_ACCESS_WRITE: return O_WRONLY; case NFS4_SHARE_ACCESS_BOTH: return O_RDWR; } WARN_ON_ONCE(1); return O_RDONLY; } static inline int access_permit_read(struct nfs4_ol_stateid *stp) { return test_access(NFS4_SHARE_ACCESS_READ, stp) || test_access(NFS4_SHARE_ACCESS_BOTH, stp) || test_access(NFS4_SHARE_ACCESS_WRITE, stp); } static inline int access_permit_write(struct nfs4_ol_stateid *stp) { return test_access(NFS4_SHARE_ACCESS_WRITE, stp) || test_access(NFS4_SHARE_ACCESS_BOTH, stp); } static inline struct nfs4_stateowner * nfs4_get_stateowner(struct nfs4_stateowner *sop) { atomic_inc(&sop->so_count); return sop; } static int same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner) { return (sop->so_owner.len == owner->len) && 0 == memcmp(sop->so_owner.data, owner->data, owner->len); } static struct nfs4_openowner * find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, struct nfs4_client *clp) { struct nfs4_stateowner *so; lockdep_assert_held(&clp->cl_lock); list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[hashval], so_strhash) { if (!so->so_is_open_owner) continue; if (same_owner_str(so, &open->op_owner)) return openowner(nfs4_get_stateowner(so)); } return NULL; } static inline u32 opaque_hashval(const void *ptr, int nbytes) { unsigned char *cptr = (unsigned char *) ptr; u32 x = 0; while (nbytes--) { x *= 37; x += *cptr++; } return x; } void put_nfs4_file(struct nfs4_file *fi) { if (refcount_dec_and_test(&fi->fi_ref)) { nfsd4_file_hash_remove(fi); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); kfree_rcu(fi, fi_rcu); } } static struct nfsd_file * find_writeable_file_locked(struct nfs4_file *f) { struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); ret = nfsd_file_get(f->fi_fds[O_WRONLY]); if (!ret) ret = nfsd_file_get(f->fi_fds[O_RDWR]); return ret; } static struct nfsd_file * find_writeable_file(struct nfs4_file *f) { struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_writeable_file_locked(f); spin_unlock(&f->fi_lock); return ret; } static struct nfsd_file * find_readable_file_locked(struct nfs4_file *f) { struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); ret = nfsd_file_get(f->fi_fds[O_RDONLY]); if (!ret) ret = nfsd_file_get(f->fi_fds[O_RDWR]); return ret; } static struct nfsd_file * find_readable_file(struct nfs4_file *f) { struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_readable_file_locked(f); spin_unlock(&f->fi_lock); return ret; } static struct nfsd_file * find_rw_file(struct nfs4_file *f) { struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = nfsd_file_get(f->fi_fds[O_RDWR]); spin_unlock(&f->fi_lock); return ret; } struct nfsd_file * find_any_file(struct nfs4_file *f) { struct nfsd_file *ret; if (!f) return NULL; spin_lock(&f->fi_lock); ret = nfsd_file_get(f->fi_fds[O_RDWR]); if (!ret) { ret = nfsd_file_get(f->fi_fds[O_WRONLY]); if (!ret) ret = nfsd_file_get(f->fi_fds[O_RDONLY]); } spin_unlock(&f->fi_lock); return ret; } static struct nfsd_file *find_any_file_locked(struct nfs4_file *f) { lockdep_assert_held(&f->fi_lock); if (f->fi_fds[O_RDWR]) return f->fi_fds[O_RDWR]; if (f->fi_fds[O_WRONLY]) return f->fi_fds[O_WRONLY]; if (f->fi_fds[O_RDONLY]) return f->fi_fds[O_RDONLY]; return NULL; } static atomic_long_t num_delegations; unsigned long max_delegations; /* * Open owner state (share locks) */ /* hash tables for lock and open owners */ #define OWNER_HASH_BITS 8 #define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) #define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) { unsigned int ret; ret = opaque_hashval(ownername->data, ownername->len); return ret & OWNER_HASH_MASK; } static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp; static const struct rhashtable_params nfs4_file_rhash_params = { .key_len = sizeof_field(struct nfs4_file, fi_inode), .key_offset = offsetof(struct nfs4_file, fi_inode), .head_offset = offsetof(struct nfs4_file, fi_rlist), /* * Start with a single page hash table to reduce resizing churn * on light workloads. */ .min_size = 256, .automatic_shrinking = true, }; /* * Check if courtesy clients have conflicting access and resolve it if possible * * access: is op_share_access if share_access is true. * Check if access mode, op_share_access, would conflict with * the current deny mode of the file 'fp'. * access: is op_share_deny if share_access is false. * Check if the deny mode, op_share_deny, would conflict with * current access of the file 'fp'. * stp: skip checking this entry. * new_stp: normal open, not open upgrade. * * Function returns: * false - access/deny mode conflict with normal client. * true - no conflict or conflict with courtesy client(s) is resolved. */ static bool nfs4_resolve_deny_conflicts_locked(struct nfs4_file *fp, bool new_stp, struct nfs4_ol_stateid *stp, u32 access, bool share_access) { struct nfs4_ol_stateid *st; bool resolvable = true; unsigned char bmap; struct nfsd_net *nn; struct nfs4_client *clp; lockdep_assert_held(&fp->fi_lock); list_for_each_entry(st, &fp->fi_stateids, st_perfile) { /* ignore lock stateid */ if (st->st_openstp) continue; if (st == stp && new_stp) continue; /* check file access against deny mode or vice versa */ bmap = share_access ? st->st_deny_bmap : st->st_access_bmap; if (!(access & bmap_to_share_mode(bmap))) continue; clp = st->st_stid.sc_client; if (try_to_expire_client(clp)) continue; resolvable = false; break; } if (resolvable) { clp = stp->st_stid.sc_client; nn = net_generic(clp->net, nfsd_net_id); mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); } return resolvable; } static void __nfs4_file_get_access(struct nfs4_file *fp, u32 access) { lockdep_assert_held(&fp->fi_lock); if (access & NFS4_SHARE_ACCESS_WRITE) atomic_inc(&fp->fi_access[O_WRONLY]); if (access & NFS4_SHARE_ACCESS_READ) atomic_inc(&fp->fi_access[O_RDONLY]); } static __be32 nfs4_file_get_access(struct nfs4_file *fp, u32 access) { lockdep_assert_held(&fp->fi_lock); /* Does this access mode make sense? */ if (access & ~NFS4_SHARE_ACCESS_BOTH) return nfserr_inval; /* Does it conflict with a deny mode already set? */ if ((access & fp->fi_share_deny) != 0) return nfserr_share_denied; __nfs4_file_get_access(fp, access); return nfs_ok; } static __be32 nfs4_file_check_deny(struct nfs4_file *fp, u32 deny) { /* Common case is that there is no deny mode. */ if (deny) { /* Does this deny mode make sense? */ if (deny & ~NFS4_SHARE_DENY_BOTH) return nfserr_inval; if ((deny & NFS4_SHARE_DENY_READ) && atomic_read(&fp->fi_access[O_RDONLY])) return nfserr_share_denied; if ((deny & NFS4_SHARE_DENY_WRITE) && atomic_read(&fp->fi_access[O_WRONLY])) return nfserr_share_denied; } return nfs_ok; } static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) { might_lock(&fp->fi_lock); if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) { struct nfsd_file *f1 = NULL; struct nfsd_file *f2 = NULL; swap(f1, fp->fi_fds[oflag]); if (atomic_read(&fp->fi_access[1 - oflag]) == 0) swap(f2, fp->fi_fds[O_RDWR]); spin_unlock(&fp->fi_lock); if (f1) nfsd_file_put(f1); if (f2) nfsd_file_put(f2); } } static void nfs4_file_put_access(struct nfs4_file *fp, u32 access) { WARN_ON_ONCE(access & ~NFS4_SHARE_ACCESS_BOTH); if (access & NFS4_SHARE_ACCESS_WRITE) __nfs4_file_put_access(fp, O_WRONLY); if (access & NFS4_SHARE_ACCESS_READ) __nfs4_file_put_access(fp, O_RDONLY); } /* * Allocate a new open/delegation state counter. This is needed for * pNFS for proper return on close semantics. * * Note that we only allocate it for pNFS-enabled exports, otherwise * all pointers to struct nfs4_clnt_odstate are always NULL. */ static struct nfs4_clnt_odstate * alloc_clnt_odstate(struct nfs4_client *clp) { struct nfs4_clnt_odstate *co; co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL); if (co) { co->co_client = clp; refcount_set(&co->co_odcount, 1); } return co; } static void hash_clnt_odstate_locked(struct nfs4_clnt_odstate *co) { struct nfs4_file *fp = co->co_file; lockdep_assert_held(&fp->fi_lock); list_add(&co->co_perfile, &fp->fi_clnt_odstate); } static inline void get_clnt_odstate(struct nfs4_clnt_odstate *co) { if (co) refcount_inc(&co->co_odcount); } static void put_clnt_odstate(struct nfs4_clnt_odstate *co) { struct nfs4_file *fp; if (!co) return; fp = co->co_file; if (refcount_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { list_del(&co->co_perfile); spin_unlock(&fp->fi_lock); nfsd4_return_all_file_layouts(co->co_client, fp); kmem_cache_free(odstate_slab, co); } } static struct nfs4_clnt_odstate * find_or_hash_clnt_odstate(struct nfs4_file *fp, struct nfs4_clnt_odstate *new) { struct nfs4_clnt_odstate *co; struct nfs4_client *cl; if (!new) return NULL; cl = new->co_client; spin_lock(&fp->fi_lock); list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) { if (co->co_client == cl) { get_clnt_odstate(co); goto out; } } co = new; co->co_file = fp; hash_clnt_odstate_locked(new); out: spin_unlock(&fp->fi_lock); return co; } struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, void (*sc_free)(struct nfs4_stid *)) { struct nfs4_stid *stid; int new_id; stid = kmem_cache_zalloc(slab, GFP_KERNEL); if (!stid) return NULL; idr_preload(GFP_KERNEL); spin_lock(&cl->cl_lock); /* Reserving 0 for start of file in nfsdfs "states" file: */ new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 1, 0, GFP_NOWAIT); spin_unlock(&cl->cl_lock); idr_preload_end(); if (new_id < 0) goto out_free; stid->sc_free = sc_free; stid->sc_client = cl; stid->sc_stateid.si_opaque.so_id = new_id; stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; /* Will be incremented before return to client: */ refcount_set(&stid->sc_count, 1); spin_lock_init(&stid->sc_lock); INIT_LIST_HEAD(&stid->sc_cp_list); /* * It shouldn't be a problem to reuse an opaque stateid value. * I don't think it is for 4.1. But with 4.0 I worry that, for * example, a stray write retransmission could be accepted by * the server when it should have been rejected. Therefore, * adopt a trick from the sctp code to attempt to maximize the * amount of time until an id is reused, by ensuring they always * "increase" (mod INT_MAX): */ return stid; out_free: kmem_cache_free(slab, stid); return NULL; } /* * Create a unique stateid_t to represent each COPY. */ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid, unsigned char cs_type) { int new_id; stid->cs_stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time; stid->cs_stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id; idr_preload(GFP_KERNEL); spin_lock(&nn->s2s_cp_lock); new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT); stid->cs_stid.si_opaque.so_id = new_id; stid->cs_stid.si_generation = 1; spin_unlock(&nn->s2s_cp_lock); idr_preload_end(); if (new_id < 0) return 0; stid->cs_type = cs_type; return 1; } int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy) { return nfs4_init_cp_state(nn, &copy->cp_stateid, NFS4_COPY_STID); } struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn, struct nfs4_stid *p_stid) { struct nfs4_cpntf_state *cps; cps = kzalloc(sizeof(struct nfs4_cpntf_state), GFP_KERNEL); if (!cps) return NULL; cps->cpntf_time = ktime_get_boottime_seconds(); refcount_set(&cps->cp_stateid.cs_count, 1); if (!nfs4_init_cp_state(nn, &cps->cp_stateid, NFS4_COPYNOTIFY_STID)) goto out_free; spin_lock(&nn->s2s_cp_lock); list_add(&cps->cp_list, &p_stid->sc_cp_list); spin_unlock(&nn->s2s_cp_lock); return cps; out_free: kfree(cps); return NULL; } void nfs4_free_copy_state(struct nfsd4_copy *copy) { struct nfsd_net *nn; if (copy->cp_stateid.cs_type != NFS4_COPY_STID) return; nn = net_generic(copy->cp_clp->net, nfsd_net_id); spin_lock(&nn->s2s_cp_lock); idr_remove(&nn->s2s_cp_stateids, copy->cp_stateid.cs_stid.si_opaque.so_id); spin_unlock(&nn->s2s_cp_lock); } static void nfs4_free_cpntf_statelist(struct net *net, struct nfs4_stid *stid) { struct nfs4_cpntf_state *cps; struct nfsd_net *nn; nn = net_generic(net, nfsd_net_id); spin_lock(&nn->s2s_cp_lock); while (!list_empty(&stid->sc_cp_list)) { cps = list_first_entry(&stid->sc_cp_list, struct nfs4_cpntf_state, cp_list); _free_cpntf_state_locked(nn, cps); } spin_unlock(&nn->s2s_cp_lock); } static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) { struct nfs4_stid *stid; stid = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_ol_stateid); if (!stid) return NULL; return openlockstateid(stid); } static void nfs4_free_deleg(struct nfs4_stid *stid) { struct nfs4_delegation *dp = delegstateid(stid); WARN_ON_ONCE(!list_empty(&stid->sc_cp_list)); WARN_ON_ONCE(!list_empty(&dp->dl_perfile)); WARN_ON_ONCE(!list_empty(&dp->dl_perclnt)); WARN_ON_ONCE(!list_empty(&dp->dl_recall_lru)); kmem_cache_free(deleg_slab, stid); atomic_long_dec(&num_delegations); } /* * When we recall a delegation, we should be careful not to hand it * out again straight away. * To ensure this we keep a pair of bloom filters ('new' and 'old') * in which the filehandles of recalled delegations are "stored". * If a filehandle appear in either filter, a delegation is blocked. * When a delegation is recalled, the filehandle is stored in the "new" * filter. * Every 30 seconds we swap the filters and clear the "new" one, * unless both are empty of course. This results in delegations for a * given filehandle being blocked for between 30 and 60 seconds. * * Each filter is 256 bits. We hash the filehandle to 32bit and use the * low 3 bytes as hash-table indices. * * 'blocked_delegations_lock', which is always taken in block_delegations(), * is used to manage concurrent access. Testing does not need the lock * except when swapping the two filters. */ static DEFINE_SPINLOCK(blocked_delegations_lock); static struct bloom_pair { int entries, old_entries; time64_t swap_time; int new; /* index into 'set' */ DECLARE_BITMAP(set[2], 256); } blocked_delegations; static int delegation_blocked(struct knfsd_fh *fh) { u32 hash; struct bloom_pair *bd = &blocked_delegations; if (bd->entries == 0) return 0; if (ktime_get_seconds() - bd->swap_time > 30) { spin_lock(&blocked_delegations_lock); if (ktime_get_seconds() - bd->swap_time > 30) { bd->entries -= bd->old_entries; bd->old_entries = bd->entries; bd->new = 1-bd->new; memset(bd->set[bd->new], 0, sizeof(bd->set[0])); bd->swap_time = ktime_get_seconds(); } spin_unlock(&blocked_delegations_lock); } hash = jhash(&fh->fh_raw, fh->fh_size, 0); if (test_bit(hash&255, bd->set[0]) && test_bit((hash>>8)&255, bd->set[0]) && test_bit((hash>>16)&255, bd->set[0])) return 1; if (test_bit(hash&255, bd->set[1]) && test_bit((hash>>8)&255, bd->set[1]) && test_bit((hash>>16)&255, bd->set[1])) return 1; return 0; } static void block_delegations(struct knfsd_fh *fh) { u32 hash; struct bloom_pair *bd = &blocked_delegations; hash = jhash(&fh->fh_raw, fh->fh_size, 0); spin_lock(&blocked_delegations_lock); __set_bit(hash&255, bd->set[bd->new]); __set_bit((hash>>8)&255, bd->set[bd->new]); __set_bit((hash>>16)&255, bd->set[bd->new]); if (bd->entries == 0) bd->swap_time = ktime_get_seconds(); bd->entries += 1; spin_unlock(&blocked_delegations_lock); } static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate, u32 dl_type) { struct nfs4_delegation *dp; struct nfs4_stid *stid; long n; dprintk("NFSD alloc_init_deleg\n"); n = atomic_long_inc_return(&num_delegations); if (n < 0 || n > max_delegations) goto out_dec; if (delegation_blocked(&fp->fi_fhandle)) goto out_dec; stid = nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg); if (stid == NULL) goto out_dec; dp = delegstateid(stid); /* * delegation seqid's are never incremented. The 4.1 special * meaning of seqid 0 isn't meaningful, really, but let's avoid * 0 anyway just for consistency and use 1: */ dp->dl_stid.sc_stateid.si_generation = 1; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); dp->dl_clnt_odstate = odstate; get_clnt_odstate(odstate); dp->dl_type = dl_type; dp->dl_retries = 1; dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client, &nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR); dp->dl_cb_fattr.ncf_file_modified = false; get_nfs4_file(fp); dp->dl_stid.sc_file = fp; return dp; out_dec: atomic_long_dec(&num_delegations); return NULL; } void nfs4_put_stid(struct nfs4_stid *s) { struct nfs4_file *fp = s->sc_file; struct nfs4_client *clp = s->sc_client; might_lock(&clp->cl_lock); if (!refcount_dec_and_lock(&s->sc_count, &clp->cl_lock)) { wake_up_all(&close_wq); return; } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); if (s->sc_status & SC_STATUS_ADMIN_REVOKED) atomic_dec(&s->sc_client->cl_admin_revoked); nfs4_free_cpntf_statelist(clp->net, s); spin_unlock(&clp->cl_lock); s->sc_free(s); if (fp) put_nfs4_file(fp); } void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid) { stateid_t *src = &stid->sc_stateid; spin_lock(&stid->sc_lock); if (unlikely(++src->si_generation == 0)) src->si_generation = 1; memcpy(dst, src, sizeof(*dst)); spin_unlock(&stid->sc_lock); } static void put_deleg_file(struct nfs4_file *fp) { struct nfsd_file *nf = NULL; spin_lock(&fp->fi_lock); if (--fp->fi_delegees == 0) swap(nf, fp->fi_deleg_file); spin_unlock(&fp->fi_lock); if (nf) nfsd_file_put(nf); } static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; struct nfsd_file *nf = fp->fi_deleg_file; WARN_ON_ONCE(!fp->fi_delegees); kernel_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); put_deleg_file(fp); } static void destroy_unhashed_deleg(struct nfs4_delegation *dp) { put_clnt_odstate(dp->dl_clnt_odstate); nfs4_unlock_deleg_lease(dp); nfs4_put_stid(&dp->dl_stid); } /** * nfs4_delegation_exists - Discover if this delegation already exists * @clp: a pointer to the nfs4_client we're granting a delegation to * @fp: a pointer to the nfs4_file we're granting a delegation on * * Return: * On success: true iff an existing delegation is found */ static bool nfs4_delegation_exists(struct nfs4_client *clp, struct nfs4_file *fp) { struct nfs4_delegation *searchdp = NULL; struct nfs4_client *searchclp = NULL; lockdep_assert_held(&state_lock); lockdep_assert_held(&fp->fi_lock); list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) { searchclp = searchdp->dl_stid.sc_client; if (clp == searchclp) { return true; } } return false; } /** * hash_delegation_locked - Add a delegation to the appropriate lists * @dp: a pointer to the nfs4_delegation we are adding. * @fp: a pointer to the nfs4_file we're granting a delegation on * * Return: * On success: NULL if the delegation was successfully hashed. * * On error: -EAGAIN if one was previously granted to this * nfs4_client for this nfs4_file. Delegation is not hashed. * */ static int hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) { struct nfs4_client *clp = dp->dl_stid.sc_client; lockdep_assert_held(&state_lock); lockdep_assert_held(&fp->fi_lock); lockdep_assert_held(&clp->cl_lock); if (nfs4_delegation_exists(clp, fp)) return -EAGAIN; refcount_inc(&dp->dl_stid.sc_count); dp->dl_stid.sc_type = SC_TYPE_DELEG; list_add(&dp->dl_perfile, &fp->fi_delegations); list_add(&dp->dl_perclnt, &clp->cl_delegations); return 0; } static bool delegation_hashed(struct nfs4_delegation *dp) { return !(list_empty(&dp->dl_perfile)); } static bool unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask) { struct nfs4_file *fp = dp->dl_stid.sc_file; lockdep_assert_held(&state_lock); if (!delegation_hashed(dp)) return false; if (statusmask == SC_STATUS_REVOKED && dp->dl_stid.sc_client->cl_minorversion == 0) statusmask = SC_STATUS_CLOSED; dp->dl_stid.sc_status |= statusmask; if (statusmask & SC_STATUS_ADMIN_REVOKED) atomic_inc(&dp->dl_stid.sc_client->cl_admin_revoked); /* Ensure that deleg break won't try to requeue it */ ++dp->dl_time; spin_lock(&fp->fi_lock); list_del_init(&dp->dl_perclnt); list_del_init(&dp->dl_recall_lru); list_del_init(&dp->dl_perfile); spin_unlock(&fp->fi_lock); return true; } static void destroy_delegation(struct nfs4_delegation *dp) { bool unhashed; spin_lock(&state_lock); unhashed = unhash_delegation_locked(dp, SC_STATUS_CLOSED); spin_unlock(&state_lock); if (unhashed) destroy_unhashed_deleg(dp); } /** * revoke_delegation - perform nfs4 delegation structure cleanup * @dp: pointer to the delegation * * This function assumes that it's called either from the administrative * interface (nfsd4_revoke_states()) that's revoking a specific delegation * stateid or it's called from a laundromat thread (nfsd4_landromat()) that * determined that this specific state has expired and needs to be revoked * (both mark state with the appropriate stid sc_status mode). It is also * assumed that a reference was taken on the @dp state. * * If this function finds that the @dp state is SC_STATUS_FREED it means * that a FREE_STATEID operation for this stateid has been processed and * we can proceed to removing it from recalled list. However, if @dp state * isn't marked SC_STATUS_FREED, it means we need place it on the cl_revoked * list and wait for the FREE_STATEID to arrive from the client. At the same * time, we need to mark it as SC_STATUS_FREEABLE to indicate to the * nfsd4_free_stateid() function that this stateid has already been added * to the cl_revoked list and that nfsd4_free_stateid() is now responsible * for removing it from the list. Inspection of where the delegation state * in the revocation process is protected by the clp->cl_lock. */ static void revoke_delegation(struct nfs4_delegation *dp) { struct nfs4_client *clp = dp->dl_stid.sc_client; WARN_ON(!list_empty(&dp->dl_recall_lru)); WARN_ON_ONCE(!(dp->dl_stid.sc_status & (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED))); trace_nfsd_stid_revoke(&dp->dl_stid); spin_lock(&clp->cl_lock); if (dp->dl_stid.sc_status & SC_STATUS_FREED) { list_del_init(&dp->dl_recall_lru); goto out; } list_add(&dp->dl_recall_lru, &clp->cl_revoked); dp->dl_stid.sc_status |= SC_STATUS_FREEABLE; out: spin_unlock(&clp->cl_lock); destroy_unhashed_deleg(dp); } /* * SETCLIENTID state */ static unsigned int clientid_hashval(u32 id) { return id & CLIENT_HASH_MASK; } static unsigned int clientstr_hashval(struct xdr_netobj name) { return opaque_hashval(name.data, 8) & CLIENT_HASH_MASK; } /* * A stateid that had a deny mode associated with it is being released * or downgraded. Recalculate the deny mode on the file. */ static void recalculate_deny_mode(struct nfs4_file *fp) { struct nfs4_ol_stateid *stp; u32 old_deny; spin_lock(&fp->fi_lock); old_deny = fp->fi_share_deny; fp->fi_share_deny = 0; list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap); if (fp->fi_share_deny == old_deny) break; } spin_unlock(&fp->fi_lock); } static void reset_union_bmap_deny(u32 deny, struct nfs4_ol_stateid *stp) { int i; bool change = false; for (i = 1; i < 4; i++) { if ((i & deny) != i) { change = true; clear_deny(i, stp); } } /* Recalculate per-file deny mode if there was a change */ if (change) recalculate_deny_mode(stp->st_stid.sc_file); } /* release all access and file references for a given stateid */ static void release_all_access(struct nfs4_ol_stateid *stp) { int i; struct nfs4_file *fp = stp->st_stid.sc_file; if (fp && stp->st_deny_bmap != 0) recalculate_deny_mode(fp); for (i = 1; i < 4; i++) { if (test_access(i, stp)) nfs4_file_put_access(stp->st_stid.sc_file, i); clear_access(i, stp); } } static inline void nfs4_free_stateowner(struct nfs4_stateowner *sop) { kfree(sop->so_owner.data); sop->so_ops->so_free(sop); } static void nfs4_put_stateowner(struct nfs4_stateowner *sop) { struct nfs4_client *clp = sop->so_client; might_lock(&clp->cl_lock); if (!atomic_dec_and_lock(&sop->so_count, &clp->cl_lock)) return; sop->so_ops->so_unhash(sop); spin_unlock(&clp->cl_lock); nfs4_free_stateowner(sop); } static bool nfs4_ol_stateid_unhashed(const struct nfs4_ol_stateid *stp) { return list_empty(&stp->st_perfile); } static bool unhash_ol_stateid(struct nfs4_ol_stateid *stp) { struct nfs4_file *fp = stp->st_stid.sc_file; lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock); if (list_empty(&stp->st_perfile)) return false; spin_lock(&fp->fi_lock); list_del_init(&stp->st_perfile); spin_unlock(&fp->fi_lock); list_del(&stp->st_perstateowner); return true; } static void nfs4_free_ol_stateid(struct nfs4_stid *stid) { struct nfs4_ol_stateid *stp = openlockstateid(stid); put_clnt_odstate(stp->st_clnt_odstate); release_all_access(stp); if (stp->st_stateowner) nfs4_put_stateowner(stp->st_stateowner); WARN_ON(!list_empty(&stid->sc_cp_list)); kmem_cache_free(stateid_slab, stid); } static void nfs4_free_lock_stateid(struct nfs4_stid *stid) { struct nfs4_ol_stateid *stp = openlockstateid(stid); struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); struct nfsd_file *nf; nf = find_any_file(stp->st_stid.sc_file); if (nf) { get_file(nf->nf_file); filp_close(nf->nf_file, (fl_owner_t)lo); nfsd_file_put(nf); } nfs4_free_ol_stateid(stid); } /* * Put the persistent reference to an already unhashed generic stateid, while * holding the cl_lock. If it's the last reference, then put it onto the * reaplist for later destruction. */ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, struct list_head *reaplist) { struct nfs4_stid *s = &stp->st_stid; struct nfs4_client *clp = s->sc_client; lockdep_assert_held(&clp->cl_lock); WARN_ON_ONCE(!list_empty(&stp->st_locks)); if (!refcount_dec_and_test(&s->sc_count)) { wake_up_all(&close_wq); return; } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); if (s->sc_status & SC_STATUS_ADMIN_REVOKED) atomic_dec(&s->sc_client->cl_admin_revoked); list_add(&stp->st_locks, reaplist); } static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) { lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); if (!unhash_ol_stateid(stp)) return false; list_del_init(&stp->st_locks); stp->st_stid.sc_status |= SC_STATUS_CLOSED; return true; } static void release_lock_stateid(struct nfs4_ol_stateid *stp) { struct nfs4_client *clp = stp->st_stid.sc_client; bool unhashed; spin_lock(&clp->cl_lock); unhashed = unhash_lock_stateid(stp); spin_unlock(&clp->cl_lock); if (unhashed) nfs4_put_stid(&stp->st_stid); } static void unhash_lockowner_locked(struct nfs4_lockowner *lo) { struct nfs4_client *clp = lo->lo_owner.so_client; lockdep_assert_held(&clp->cl_lock); list_del_init(&lo->lo_owner.so_strhash); } /* * Free a list of generic stateids that were collected earlier after being * fully unhashed. */ static void free_ol_stateid_reaplist(struct list_head *reaplist) { struct nfs4_ol_stateid *stp; struct nfs4_file *fp; might_sleep(); while (!list_empty(reaplist)) { stp = list_first_entry(reaplist, struct nfs4_ol_stateid, st_locks); list_del(&stp->st_locks); fp = stp->st_stid.sc_file; stp->st_stid.sc_free(&stp->st_stid); if (fp) put_nfs4_file(fp); } } static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, struct list_head *reaplist) { struct nfs4_ol_stateid *stp; lockdep_assert_held(&open_stp->st_stid.sc_client->cl_lock); while (!list_empty(&open_stp->st_locks)) { stp = list_entry(open_stp->st_locks.next, struct nfs4_ol_stateid, st_locks); unhash_lock_stateid(stp); put_ol_stateid_locked(stp, reaplist); } } static bool unhash_open_stateid(struct nfs4_ol_stateid *stp, struct list_head *reaplist) { lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); if (!unhash_ol_stateid(stp)) return false; release_open_stateid_locks(stp, reaplist); return true; } static void release_open_stateid(struct nfs4_ol_stateid *stp) { LIST_HEAD(reaplist); spin_lock(&stp->st_stid.sc_client->cl_lock); stp->st_stid.sc_status |= SC_STATUS_CLOSED; if (unhash_open_stateid(stp, &reaplist)) put_ol_stateid_locked(stp, &reaplist); spin_unlock(&stp->st_stid.sc_client->cl_lock); free_ol_stateid_reaplist(&reaplist); } static bool nfs4_openowner_unhashed(struct nfs4_openowner *oo) { lockdep_assert_held(&oo->oo_owner.so_client->cl_lock); return list_empty(&oo->oo_owner.so_strhash) && list_empty(&oo->oo_perclient); } static void unhash_openowner_locked(struct nfs4_openowner *oo) { struct nfs4_client *clp = oo->oo_owner.so_client; lockdep_assert_held(&clp->cl_lock); list_del_init(&oo->oo_owner.so_strhash); list_del_init(&oo->oo_perclient); } static void release_last_closed_stateid(struct nfs4_openowner *oo) { struct nfsd_net *nn = net_generic(oo->oo_owner.so_client->net, nfsd_net_id); struct nfs4_ol_stateid *s; spin_lock(&nn->client_lock); s = oo->oo_last_closed_stid; if (s) { list_del_init(&oo->oo_close_lru); oo->oo_last_closed_stid = NULL; } spin_unlock(&nn->client_lock); if (s) nfs4_put_stid(&s->st_stid); } static void release_openowner(struct nfs4_openowner *oo) { struct nfs4_ol_stateid *stp; struct nfs4_client *clp = oo->oo_owner.so_client; LIST_HEAD(reaplist); spin_lock(&clp->cl_lock); unhash_openowner_locked(oo); while (!list_empty(&oo->oo_owner.so_stateids)) { stp = list_first_entry(&oo->oo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); if (unhash_open_stateid(stp, &reaplist)) put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); release_last_closed_stateid(oo); nfs4_put_stateowner(&oo->oo_owner); } static struct nfs4_stid *find_one_sb_stid(struct nfs4_client *clp, struct super_block *sb, unsigned int sc_types) { unsigned long id, tmp; struct nfs4_stid *stid; spin_lock(&clp->cl_lock); idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) if ((stid->sc_type & sc_types) && stid->sc_status == 0 && stid->sc_file->fi_inode->i_sb == sb) { refcount_inc(&stid->sc_count); break; } spin_unlock(&clp->cl_lock); return stid; } /** * nfsd4_revoke_states - revoke all nfsv4 states associated with given filesystem * @net: used to identify instance of nfsd (there is one per net namespace) * @sb: super_block used to identify target filesystem * * All nfs4 states (open, lock, delegation, layout) held by the server instance * and associated with a file on the given filesystem will be revoked resulting * in any files being closed and so all references from nfsd to the filesystem * being released. Thus nfsd will no longer prevent the filesystem from being * unmounted. * * The clients which own the states will subsequently being notified that the * states have been "admin-revoked". */ void nfsd4_revoke_states(struct net *net, struct super_block *sb) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); unsigned int idhashval; unsigned int sc_types; sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG | SC_TYPE_LAYOUT; spin_lock(&nn->client_lock); for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { struct list_head *head = &nn->conf_id_hashtbl[idhashval]; struct nfs4_client *clp; retry: list_for_each_entry(clp, head, cl_idhash) { struct nfs4_stid *stid = find_one_sb_stid(clp, sb, sc_types); if (stid) { struct nfs4_ol_stateid *stp; struct nfs4_delegation *dp; struct nfs4_layout_stateid *ls; spin_unlock(&nn->client_lock); switch (stid->sc_type) { case SC_TYPE_OPEN: stp = openlockstateid(stid); mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); spin_lock(&clp->cl_lock); if (stid->sc_status == 0) { stid->sc_status |= SC_STATUS_ADMIN_REVOKED; atomic_inc(&clp->cl_admin_revoked); spin_unlock(&clp->cl_lock); release_all_access(stp); } else spin_unlock(&clp->cl_lock); mutex_unlock(&stp->st_mutex); break; case SC_TYPE_LOCK: stp = openlockstateid(stid); mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); spin_lock(&clp->cl_lock); if (stid->sc_status == 0) { struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); struct nfsd_file *nf; stid->sc_status |= SC_STATUS_ADMIN_REVOKED; atomic_inc(&clp->cl_admin_revoked); spin_unlock(&clp->cl_lock); nf = find_any_file(stp->st_stid.sc_file); if (nf) { get_file(nf->nf_file); filp_close(nf->nf_file, (fl_owner_t)lo); nfsd_file_put(nf); } release_all_access(stp); } else spin_unlock(&clp->cl_lock); mutex_unlock(&stp->st_mutex); break; case SC_TYPE_DELEG: refcount_inc(&stid->sc_count); dp = delegstateid(stid); spin_lock(&state_lock); if (!unhash_delegation_locked( dp, SC_STATUS_ADMIN_REVOKED)) dp = NULL; spin_unlock(&state_lock); if (dp) revoke_delegation(dp); break; case SC_TYPE_LAYOUT: ls = layoutstateid(stid); nfsd4_close_layout(ls); break; } nfs4_put_stid(stid); spin_lock(&nn->client_lock); if (clp->cl_minorversion == 0) /* Allow cleanup after a lease period. * store_release ensures cleanup will * see any newly revoked states if it * sees the time updated. */ nn->nfs40_last_revoke = ktime_get_boottime_seconds(); goto retry; } } } spin_unlock(&nn->client_lock); } static inline int hash_sessionid(struct nfs4_sessionid *sessionid) { struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid; return sid->sequence % SESSION_HASH_SIZE; } #ifdef CONFIG_SUNRPC_DEBUG static inline void dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) { u32 *ptr = (u32 *)(&sessionid->data[0]); dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]); } #else static inline void dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) { } #endif /* * Bump the seqid on cstate->replay_owner, and clear replay_owner if it * won't be used for replay. */ void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr) { struct nfs4_stateowner *so = cstate->replay_owner; if (nfserr == nfserr_replay_me) return; if (!seqid_mutating_err(ntohl(nfserr))) { nfsd4_cstate_clear_replay(cstate); return; } if (!so) return; if (so->so_is_open_owner) release_last_closed_stateid(openowner(so)); so->so_seqid++; return; } static void gen_sessionid(struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; struct nfsd4_sessionid *sid; sid = (struct nfsd4_sessionid *)ses->se_sessionid.data; sid->clientid = clp->cl_clientid; sid->sequence = current_sessionid++; sid->reserved = 0; } /* * The protocol defines ca_maxresponssize_cached to include the size of * the rpc header, but all we need to cache is the data starting after * the end of the initial SEQUENCE operation--the rest we regenerate * each time. Therefore we can advertise a ca_maxresponssize_cached * value that is the number of bytes in our cache plus a few additional * bytes. In order to stay on the safe side, and not promise more than * we can cache, those additional bytes must be the minimum possible: 24 * bytes of rpc header (xid through accept state, with AUTH_NULL * verifier), 12 for the compound header (with zero-length tag), and 44 * for the SEQUENCE op response: */ #define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) static void free_session_slots(struct nfsd4_session *ses) { int i; for (i = 0; i < ses->se_fchannel.maxreqs; i++) { free_svc_cred(&ses->se_slots[i]->sl_cred); kfree(ses->se_slots[i]); } } /* * We don't actually need to cache the rpc and session headers, so we * can allocate a little less for each slot: */ static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) { u32 size; if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ) size = 0; else size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; return size + sizeof(struct nfsd4_slot); } /* * XXX: If we run out of reserved DRC memory we could (up to a point) * re-negotiate active sessions and reduce their slot usage to make * room for new connections. For now we just fail the create session. */ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn) { u32 slotsize = slot_bytes(ca); u32 num = ca->maxreqs; unsigned long avail, total_avail; unsigned int scale_factor; spin_lock(&nfsd_drc_lock); if (nfsd_drc_max_mem > nfsd_drc_mem_used) total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; else /* We have handed out more space than we chose in * set_max_drc() to allow. That isn't really a * problem as long as that doesn't make us think we * have lots more due to integer overflow. */ total_avail = 0; avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); /* * Never use more than a fraction of the remaining memory, * unless it's the only way to give this client a slot. * The chosen fraction is either 1/8 or 1/number of threads, * whichever is smaller. This ensures there are adequate * slots to support multiple clients per thread. * Give the client one slot even if that would require * over-allocation--it is better than failure. */ scale_factor = max_t(unsigned int, 8, nn->nfsd_serv->sv_nrthreads); avail = clamp_t(unsigned long, avail, slotsize, total_avail/scale_factor); num = min_t(int, num, avail / slotsize); num = max_t(int, num, 1); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); return num; } static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca) { int slotsize = slot_bytes(ca); spin_lock(&nfsd_drc_lock); nfsd_drc_mem_used -= slotsize * ca->maxreqs; spin_unlock(&nfsd_drc_lock); } static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, struct nfsd4_channel_attrs *battrs) { int numslots = fattrs->maxreqs; int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; int i; BUILD_BUG_ON(struct_size(new, se_slots, NFSD_MAX_SLOTS_PER_SESSION) > PAGE_SIZE); new = kzalloc(struct_size(new, se_slots, numslots), GFP_KERNEL); if (!new) return NULL; /* allocate each struct nfsd4_slot and data cache in one piece */ for (i = 0; i < numslots; i++) { new->se_slots[i] = kzalloc(slotsize, GFP_KERNEL); if (!new->se_slots[i]) goto out_free; } memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); new->se_cb_slot_avail = ~0U; new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_SIZE - 1); spin_lock_init(&new->se_lock); return new; out_free: while (i--) kfree(new->se_slots[i]); kfree(new); return NULL; } static void free_conn(struct nfsd4_conn *c) { svc_xprt_put(c->cn_xprt); kfree(c); } static void nfsd4_conn_lost(struct svc_xpt_user *u) { struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user); struct nfs4_client *clp = c->cn_session->se_client; trace_nfsd_cb_lost(clp); spin_lock(&clp->cl_lock); if (!list_empty(&c->cn_persession)) { list_del(&c->cn_persession); free_conn(c); } nfsd4_probe_callback(clp); spin_unlock(&clp->cl_lock); } static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) { struct nfsd4_conn *conn; conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL); if (!conn) return NULL; svc_xprt_get(rqstp->rq_xprt); conn->cn_xprt = rqstp->rq_xprt; conn->cn_flags = flags; INIT_LIST_HEAD(&conn->cn_xpt_user.list); return conn; } static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) { conn->cn_session = ses; list_add(&conn->cn_persession, &ses->se_conns); } static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; spin_lock(&clp->cl_lock); __nfsd4_hash_conn(conn, ses); spin_unlock(&clp->cl_lock); } static int nfsd4_register_conn(struct nfsd4_conn *conn) { conn->cn_xpt_user.callback = nfsd4_conn_lost; return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); } static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn, struct nfsd4_session *ses) { int ret; nfsd4_hash_conn(conn, ses); ret = nfsd4_register_conn(conn); if (ret) /* oops; xprt is already down: */ nfsd4_conn_lost(&conn->cn_xpt_user); /* We may have gained or lost a callback channel: */ nfsd4_probe_callback_sync(ses->se_client); } static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses) { u32 dir = NFS4_CDFC4_FORE; if (cses->flags & SESSION4_BACK_CHAN) dir |= NFS4_CDFC4_BACK; return alloc_conn(rqstp, dir); } /* must be called under client_lock */ static void nfsd4_del_conns(struct nfsd4_session *s) { struct nfs4_client *clp = s->se_client; struct nfsd4_conn *c; spin_lock(&clp->cl_lock); while (!list_empty(&s->se_conns)) { c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession); list_del_init(&c->cn_persession); spin_unlock(&clp->cl_lock); unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user); free_conn(c); spin_lock(&clp->cl_lock); } spin_unlock(&clp->cl_lock); } static void __free_session(struct nfsd4_session *ses) { free_session_slots(ses); kfree(ses); } static void free_session(struct nfsd4_session *ses) { nfsd4_del_conns(ses); nfsd4_put_drc_mem(&ses->se_fchannel); __free_session(ses); } static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) { int idx; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); new->se_client = clp; gen_sessionid(new); INIT_LIST_HEAD(&new->se_conns); atomic_set(&new->se_ref, 0); new->se_dead = false; new->se_cb_prog = cses->callback_prog; new->se_cb_sec = cses->cb_sec; for (idx = 0; idx < NFSD_BC_SLOT_TABLE_SIZE; ++idx) new->se_cb_seq_nr[idx] = 1; idx = hash_sessionid(&new->se_sessionid); list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]); spin_lock(&clp->cl_lock); list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); { struct sockaddr *sa = svc_addr(rqstp); /* * This is a little silly; with sessions there's no real * use for the callback address. Use the peer address * as a reasonable default for now, but consider fixing * the rpc client not to require an address in the * future: */ rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); } } /* caller must hold client_lock */ static struct nfsd4_session * __find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net) { struct nfsd4_session *elem; int idx; struct nfsd_net *nn = net_generic(net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); dump_sessionid(__func__, sessionid); idx = hash_sessionid(sessionid); /* Search in the appropriate list */ list_for_each_entry(elem, &nn->sessionid_hashtbl[idx], se_hash) { if (!memcmp(elem->se_sessionid.data, sessionid->data, NFS4_MAX_SESSIONID_LEN)) { return elem; } } dprintk("%s: session not found\n", __func__); return NULL; } static struct nfsd4_session * find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net, __be32 *ret) { struct nfsd4_session *session; __be32 status = nfserr_badsession; session = __find_in_sessionid_hashtbl(sessionid, net); if (!session) goto out; status = nfsd4_get_session_locked(session); if (status) session = NULL; out: *ret = status; return session; } /* caller must hold client_lock */ static void unhash_session(struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); list_del(&ses->se_hash); spin_lock(&ses->se_client->cl_lock); list_del(&ses->se_perclnt); spin_unlock(&ses->se_client->cl_lock); } /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ static int STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) { /* * We're assuming the clid was not given out from a boot * precisely 2^32 (about 136 years) before this one. That seems * a safe assumption: */ if (clid->cl_boot == (u32)nn->boot_time) return 0; trace_nfsd_clid_stale(clid); return 1; } static struct nfs4_client *alloc_client(struct xdr_netobj name, struct nfsd_net *nn) { struct nfs4_client *clp; int i; if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients && atomic_read(&nn->nfsd_courtesy_clients) > 0) mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); clp = kmem_cache_zalloc(client_slab, GFP_KERNEL); if (clp == NULL) return NULL; xdr_netobj_dup(&clp->cl_name, &name, GFP_KERNEL); if (clp->cl_name.data == NULL) goto err_no_name; clp->cl_ownerstr_hashtbl = kmalloc_array(OWNER_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!clp->cl_ownerstr_hashtbl) goto err_no_hashtbl; clp->cl_callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0); if (!clp->cl_callback_wq) goto err_no_callback_wq; for (i = 0; i < OWNER_HASH_SIZE; i++) INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]); INIT_LIST_HEAD(&clp->cl_sessions); idr_init(&clp->cl_stateids); atomic_set(&clp->cl_rpc_users, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; clp->cl_state = NFSD4_ACTIVE; atomic_inc(&nn->nfs4_client_count); atomic_set(&clp->cl_delegs_in_recall, 0); INIT_LIST_HEAD(&clp->cl_idhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); INIT_LIST_HEAD(&clp->cl_revoked); #ifdef CONFIG_NFSD_PNFS INIT_LIST_HEAD(&clp->cl_lo_states); #endif INIT_LIST_HEAD(&clp->async_copies); spin_lock_init(&clp->async_lock); spin_lock_init(&clp->cl_lock); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; err_no_callback_wq: kfree(clp->cl_ownerstr_hashtbl); err_no_hashtbl: kfree(clp->cl_name.data); err_no_name: kmem_cache_free(client_slab, clp); return NULL; } static void __free_client(struct kref *k) { struct nfsdfs_client *c = container_of(k, struct nfsdfs_client, cl_ref); struct nfs4_client *clp = container_of(c, struct nfs4_client, cl_nfsdfs); free_svc_cred(&clp->cl_cred); destroy_workqueue(clp->cl_callback_wq); kfree(clp->cl_ownerstr_hashtbl); kfree(clp->cl_name.data); kfree(clp->cl_nii_domain.data); kfree(clp->cl_nii_name.data); idr_destroy(&clp->cl_stateids); kfree(clp->cl_ra); kmem_cache_free(client_slab, clp); } static void drop_client(struct nfs4_client *clp) { kref_put(&clp->cl_nfsdfs.cl_ref, __free_client); } static void free_client(struct nfs4_client *clp) { while (!list_empty(&clp->cl_sessions)) { struct nfsd4_session *ses; ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, se_perclnt); list_del(&ses->se_perclnt); WARN_ON_ONCE(atomic_read(&ses->se_ref)); free_session(ses); } rpc_destroy_wait_queue(&clp->cl_cb_waitq); if (clp->cl_nfsd_dentry) { nfsd_client_rmdir(clp->cl_nfsd_dentry); clp->cl_nfsd_dentry = NULL; wake_up_all(&expiry_wq); } drop_client(clp); } /* must be called under the client_lock */ static void unhash_client_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct nfsd4_session *ses; lockdep_assert_held(&nn->client_lock); /* Mark the client as expired! */ clp->cl_time = 0; /* Make it invisible */ if (!list_empty(&clp->cl_idhash)) { list_del_init(&clp->cl_idhash); if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) rb_erase(&clp->cl_namenode, &nn->conf_name_tree); else rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); } list_del_init(&clp->cl_lru); spin_lock(&clp->cl_lock); list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) list_del_init(&ses->se_hash); spin_unlock(&clp->cl_lock); } static void unhash_client(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); spin_lock(&nn->client_lock); unhash_client_locked(clp); spin_unlock(&nn->client_lock); } static __be32 mark_client_expired_locked(struct nfs4_client *clp) { int users = atomic_read(&clp->cl_rpc_users); trace_nfsd_mark_client_expired(clp, users); if (users) return nfserr_jukebox; unhash_client_locked(clp); return nfs_ok; } static void __destroy_client(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); int i; struct nfs4_openowner *oo; struct nfs4_delegation *dp; LIST_HEAD(reaplist); spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); destroy_unhashed_deleg(dp); } while (!list_empty(&clp->cl_revoked)) { dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); nfs4_put_stid(&dp->dl_stid); } while (!list_empty(&clp->cl_openowners)) { oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); nfs4_get_stateowner(&oo->oo_owner); release_openowner(oo); } for (i = 0; i < OWNER_HASH_SIZE; i++) { struct nfs4_stateowner *so, *tmp; list_for_each_entry_safe(so, tmp, &clp->cl_ownerstr_hashtbl[i], so_strhash) { /* Should be no openowners at this point */ WARN_ON_ONCE(so->so_is_open_owner); remove_blocked_locks(lockowner(so)); } } nfsd4_return_all_client_layouts(clp); nfsd4_shutdown_copy(clp); nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); atomic_add_unless(&nn->nfs4_client_count, -1, 0); nfsd4_dec_courtesy_client_count(nn, clp); free_client(clp); wake_up_all(&expiry_wq); } static void destroy_client(struct nfs4_client *clp) { unhash_client(clp); __destroy_client(clp); } static void inc_reclaim_complete(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (!nn->track_reclaim_completes) return; if (!nfsd4_find_reclaim_client(clp->cl_name, nn)) return; if (atomic_inc_return(&nn->nr_reclaim_complete) == nn->reclaim_str_hashtbl_size) { printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n", clp->net->ns.inum); nfsd4_end_grace(nn); } } static void expire_client(struct nfs4_client *clp) { unhash_client(clp); nfsd4_client_record_remove(clp); __destroy_client(clp); } static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) { memcpy(target->cl_verifier.data, source->data, sizeof(target->cl_verifier.data)); } static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) { target->cl_clientid.cl_boot = source->cl_clientid.cl_boot; target->cl_clientid.cl_id = source->cl_clientid.cl_id; } static int copy_cred(struct svc_cred *target, struct svc_cred *source) { target->cr_principal = kstrdup(source->cr_principal, GFP_KERNEL); target->cr_raw_principal = kstrdup(source->cr_raw_principal, GFP_KERNEL); target->cr_targ_princ = kstrdup(source->cr_targ_princ, GFP_KERNEL); if ((source->cr_principal && !target->cr_principal) || (source->cr_raw_principal && !target->cr_raw_principal) || (source->cr_targ_princ && !target->cr_targ_princ)) return -ENOMEM; target->cr_flavor = source->cr_flavor; target->cr_uid = source->cr_uid; target->cr_gid = source->cr_gid; target->cr_group_info = source->cr_group_info; get_group_info(target->cr_group_info); target->cr_gss_mech = source->cr_gss_mech; if (source->cr_gss_mech) gss_mech_get(source->cr_gss_mech); return 0; } static int compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2) { if (o1->len < o2->len) return -1; if (o1->len > o2->len) return 1; return memcmp(o1->data, o2->data, o1->len); } static int same_verf(nfs4_verifier *v1, nfs4_verifier *v2) { return 0 == memcmp(v1->data, v2->data, sizeof(v1->data)); } static int same_clid(clientid_t *cl1, clientid_t *cl2) { return (cl1->cl_boot == cl2->cl_boot) && (cl1->cl_id == cl2->cl_id); } static bool groups_equal(struct group_info *g1, struct group_info *g2) { int i; if (g1->ngroups != g2->ngroups) return false; for (i=0; i<g1->ngroups; i++) if (!gid_eq(g1->gid[i], g2->gid[i])) return false; return true; } /* * RFC 3530 language requires clid_inuse be returned when the * "principal" associated with a requests differs from that previously * used. We use uid, gid's, and gss principal string as our best * approximation. We also don't want to allow non-gss use of a client * established using gss: in theory cr_principal should catch that * change, but in practice cr_principal can be null even in the gss case * since gssd doesn't always pass down a principal string. */ static bool is_gss_cred(struct svc_cred *cr) { /* Is cr_flavor one of the gss "pseudoflavors"?: */ return (cr->cr_flavor > RPC_AUTH_MAXFLAVOR); } static bool same_creds(struct svc_cred *cr1, struct svc_cred *cr2) { if ((is_gss_cred(cr1) != is_gss_cred(cr2)) || (!uid_eq(cr1->cr_uid, cr2->cr_uid)) || (!gid_eq(cr1->cr_gid, cr2->cr_gid)) || !groups_equal(cr1->cr_group_info, cr2->cr_group_info)) return false; /* XXX: check that cr_targ_princ fields match ? */ if (cr1->cr_principal == cr2->cr_principal) return true; if (!cr1->cr_principal || !cr2->cr_principal) return false; return 0 == strcmp(cr1->cr_principal, cr2->cr_principal); } static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp) { struct svc_cred *cr = &rqstp->rq_cred; u32 service; if (!cr->cr_gss_mech) return false; service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor); return service == RPC_GSS_SVC_INTEGRITY || service == RPC_GSS_SVC_PRIVACY; } bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp) { struct svc_cred *cr = &rqstp->rq_cred; if (!cl->cl_mach_cred) return true; if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech) return false; if (!svc_rqst_integrity_protected(rqstp)) return false; if (cl->cl_cred.cr_raw_principal) return 0 == strcmp(cl->cl_cred.cr_raw_principal, cr->cr_raw_principal); if (!cr->cr_principal) return false; return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal); } static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn) { __be32 verf[2]; /* * This is opaque to client, so no need to byte-swap. Use * __force to keep sparse happy */ verf[0] = (__force __be32)(u32)ktime_get_real_seconds(); verf[1] = (__force __be32)nn->clverifier_counter++; memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data)); } static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) { clp->cl_clientid.cl_boot = (u32)nn->boot_time; clp->cl_clientid.cl_id = nn->clientid_counter++; gen_confirm(clp, nn); } static struct nfs4_stid * find_stateid_locked(struct nfs4_client *cl, stateid_t *t) { struct nfs4_stid *ret; ret = idr_find(&cl->cl_stateids, t->si_opaque.so_id); if (!ret || !ret->sc_type) return NULL; return ret; } static struct nfs4_stid * find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, unsigned short typemask, unsigned short ok_states) { struct nfs4_stid *s; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, t); if (s != NULL) { if ((s->sc_status & ~ok_states) == 0 && (typemask & s->sc_type)) refcount_inc(&s->sc_count); else s = NULL; } spin_unlock(&cl->cl_lock); return s; } static struct nfs4_client *get_nfsdfs_clp(struct inode *inode) { struct nfsdfs_client *nc; nc = get_nfsdfs_client(inode); if (!nc) return NULL; return container_of(nc, struct nfs4_client, cl_nfsdfs); } static void seq_quote_mem(struct seq_file *m, char *data, int len) { seq_puts(m, "\""); seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\"); seq_puts(m, "\""); } static const char *cb_state2str(int state) { switch (state) { case NFSD4_CB_UP: return "UP"; case NFSD4_CB_UNKNOWN: return "UNKNOWN"; case NFSD4_CB_DOWN: return "DOWN"; case NFSD4_CB_FAULT: return "FAULT"; } return "UNDEFINED"; } static int client_info_show(struct seq_file *m, void *v) { struct inode *inode = file_inode(m->file); struct nfs4_client *clp; u64 clid; clp = get_nfsdfs_clp(inode); if (!clp) return -ENXIO; memcpy(&clid, &clp->cl_clientid, sizeof(clid)); seq_printf(m, "clientid: 0x%llx\n", clid); seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr); if (clp->cl_state == NFSD4_COURTESY) seq_puts(m, "status: courtesy\n"); else if (clp->cl_state == NFSD4_EXPIRABLE) seq_puts(m, "status: expirable\n"); else if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) seq_puts(m, "status: confirmed\n"); else seq_puts(m, "status: unconfirmed\n"); seq_printf(m, "seconds from last renew: %lld\n", ktime_get_boottime_seconds() - clp->cl_time); seq_puts(m, "name: "); seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); if (clp->cl_nii_domain.data) { seq_puts(m, "Implementation domain: "); seq_quote_mem(m, clp->cl_nii_domain.data, clp->cl_nii_domain.len); seq_puts(m, "\nImplementation name: "); seq_quote_mem(m, clp->cl_nii_name.data, clp->cl_nii_name.len); seq_printf(m, "\nImplementation time: [%lld, %ld]\n", clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec); } seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state)); seq_printf(m, "callback address: \"%pISpc\"\n", &clp->cl_cb_conn.cb_addr); seq_printf(m, "admin-revoked states: %d\n", atomic_read(&clp->cl_admin_revoked)); drop_client(clp); return 0; } DEFINE_SHOW_ATTRIBUTE(client_info); static void *states_start(struct seq_file *s, loff_t *pos) __acquires(&clp->cl_lock) { struct nfs4_client *clp = s->private; unsigned long id = *pos; void *ret; spin_lock(&clp->cl_lock); ret = idr_get_next_ul(&clp->cl_stateids, &id); *pos = id; return ret; } static void *states_next(struct seq_file *s, void *v, loff_t *pos) { struct nfs4_client *clp = s->private; unsigned long id = *pos; void *ret; id = *pos; id++; ret = idr_get_next_ul(&clp->cl_stateids, &id); *pos = id; return ret; } static void states_stop(struct seq_file *s, void *v) __releases(&clp->cl_lock) { struct nfs4_client *clp = s->private; spin_unlock(&clp->cl_lock); } static void nfs4_show_fname(struct seq_file *s, struct nfsd_file *f) { seq_printf(s, "filename: \"%pD2\"", f->nf_file); } static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) { struct inode *inode = file_inode(f->nf_file); seq_printf(s, "superblock: \"%02x:%02x:%ld\"", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); } static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo) { seq_puts(s, "owner: "); seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len); } static void nfs4_show_stateid(struct seq_file *s, stateid_t *stid) { seq_printf(s, "0x%.8x", stid->si_generation); seq_printf(s, "%12phN", &stid->si_opaque); } static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_ol_stateid *ols; struct nfs4_file *nf; struct nfsd_file *file; struct nfs4_stateowner *oo; unsigned int access, deny; ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); seq_puts(s, ": { type: open, "); access = bmap_to_share_mode(ols->st_access_bmap); deny = bmap_to_share_mode(ols->st_deny_bmap); seq_printf(s, "access: %s%s, ", access & NFS4_SHARE_ACCESS_READ ? "r" : "-", access & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); seq_printf(s, "deny: %s%s, ", deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); if (nf) { spin_lock(&nf->fi_lock); file = find_any_file_locked(nf); if (file) { nfs4_show_superblock(s, file); seq_puts(s, ", "); nfs4_show_fname(s, file); seq_puts(s, ", "); } spin_unlock(&nf->fi_lock); } else seq_puts(s, "closed, "); nfs4_show_owner(s, oo); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); return 0; } static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_ol_stateid *ols; struct nfs4_file *nf; struct nfsd_file *file; struct nfs4_stateowner *oo; ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); seq_puts(s, ": { type: lock, "); spin_lock(&nf->fi_lock); file = find_any_file_locked(nf); if (file) { /* * Note: a lock stateid isn't really the same thing as a lock, * it's the locking state held by one owner on a file, and there * may be multiple (or no) lock ranges associated with it. * (Same for the matter is true of open stateids.) */ nfs4_show_superblock(s, file); /* XXX: open stateid? */ seq_puts(s, ", "); nfs4_show_fname(s, file); seq_puts(s, ", "); } nfs4_show_owner(s, oo); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); spin_unlock(&nf->fi_lock); return 0; } static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_delegation *ds; struct nfs4_file *nf; struct nfsd_file *file; ds = delegstateid(st); nf = st->sc_file; seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); seq_puts(s, ": { type: deleg, "); seq_printf(s, "access: %s", ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w"); /* XXX: lease time, whether it's being recalled. */ spin_lock(&nf->fi_lock); file = nf->fi_deleg_file; if (file) { seq_puts(s, ", "); nfs4_show_superblock(s, file); seq_puts(s, ", "); nfs4_show_fname(s, file); } spin_unlock(&nf->fi_lock); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); return 0; } static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_layout_stateid *ls; struct nfsd_file *file; ls = container_of(st, struct nfs4_layout_stateid, ls_stid); seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); seq_puts(s, ": { type: layout"); /* XXX: What else would be useful? */ spin_lock(&ls->ls_stid.sc_file->fi_lock); file = ls->ls_file; if (file) { seq_puts(s, ", "); nfs4_show_superblock(s, file); seq_puts(s, ", "); nfs4_show_fname(s, file); } spin_unlock(&ls->ls_stid.sc_file->fi_lock); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); return 0; } static int states_show(struct seq_file *s, void *v) { struct nfs4_stid *st = v; switch (st->sc_type) { case SC_TYPE_OPEN: return nfs4_show_open(s, st); case SC_TYPE_LOCK: return nfs4_show_lock(s, st); case SC_TYPE_DELEG: return nfs4_show_deleg(s, st); case SC_TYPE_LAYOUT: return nfs4_show_layout(s, st); default: return 0; /* XXX: or SEQ_SKIP? */ } /* XXX: copy stateids? */ } static struct seq_operations states_seq_ops = { .start = states_start, .next = states_next, .stop = states_stop, .show = states_show }; static int client_states_open(struct inode *inode, struct file *file) { struct seq_file *s; struct nfs4_client *clp; int ret; clp = get_nfsdfs_clp(inode); if (!clp) return -ENXIO; ret = seq_open(file, &states_seq_ops); if (ret) return ret; s = file->private_data; s->private = clp; return 0; } static int client_opens_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; struct nfs4_client *clp = m->private; /* XXX: alternatively, we could get/drop in seq start/stop */ drop_client(clp); return seq_release(inode, file); } static const struct file_operations client_states_fops = { .open = client_states_open, .read = seq_read, .llseek = seq_lseek, .release = client_opens_release, }; /* * Normally we refuse to destroy clients that are in use, but here the * administrator is telling us to just do it. We also want to wait * so the caller has a guarantee that the client's locks are gone by * the time the write returns: */ static void force_expire_client(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); bool already_expired; trace_nfsd_clid_admin_expired(&clp->cl_clientid); spin_lock(&nn->client_lock); clp->cl_time = 0; spin_unlock(&nn->client_lock); wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0); spin_lock(&nn->client_lock); already_expired = list_empty(&clp->cl_lru); if (!already_expired) unhash_client_locked(clp); spin_unlock(&nn->client_lock); if (!already_expired) expire_client(clp); else wait_event(expiry_wq, clp->cl_nfsd_dentry == NULL); } static ssize_t client_ctl_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) { char *data; struct nfs4_client *clp; data = simple_transaction_get(file, buf, size); if (IS_ERR(data)) return PTR_ERR(data); if (size != 7 || 0 != memcmp(data, "expire\n", 7)) return -EINVAL; clp = get_nfsdfs_clp(file_inode(file)); if (!clp) return -ENXIO; force_expire_client(clp); drop_client(clp); return 7; } static const struct file_operations client_ctl_fops = { .write = client_ctl_write, .release = simple_transaction_release, }; static const struct tree_descr client_files[] = { [0] = {"info", &client_info_fops, S_IRUSR}, [1] = {"states", &client_states_fops, S_IRUSR}, [2] = {"ctl", &client_ctl_fops, S_IWUSR}, [3] = {""}, }; static int nfsd4_cb_recall_any_done(struct nfsd4_callback *cb, struct rpc_task *task) { trace_nfsd_cb_recall_any_done(cb, task); switch (task->tk_status) { case -NFS4ERR_DELAY: rpc_delay(task, 2 * HZ); return 0; default: return 1; } } static void nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); drop_client(clp); } static int nfsd4_cb_getattr_done(struct nfsd4_callback *cb, struct rpc_task *task) { struct nfs4_cb_fattr *ncf = container_of(cb, struct nfs4_cb_fattr, ncf_getattr); struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); trace_nfsd_cb_getattr_done(&dp->dl_stid.sc_stateid, task); ncf->ncf_cb_status = task->tk_status; switch (task->tk_status) { case -NFS4ERR_DELAY: rpc_delay(task, 2 * HZ); return 0; default: return 1; } } static void nfsd4_cb_getattr_release(struct nfsd4_callback *cb) { struct nfs4_cb_fattr *ncf = container_of(cb, struct nfs4_cb_fattr, ncf_getattr); struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); clear_and_wake_up_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags); nfs4_put_stid(&dp->dl_stid); } static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { .done = nfsd4_cb_recall_any_done, .release = nfsd4_cb_recall_any_release, .opcode = OP_CB_RECALL_ANY, }; static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = { .done = nfsd4_cb_getattr_done, .release = nfsd4_cb_getattr_release, .opcode = OP_CB_GETATTR, }; static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf) { struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags)) return; /* set to proper status when nfsd4_cb_getattr_done runs */ ncf->ncf_cb_status = NFS4ERR_IO; refcount_inc(&dp->dl_stid.sc_count); nfsd4_run_cb(&ncf->ncf_getattr); } static struct nfs4_client *create_client(struct xdr_netobj name, struct svc_rqst *rqstp, nfs4_verifier *verf) { struct nfs4_client *clp; struct sockaddr *sa = svc_addr(rqstp); int ret; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct dentry *dentries[ARRAY_SIZE(client_files)]; clp = alloc_client(name, nn); if (clp == NULL) return NULL; ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); if (ret) { free_client(clp); return NULL; } gen_clid(clp, nn); kref_init(&clp->cl_nfsdfs.cl_ref); nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL); clp->cl_time = ktime_get_boottime_seconds(); copy_verf(clp, verf); memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage)); clp->cl_cb_session = NULL; clp->net = net; clp->cl_nfsd_dentry = nfsd_client_mkdir( nn, &clp->cl_nfsdfs, clp->cl_clientid.cl_id - nn->clientid_base, client_files, dentries); clp->cl_nfsd_info_dentry = dentries[0]; if (!clp->cl_nfsd_dentry) { free_client(clp); return NULL; } clp->cl_ra = kzalloc(sizeof(*clp->cl_ra), GFP_KERNEL); if (!clp->cl_ra) { free_client(clp); return NULL; } clp->cl_ra_time = 0; nfsd4_init_cb(&clp->cl_ra->ra_cb, clp, &nfsd4_cb_recall_any_ops, NFSPROC4_CLNT_CB_RECALL_ANY); return clp; } static void add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root) { struct rb_node **new = &(root->rb_node), *parent = NULL; struct nfs4_client *clp; while (*new) { clp = rb_entry(*new, struct nfs4_client, cl_namenode); parent = *new; if (compare_blob(&clp->cl_name, &new_clp->cl_name) > 0) new = &((*new)->rb_left); else new = &((*new)->rb_right); } rb_link_node(&new_clp->cl_namenode, parent, new); rb_insert_color(&new_clp->cl_namenode, root); } static struct nfs4_client * find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root) { int cmp; struct rb_node *node = root->rb_node; struct nfs4_client *clp; while (node) { clp = rb_entry(node, struct nfs4_client, cl_namenode); cmp = compare_blob(&clp->cl_name, name); if (cmp > 0) node = node->rb_left; else if (cmp < 0) node = node->rb_right; else return clp; } return NULL; } static void add_to_unconfirmed(struct nfs4_client *clp) { unsigned int idhashval; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); add_clp_to_name_tree(clp, &nn->unconf_name_tree); idhashval = clientid_hashval(clp->cl_clientid.cl_id); list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]); renew_client_locked(clp); } static void move_to_confirmed(struct nfs4_client *clp) { unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); add_clp_to_name_tree(clp, &nn->conf_name_tree); set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); trace_nfsd_clid_confirmed(&clp->cl_clientid); renew_client_locked(clp); } static struct nfs4_client * find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions) { struct nfs4_client *clp; unsigned int idhashval = clientid_hashval(clid->cl_id); list_for_each_entry(clp, &tbl[idhashval], cl_idhash) { if (same_clid(&clp->cl_clientid, clid)) { if ((bool)clp->cl_minorversion != sessions) return NULL; renew_client_locked(clp); return clp; } } return NULL; } static struct nfs4_client * find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) { struct list_head *tbl = nn->conf_id_hashtbl; lockdep_assert_held(&nn->client_lock); return find_client_in_id_table(tbl, clid, sessions); } static struct nfs4_client * find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) { struct list_head *tbl = nn->unconf_id_hashtbl; lockdep_assert_held(&nn->client_lock); return find_client_in_id_table(tbl, clid, sessions); } static bool clp_used_exchangeid(struct nfs4_client *clp) { return clp->cl_exchange_flags != 0; } static struct nfs4_client * find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) { lockdep_assert_held(&nn->client_lock); return find_clp_in_name_tree(name, &nn->conf_name_tree); } static struct nfs4_client * find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) { lockdep_assert_held(&nn->client_lock); return find_clp_in_name_tree(name, &nn->unconf_name_tree); } static void gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) { struct nfs4_cb_conn *conn = &clp->cl_cb_conn; struct sockaddr *sa = svc_addr(rqstp); u32 scopeid = rpc_get_scope_id(sa); unsigned short expected_family; /* Currently, we only support tcp and tcp6 for the callback channel */ if (se->se_callback_netid_len == 3 && !memcmp(se->se_callback_netid_val, "tcp", 3)) expected_family = AF_INET; else if (se->se_callback_netid_len == 4 && !memcmp(se->se_callback_netid_val, "tcp6", 4)) expected_family = AF_INET6; else goto out_err; conn->cb_addrlen = rpc_uaddr2sockaddr(clp->net, se->se_callback_addr_val, se->se_callback_addr_len, (struct sockaddr *)&conn->cb_addr, sizeof(conn->cb_addr)); if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family) goto out_err; if (conn->cb_addr.ss_family == AF_INET6) ((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid; conn->cb_prog = se->se_callback_prog; conn->cb_ident = se->se_callback_ident; memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen); trace_nfsd_cb_args(clp, conn); return; out_err: conn->cb_addr.ss_family = AF_UNSPEC; conn->cb_addrlen = 0; trace_nfsd_cb_nodelegs(clp); return; } /* * Cache a reply. nfsd4_check_resp_size() has bounded the cache size. */ static void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) { struct xdr_buf *buf = resp->xdr->buf; struct nfsd4_slot *slot = resp->cstate.slot; unsigned int base; dprintk("--> %s slot %p\n", __func__, slot); slot->sl_flags |= NFSD4_SLOT_INITIALIZED; slot->sl_opcnt = resp->opcnt; slot->sl_status = resp->cstate.status; free_svc_cred(&slot->sl_cred); copy_cred(&slot->sl_cred, &resp->rqstp->rq_cred); if (!nfsd4_cache_this(resp)) { slot->sl_flags &= ~NFSD4_SLOT_CACHED; return; } slot->sl_flags |= NFSD4_SLOT_CACHED; base = resp->cstate.data_offset; slot->sl_datalen = buf->len - base; if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen)) WARN(1, "%s: sessions DRC could not cache compound\n", __func__); return; } /* * Encode the replay sequence operation from the slot values. * If cachethis is FALSE encode the uncached rep error on the next * operation which sets resp->p and increments resp->opcnt for * nfs4svc_encode_compoundres. * */ static __be32 nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, struct nfsd4_compoundres *resp) { struct nfsd4_op *op; struct nfsd4_slot *slot = resp->cstate.slot; /* Encode the replayed sequence operation */ op = &args->ops[resp->opcnt - 1]; nfsd4_encode_operation(resp, op); if (slot->sl_flags & NFSD4_SLOT_CACHED) return op->status; if (args->opcnt == 1) { /* * The original operation wasn't a solo sequence--we * always cache those--so this retry must not match the * original: */ op->status = nfserr_seq_false_retry; } else { op = &args->ops[resp->opcnt++]; op->status = nfserr_retry_uncached_rep; nfsd4_encode_operation(resp, op); } return op->status; } /* * The sequence operation is not cached because we can use the slot and * session values. */ static __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq) { struct nfsd4_slot *slot = resp->cstate.slot; struct xdr_stream *xdr = resp->xdr; __be32 *p; __be32 status; dprintk("--> %s slot %p\n", __func__, slot); status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); if (status) return status; p = xdr_reserve_space(xdr, slot->sl_datalen); if (!p) { WARN_ON_ONCE(1); return nfserr_serverfault; } xdr_encode_opaque_fixed(p, slot->sl_data, slot->sl_datalen); xdr_commit_encode(xdr); resp->opcnt = slot->sl_opcnt; return slot->sl_status; } /* * Set the exchange_id flags returned by the server. */ static void nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) { #ifdef CONFIG_NFSD_PNFS new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS; #else new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; #endif /* Referrals are supported, Migration is not. */ new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; /* set the wire flags to return to client. */ clid->flags = new->cl_exchange_flags; } static bool client_has_openowners(struct nfs4_client *clp) { struct nfs4_openowner *oo; list_for_each_entry(oo, &clp->cl_openowners, oo_perclient) { if (!list_empty(&oo->oo_owner.so_stateids)) return true; } return false; } static bool client_has_state(struct nfs4_client *clp) { return client_has_openowners(clp) #ifdef CONFIG_NFSD_PNFS || !list_empty(&clp->cl_lo_states) #endif || !list_empty(&clp->cl_delegations) || !list_empty(&clp->cl_sessions) || nfsd4_has_active_async_copies(clp); } static __be32 copy_impl_id(struct nfs4_client *clp, struct nfsd4_exchange_id *exid) { if (!exid->nii_domain.data) return 0; xdr_netobj_dup(&clp->cl_nii_domain, &exid->nii_domain, GFP_KERNEL); if (!clp->cl_nii_domain.data) return nfserr_jukebox; xdr_netobj_dup(&clp->cl_nii_name, &exid->nii_name, GFP_KERNEL); if (!clp->cl_nii_name.data) return nfserr_jukebox; clp->cl_nii_time = exid->nii_time; return 0; } __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_exchange_id *exid = &u->exchange_id; struct nfs4_client *conf, *new; struct nfs4_client *unconf = NULL; __be32 status; char addr_str[INET6_ADDRSTRLEN]; nfs4_verifier verf = exid->verifier; struct sockaddr *sa = svc_addr(rqstp); bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); rpc_ntop(sa, addr_str, sizeof(addr_str)); dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " "ip_addr=%s flags %x, spa_how %u\n", __func__, rqstp, exid, exid->clname.len, exid->clname.data, addr_str, exid->flags, exid->spa_how); exid->server_impl_name = kasprintf(GFP_KERNEL, "%s %s %s %s", utsname()->sysname, utsname()->release, utsname()->version, utsname()->machine); if (!exid->server_impl_name) return nfserr_jukebox; if (exid->flags & ~EXCHGID4_FLAG_MASK_A) return nfserr_inval; new = create_client(exid->clname, rqstp, &verf); if (new == NULL) return nfserr_jukebox; status = copy_impl_id(new, exid); if (status) goto out_nolock; switch (exid->spa_how) { case SP4_MACH_CRED: exid->spo_must_enforce[0] = 0; exid->spo_must_enforce[1] = ( 1 << (OP_BIND_CONN_TO_SESSION - 32) | 1 << (OP_EXCHANGE_ID - 32) | 1 << (OP_CREATE_SESSION - 32) | 1 << (OP_DESTROY_SESSION - 32) | 1 << (OP_DESTROY_CLIENTID - 32)); exid->spo_must_allow[0] &= (1 << (OP_CLOSE) | 1 << (OP_OPEN_DOWNGRADE) | 1 << (OP_LOCKU) | 1 << (OP_DELEGRETURN)); exid->spo_must_allow[1] &= ( 1 << (OP_TEST_STATEID - 32) | 1 << (OP_FREE_STATEID - 32)); if (!svc_rqst_integrity_protected(rqstp)) { status = nfserr_inval; goto out_nolock; } /* * Sometimes userspace doesn't give us a principal. * Which is a bug, really. Anyway, we can't enforce * MACH_CRED in that case, better to give up now: */ if (!new->cl_cred.cr_principal && !new->cl_cred.cr_raw_principal) { status = nfserr_serverfault; goto out_nolock; } new->cl_mach_cred = true; break; case SP4_NONE: break; default: /* checked by xdr code */ WARN_ON_ONCE(1); fallthrough; case SP4_SSV: status = nfserr_encr_alg_unsupp; goto out_nolock; } /* Cases below refer to rfc 5661 section 18.35.4: */ spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&exid->clname, nn); if (conf) { bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred); bool verfs_match = same_verf(&verf, &conf->cl_verifier); if (update) { if (!clp_used_exchangeid(conf)) { /* buggy client */ status = nfserr_inval; goto out; } if (!nfsd4_mach_creds_match(conf, rqstp)) { status = nfserr_wrong_cred; goto out; } if (!creds_match) { /* case 9 */ status = nfserr_perm; goto out; } if (!verfs_match) { /* case 8 */ status = nfserr_not_same; goto out; } /* case 6 */ exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; trace_nfsd_clid_confirmed_r(conf); goto out_copy; } if (!creds_match) { /* case 3 */ if (client_has_state(conf)) { status = nfserr_clid_inuse; trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; } goto out_new; } if (verfs_match) { /* case 2 */ conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; trace_nfsd_clid_confirmed_r(conf); goto out_copy; } /* case 5, client reboot */ trace_nfsd_clid_verf_mismatch(conf, rqstp, &verf); conf = NULL; goto out_new; } if (update) { /* case 7 */ status = nfserr_noent; goto out; } unconf = find_unconfirmed_client_by_name(&exid->clname, nn); if (unconf) /* case 4, possible retry or client restart */ unhash_client_locked(unconf); /* case 1, new owner ID */ trace_nfsd_clid_fresh(new); out_new: if (conf) { status = mark_client_expired_locked(conf); if (status) goto out; trace_nfsd_clid_replaced(&conf->cl_clientid); } new->cl_minorversion = cstate->minorversion; new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0]; new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1]; /* Contrived initial CREATE_SESSION response */ new->cl_cs_slot.sl_status = nfserr_seq_misordered; add_to_unconfirmed(new); swap(new, conf); out_copy: exid->clientid.cl_boot = conf->cl_clientid.cl_boot; exid->clientid.cl_id = conf->cl_clientid.cl_id; exid->seqid = conf->cl_cs_slot.sl_seqid + 1; nfsd4_set_ex_flags(conf, exid); exid->nii_domain.len = sizeof("kernel.org") - 1; exid->nii_domain.data = "kernel.org"; /* * Note that RFC 8881 places no length limit on * nii_name, but this implementation permits no * more than NFS4_OPAQUE_LIMIT bytes. */ exid->nii_name.len = strlen(exid->server_impl_name); if (exid->nii_name.len > NFS4_OPAQUE_LIMIT) exid->nii_name.len = NFS4_OPAQUE_LIMIT; exid->nii_name.data = exid->server_impl_name; /* just send zeros - the date is in nii_name */ exid->nii_time.tv_sec = 0; exid->nii_time.tv_nsec = 0; dprintk("nfsd4_exchange_id seqid %d flags %x\n", conf->cl_cs_slot.sl_seqid, conf->cl_exchange_flags); status = nfs_ok; out: spin_unlock(&nn->client_lock); out_nolock: if (new) expire_client(new); if (unconf) { trace_nfsd_clid_expire_unconf(&unconf->cl_clientid); expire_client(unconf); } return status; } void nfsd4_exchange_id_release(union nfsd4_op_u *u) { struct nfsd4_exchange_id *exid = &u->exchange_id; kfree(exid->server_impl_name); } static __be32 check_slot_seqid(u32 seqid, u32 slot_seqid, bool slot_inuse) { /* The slot is in use, and no response has been sent. */ if (slot_inuse) { if (seqid == slot_seqid) return nfserr_jukebox; else return nfserr_seq_misordered; } /* Note unsigned 32-bit arithmetic handles wraparound: */ if (likely(seqid == slot_seqid + 1)) return nfs_ok; if (seqid == slot_seqid) return nfserr_replay_cache; return nfserr_seq_misordered; } /* * Cache the create session result into the create session single DRC * slot cache by saving the xdr structure. sl_seqid has been set. * Do this for solo or embedded create session operations. */ static void nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses, struct nfsd4_clid_slot *slot, __be32 nfserr) { slot->sl_status = nfserr; memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses)); } static __be32 nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, struct nfsd4_clid_slot *slot) { memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses)); return slot->sl_status; } #define NFSD_MIN_REQ_HDR_SEQ_SZ ((\ 2 * 2 + /* credential,verifier: AUTH_NULL, length 0 */ \ 1 + /* MIN tag is length with zero, only length */ \ 3 + /* version, opcount, opcode */ \ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ /* seqid, slotID, slotID, cache */ \ 4 ) * sizeof(__be32)) #define NFSD_MIN_RESP_HDR_SEQ_SZ ((\ 2 + /* verifier: AUTH_NULL, length 0 */\ 1 + /* status */ \ 1 + /* MIN tag is length with zero, only length */ \ 3 + /* opcount, opcode, opstatus*/ \ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ /* seqid, slotID, slotID, slotID, status */ \ 5 ) * sizeof(__be32)) static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn) { u32 maxrpc = nn->nfsd_serv->sv_max_mesg; if (ca->maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ) return nfserr_toosmall; if (ca->maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ) return nfserr_toosmall; ca->headerpadsz = 0; ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc); ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc); ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND); ca->maxresp_cached = min_t(u32, ca->maxresp_cached, NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ); ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION); /* * Note decreasing slot size below client's request may make it * difficult for client to function correctly, whereas * decreasing the number of slots will (just?) affect * performance. When short on memory we therefore prefer to * decrease number of slots instead of their size. Clients that * request larger slots than they need will get poor results: * Note that we always allow at least one slot, because our * accounting is soft and provides no guarantees either way. */ ca->maxreqs = nfsd4_get_drc_mem(ca, nn); return nfs_ok; } /* * Server's NFSv4.1 backchannel support is AUTH_SYS-only for now. * These are based on similar macros in linux/sunrpc/msg_prot.h . */ #define RPC_MAX_HEADER_WITH_AUTH_SYS \ (RPC_CALLHDRSIZE + 2 * (2 + UNX_CALLSLACK)) #define RPC_MAX_REPHEADER_WITH_AUTH_SYS \ (RPC_REPHDRSIZE + (2 + NUL_REPLYSLACK)) #define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \ RPC_MAX_HEADER_WITH_AUTH_SYS) * sizeof(__be32)) #define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \ RPC_MAX_REPHEADER_WITH_AUTH_SYS) * \ sizeof(__be32)) static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) { ca->headerpadsz = 0; if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ) return nfserr_toosmall; if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ) return nfserr_toosmall; ca->maxresp_cached = 0; if (ca->maxops < 2) return nfserr_toosmall; return nfs_ok; } static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs) { switch (cbs->flavor) { case RPC_AUTH_NULL: case RPC_AUTH_UNIX: return nfs_ok; default: /* * GSS case: the spec doesn't allow us to return this * error. But it also doesn't allow us not to support * GSS. * I'd rather this fail hard than return some error the * client might think it can already handle: */ return nfserr_encr_alg_unsupp; } } __be32 nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_create_session *cr_ses = &u->create_session; struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; struct nfsd4_clid_slot *cs_slot; struct nfs4_client *old = NULL; struct nfsd4_session *new; struct nfsd4_conn *conn; __be32 status = 0; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) return nfserr_inval; status = nfsd4_check_cb_sec(&cr_ses->cb_sec); if (status) return status; status = check_forechannel_attrs(&cr_ses->fore_channel, nn); if (status) return status; status = check_backchannel_attrs(&cr_ses->back_channel); if (status) goto out_release_drc_mem; status = nfserr_jukebox; new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel); if (!new) goto out_release_drc_mem; conn = alloc_conn_from_crses(rqstp, cr_ses); if (!conn) goto out_free_session; spin_lock(&nn->client_lock); /* RFC 8881 Section 18.36.4 Phase 1: Client record look-up. */ unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); conf = find_confirmed_client(&cr_ses->clientid, true, nn); if (!conf && !unconf) { status = nfserr_stale_clientid; goto out_free_conn; } /* RFC 8881 Section 18.36.4 Phase 2: Sequence ID processing. */ if (conf) { cs_slot = &conf->cl_cs_slot; trace_nfsd_slot_seqid_conf(conf, cr_ses); } else { cs_slot = &unconf->cl_cs_slot; trace_nfsd_slot_seqid_unconf(unconf, cr_ses); } status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); switch (status) { case nfs_ok: cs_slot->sl_seqid++; cr_ses->seqid = cs_slot->sl_seqid; break; case nfserr_replay_cache: status = nfsd4_replay_create_session(cr_ses, cs_slot); fallthrough; case nfserr_jukebox: /* The server MUST NOT cache NFS4ERR_DELAY */ goto out_free_conn; default: goto out_cache_error; } /* RFC 8881 Section 18.36.4 Phase 3: Client ID confirmation. */ if (conf) { status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(conf, rqstp)) goto out_cache_error; } else { status = nfserr_clid_inuse; if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { trace_nfsd_clid_cred_mismatch(unconf, rqstp); goto out_cache_error; } status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(unconf, rqstp)) goto out_cache_error; old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = mark_client_expired_locked(old); if (status) goto out_expired_error; trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); conf = unconf; } /* RFC 8881 Section 18.36.4 Phase 4: Session creation. */ status = nfs_ok; /* Persistent sessions are not supported */ cr_ses->flags &= ~SESSION4_PERSIST; /* Upshifting from TCP to RDMA is not supported */ cr_ses->flags &= ~SESSION4_RDMA; /* Report the correct number of backchannel slots */ cr_ses->back_channel.maxreqs = new->se_cb_highest_slot + 1; init_session(rqstp, new, conf, cr_ses); nfsd4_get_session_locked(new); memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); /* cache solo and embedded create sessions under the client_lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); spin_unlock(&nn->client_lock); if (conf == unconf) fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY); /* init connection and backchannel */ nfsd4_init_conn(rqstp, conn, new); nfsd4_put_session(new); if (old) expire_client(old); return status; out_expired_error: /* * Revert the slot seq_nr change so the server will process * the client's resend instead of returning a cached response. */ if (status == nfserr_jukebox) { cs_slot->sl_seqid--; cr_ses->seqid = cs_slot->sl_seqid; goto out_free_conn; } out_cache_error: nfsd4_cache_create_session(cr_ses, cs_slot, status); out_free_conn: spin_unlock(&nn->client_lock); free_conn(conn); out_free_session: __free_session(new); out_release_drc_mem: nfsd4_put_drc_mem(&cr_ses->fore_channel); return status; } static __be32 nfsd4_map_bcts_dir(u32 *dir) { switch (*dir) { case NFS4_CDFC4_FORE: case NFS4_CDFC4_BACK: return nfs_ok; case NFS4_CDFC4_FORE_OR_BOTH: case NFS4_CDFC4_BACK_OR_BOTH: *dir = NFS4_CDFC4_BOTH; return nfs_ok; } return nfserr_inval; } __be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_backchannel_ctl *bc = &u->backchannel_ctl; struct nfsd4_session *session = cstate->session; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); __be32 status; status = nfsd4_check_cb_sec(&bc->bc_cb_sec); if (status) return status; spin_lock(&nn->client_lock); session->se_cb_prog = bc->bc_cb_program; session->se_cb_sec = bc->bc_cb_sec; spin_unlock(&nn->client_lock); nfsd4_probe_callback(session->se_client); return nfs_ok; } static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s) { struct nfsd4_conn *c; list_for_each_entry(c, &s->se_conns, cn_persession) { if (c->cn_xprt == xpt) { return c; } } return NULL; } static __be32 nfsd4_match_existing_connection(struct svc_rqst *rqst, struct nfsd4_session *session, u32 req, struct nfsd4_conn **conn) { struct nfs4_client *clp = session->se_client; struct svc_xprt *xpt = rqst->rq_xprt; struct nfsd4_conn *c; __be32 status; /* Following the last paragraph of RFC 5661 Section 18.34.3: */ spin_lock(&clp->cl_lock); c = __nfsd4_find_conn(xpt, session); if (!c) status = nfserr_noent; else if (req == c->cn_flags) status = nfs_ok; else if (req == NFS4_CDFC4_FORE_OR_BOTH && c->cn_flags != NFS4_CDFC4_BACK) status = nfs_ok; else if (req == NFS4_CDFC4_BACK_OR_BOTH && c->cn_flags != NFS4_CDFC4_FORE) status = nfs_ok; else status = nfserr_inval; spin_unlock(&clp->cl_lock); if (status == nfs_ok && conn) *conn = c; return status; } __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; __be32 status; struct nfsd4_conn *conn; struct nfsd4_session *session; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!nfsd4_last_compound_op(rqstp)) return nfserr_not_only_op; spin_lock(&nn->client_lock); session = find_in_sessionid_hashtbl(&bcts->sessionid, net, &status); spin_unlock(&nn->client_lock); if (!session) goto out_no_session; status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(session->se_client, rqstp)) goto out; status = nfsd4_match_existing_connection(rqstp, session, bcts->dir, &conn); if (status == nfs_ok) { if (bcts->dir == NFS4_CDFC4_FORE_OR_BOTH || bcts->dir == NFS4_CDFC4_BACK) conn->cn_flags |= NFS4_CDFC4_BACK; nfsd4_probe_callback(session->se_client); goto out; } if (status == nfserr_inval) goto out; status = nfsd4_map_bcts_dir(&bcts->dir); if (status) goto out; conn = alloc_conn(rqstp, bcts->dir); status = nfserr_jukebox; if (!conn) goto out; nfsd4_init_conn(rqstp, conn, session); status = nfs_ok; out: nfsd4_put_session(session); out_no_session: return status; } static bool nfsd4_compound_in_session(struct nfsd4_compound_state *cstate, struct nfs4_sessionid *sid) { if (!cstate->session) return false; return !memcmp(sid, &cstate->session->se_sessionid, sizeof(*sid)); } __be32 nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfs4_sessionid *sessionid = &u->destroy_session.sessionid; struct nfsd4_session *ses; __be32 status; int ref_held_by_me = 0; struct net *net = SVC_NET(r); struct nfsd_net *nn = net_generic(net, nfsd_net_id); status = nfserr_not_only_op; if (nfsd4_compound_in_session(cstate, sessionid)) { if (!nfsd4_last_compound_op(r)) goto out; ref_held_by_me++; } dump_sessionid(__func__, sessionid); spin_lock(&nn->client_lock); ses = find_in_sessionid_hashtbl(sessionid, net, &status); if (!ses) goto out_client_lock; status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(ses->se_client, r)) goto out_put_session; status = mark_session_dead_locked(ses, 1 + ref_held_by_me); if (status) goto out_put_session; unhash_session(ses); spin_unlock(&nn->client_lock); nfsd4_probe_callback_sync(ses->se_client); spin_lock(&nn->client_lock); status = nfs_ok; out_put_session: nfsd4_put_session_locked(ses); out_client_lock: spin_unlock(&nn->client_lock); out: return status; } static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; struct nfsd4_conn *c; __be32 status = nfs_ok; int ret; spin_lock(&clp->cl_lock); c = __nfsd4_find_conn(new->cn_xprt, ses); if (c) goto out_free; status = nfserr_conn_not_bound_to_session; if (clp->cl_mach_cred) goto out_free; __nfsd4_hash_conn(new, ses); spin_unlock(&clp->cl_lock); ret = nfsd4_register_conn(new); if (ret) /* oops; xprt is already down: */ nfsd4_conn_lost(&new->cn_xpt_user); return nfs_ok; out_free: spin_unlock(&clp->cl_lock); free_conn(new); return status; } static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session) { struct nfsd4_compoundargs *args = rqstp->rq_argp; return args->opcnt > session->se_fchannel.maxops; } static bool nfsd4_request_too_big(struct svc_rqst *rqstp, struct nfsd4_session *session) { struct xdr_buf *xb = &rqstp->rq_arg; return xb->len > session->se_fchannel.maxreq_sz; } static bool replay_matches_cache(struct svc_rqst *rqstp, struct nfsd4_sequence *seq, struct nfsd4_slot *slot) { struct nfsd4_compoundargs *argp = rqstp->rq_argp; if ((bool)(slot->sl_flags & NFSD4_SLOT_CACHETHIS) != (bool)seq->cachethis) return false; /* * If there's an error then the reply can have fewer ops than * the call. */ if (slot->sl_opcnt < argp->opcnt && !slot->sl_status) return false; /* * But if we cached a reply with *more* ops than the call you're * sending us now, then this new call is clearly not really a * replay of the old one: */ if (slot->sl_opcnt > argp->opcnt) return false; /* This is the only check explicitly called by spec: */ if (!same_creds(&rqstp->rq_cred, &slot->sl_cred)) return false; /* * There may be more comparisons we could actually do, but the * spec doesn't require us to catch every case where the calls * don't match (that would require caching the call as well as * the reply), so we don't bother. */ return true; } __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_sequence *seq = &u->sequence; struct nfsd4_compoundres *resp = rqstp->rq_resp; struct xdr_stream *xdr = resp->xdr; struct nfsd4_session *session; struct nfs4_client *clp; struct nfsd4_slot *slot; struct nfsd4_conn *conn; __be32 status; int buflen; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (resp->opcnt != 1) return nfserr_sequence_pos; /* * Will be either used or freed by nfsd4_sequence_check_conn * below. */ conn = alloc_conn(rqstp, NFS4_CDFC4_FORE); if (!conn) return nfserr_jukebox; spin_lock(&nn->client_lock); session = find_in_sessionid_hashtbl(&seq->sessionid, net, &status); if (!session) goto out_no_session; clp = session->se_client; status = nfserr_too_many_ops; if (nfsd4_session_too_many_ops(rqstp, session)) goto out_put_session; status = nfserr_req_too_big; if (nfsd4_request_too_big(rqstp, session)) goto out_put_session; status = nfserr_badslot; if (seq->slotid >= session->se_fchannel.maxreqs) goto out_put_session; slot = session->se_slots[seq->slotid]; dprintk("%s: slotid %d\n", __func__, seq->slotid); /* We do not negotiate the number of slots yet, so set the * maxslots to the session maxreqs which is used to encode * sr_highest_slotid and the sr_target_slot id to maxslots */ seq->maxslots = session->se_fchannel.maxreqs; trace_nfsd_slot_seqid_sequence(clp, seq, slot); status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_flags & NFSD4_SLOT_INUSE); if (status == nfserr_replay_cache) { status = nfserr_seq_misordered; if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED)) goto out_put_session; status = nfserr_seq_false_retry; if (!replay_matches_cache(rqstp, seq, slot)) goto out_put_session; cstate->slot = slot; cstate->session = session; cstate->clp = clp; /* Return the cached reply status and set cstate->status * for nfsd4_proc_compound processing */ status = nfsd4_replay_cache_entry(resp, seq); cstate->status = nfserr_replay_cache; goto out; } if (status) goto out_put_session; status = nfsd4_sequence_check_conn(conn, session); conn = NULL; if (status) goto out_put_session; buflen = (seq->cachethis) ? session->se_fchannel.maxresp_cached : session->se_fchannel.maxresp_sz; status = (seq->cachethis) ? nfserr_rep_too_big_to_cache : nfserr_rep_too_big; if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack)) goto out_put_session; svc_reserve(rqstp, buflen); status = nfs_ok; /* Success! bump slot seqid */ slot->sl_seqid = seq->seqid; slot->sl_flags |= NFSD4_SLOT_INUSE; if (seq->cachethis) slot->sl_flags |= NFSD4_SLOT_CACHETHIS; else slot->sl_flags &= ~NFSD4_SLOT_CACHETHIS; cstate->slot = slot; cstate->session = session; cstate->clp = clp; out: switch (clp->cl_cb_state) { case NFSD4_CB_DOWN: seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; break; case NFSD4_CB_FAULT: seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; break; default: seq->status_flags = 0; } if (!list_empty(&clp->cl_revoked)) seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; if (atomic_read(&clp->cl_admin_revoked)) seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED; trace_nfsd_seq4_status(rqstp, seq); out_no_session: if (conn) free_conn(conn); spin_unlock(&nn->client_lock); return status; out_put_session: nfsd4_put_session_locked(session); goto out_no_session; } void nfsd4_sequence_done(struct nfsd4_compoundres *resp) { struct nfsd4_compound_state *cs = &resp->cstate; if (nfsd4_has_session(cs)) { if (cs->status != nfserr_replay_cache) { nfsd4_store_cache_entry(resp); cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE; } /* Drop session reference that was taken in nfsd4_sequence() */ nfsd4_put_session(cs->session); } else if (cs->clp) put_client_renew(cs->clp); } __be32 nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_destroy_clientid *dc = &u->destroy_clientid; struct nfs4_client *conf, *unconf; struct nfs4_client *clp = NULL; __be32 status = 0; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); spin_lock(&nn->client_lock); unconf = find_unconfirmed_client(&dc->clientid, true, nn); conf = find_confirmed_client(&dc->clientid, true, nn); WARN_ON_ONCE(conf && unconf); if (conf) { if (client_has_state(conf)) { status = nfserr_clientid_busy; goto out; } status = mark_client_expired_locked(conf); if (status) goto out; clp = conf; } else if (unconf) clp = unconf; else { status = nfserr_stale_clientid; goto out; } if (!nfsd4_mach_creds_match(clp, rqstp)) { clp = NULL; status = nfserr_wrong_cred; goto out; } trace_nfsd_clid_destroyed(&clp->cl_clientid); unhash_client_locked(clp); out: spin_unlock(&nn->client_lock); if (clp) expire_client(clp); return status; } __be32 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_reclaim_complete *rc = &u->reclaim_complete; struct nfs4_client *clp = cstate->clp; __be32 status = 0; if (rc->rca_one_fs) { if (!cstate->current_fh.fh_dentry) return nfserr_nofilehandle; /* * We don't take advantage of the rca_one_fs case. * That's OK, it's optional, we can safely ignore it. */ return nfs_ok; } status = nfserr_complete_already; if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags)) goto out; status = nfserr_stale_clientid; if (is_client_expired(clp)) /* * The following error isn't really legal. * But we only get here if the client just explicitly * destroyed the client. Surely it no longer cares what * error it gets back on an operation for the dead * client. */ goto out; status = nfs_ok; trace_nfsd_clid_reclaim_complete(&clp->cl_clientid); nfsd4_client_record_create(clp); inc_reclaim_complete(clp); out: return status; } __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_setclientid *setclid = &u->setclientid; struct xdr_netobj clname = setclid->se_name; nfs4_verifier clverifier = setclid->se_verf; struct nfs4_client *conf, *new; struct nfs4_client *unconf = NULL; __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); new = create_client(clname, rqstp, &clverifier); if (new == NULL) return nfserr_jukebox; spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&clname, nn); if (conf && client_has_state(conf)) { status = nfserr_clid_inuse; if (clp_used_exchangeid(conf)) goto out; if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; } } unconf = find_unconfirmed_client_by_name(&clname, nn); if (unconf) unhash_client_locked(unconf); if (conf) { if (same_verf(&conf->cl_verifier, &clverifier)) { copy_clid(new, conf); gen_confirm(new, nn); } else trace_nfsd_clid_verf_mismatch(conf, rqstp, &clverifier); } else trace_nfsd_clid_fresh(new); new->cl_minorversion = 0; gen_callback(new, setclid, rqstp); add_to_unconfirmed(new); setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; setclid->se_clientid.cl_id = new->cl_clientid.cl_id; memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); new = NULL; status = nfs_ok; out: spin_unlock(&nn->client_lock); if (new) free_client(new); if (unconf) { trace_nfsd_clid_expire_unconf(&unconf->cl_clientid); expire_client(unconf); } return status; } __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_setclientid_confirm *setclientid_confirm = &u->setclientid_confirm; struct nfs4_client *conf, *unconf; struct nfs4_client *old = NULL; nfs4_verifier confirm = setclientid_confirm->sc_confirm; clientid_t * clid = &setclientid_confirm->sc_clientid; __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (STALE_CLIENTID(clid, nn)) return nfserr_stale_clientid; spin_lock(&nn->client_lock); conf = find_confirmed_client(clid, false, nn); unconf = find_unconfirmed_client(clid, false, nn); /* * We try hard to give out unique clientid's, so if we get an * attempt to confirm the same clientid with a different cred, * the client may be buggy; this should never happen. * * Nevertheless, RFC 7530 recommends INUSE for this case: */ status = nfserr_clid_inuse; if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { trace_nfsd_clid_cred_mismatch(unconf, rqstp); goto out; } if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) { trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; } if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) { if (conf && same_verf(&confirm, &conf->cl_confirm)) { status = nfs_ok; } else status = nfserr_stale_clientid; goto out; } status = nfs_ok; if (conf) { old = unconf; unhash_client_locked(old); nfsd4_change_callback(conf, &unconf->cl_cb_conn); } else { old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = nfserr_clid_inuse; if (client_has_state(old) && !same_creds(&unconf->cl_cred, &old->cl_cred)) { old = NULL; goto out; } status = mark_client_expired_locked(old); if (status) { old = NULL; goto out; } trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); conf = unconf; } get_client_locked(conf); spin_unlock(&nn->client_lock); if (conf == unconf) fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY); nfsd4_probe_callback(conf); spin_lock(&nn->client_lock); put_client_renew_locked(conf); out: spin_unlock(&nn->client_lock); if (old) expire_client(old); return status; } static struct nfs4_file *nfsd4_alloc_file(void) { return kmem_cache_alloc(file_slab, GFP_KERNEL); } /* OPEN Share state helper functions */ static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp) { refcount_set(&fp->fi_ref, 1); spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); INIT_LIST_HEAD(&fp->fi_clnt_odstate); fh_copy_shallow(&fp->fi_fhandle, &fh->fh_handle); fp->fi_deleg_file = NULL; fp->fi_had_conflict = false; fp->fi_share_deny = 0; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); fp->fi_aliased = false; fp->fi_inode = d_inode(fh->fh_dentry); #ifdef CONFIG_NFSD_PNFS INIT_LIST_HEAD(&fp->fi_lo_states); atomic_set(&fp->fi_lo_recalls, 0); #endif } void nfsd4_free_slabs(void) { kmem_cache_destroy(client_slab); kmem_cache_destroy(openowner_slab); kmem_cache_destroy(lockowner_slab); kmem_cache_destroy(file_slab); kmem_cache_destroy(stateid_slab); kmem_cache_destroy(deleg_slab); kmem_cache_destroy(odstate_slab); } int nfsd4_init_slabs(void) { client_slab = KMEM_CACHE(nfs4_client, 0); if (client_slab == NULL) goto out; openowner_slab = KMEM_CACHE(nfs4_openowner, 0); if (openowner_slab == NULL) goto out_free_client_slab; lockowner_slab = KMEM_CACHE(nfs4_lockowner, 0); if (lockowner_slab == NULL) goto out_free_openowner_slab; file_slab = KMEM_CACHE(nfs4_file, 0); if (file_slab == NULL) goto out_free_lockowner_slab; stateid_slab = KMEM_CACHE(nfs4_ol_stateid, 0); if (stateid_slab == NULL) goto out_free_file_slab; deleg_slab = KMEM_CACHE(nfs4_delegation, 0); if (deleg_slab == NULL) goto out_free_stateid_slab; odstate_slab = KMEM_CACHE(nfs4_clnt_odstate, 0); if (odstate_slab == NULL) goto out_free_deleg_slab; return 0; out_free_deleg_slab: kmem_cache_destroy(deleg_slab); out_free_stateid_slab: kmem_cache_destroy(stateid_slab); out_free_file_slab: kmem_cache_destroy(file_slab); out_free_lockowner_slab: kmem_cache_destroy(lockowner_slab); out_free_openowner_slab: kmem_cache_destroy(openowner_slab); out_free_client_slab: kmem_cache_destroy(client_slab); out: return -ENOMEM; } static unsigned long nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { int count; struct nfsd_net *nn = shrink->private_data; count = atomic_read(&nn->nfsd_courtesy_clients); if (!count) count = atomic_long_read(&num_delegations); if (count) queue_work(laundry_wq, &nn->nfsd_shrinker_work); return (unsigned long)count; } static unsigned long nfsd4_state_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { return SHRINK_STOP; } void nfsd4_init_leases_net(struct nfsd_net *nn) { struct sysinfo si; u64 max_clients; nn->nfsd4_lease = 90; /* default lease time */ nn->nfsd4_grace = 90; nn->somebody_reclaimed = false; nn->track_reclaim_completes = false; nn->clverifier_counter = get_random_u32(); nn->clientid_base = get_random_u32(); nn->clientid_counter = nn->clientid_base + 1; nn->s2s_cp_cl_id = nn->clientid_counter++; atomic_set(&nn->nfs4_client_count, 0); si_meminfo(&si); max_clients = (u64)si.totalram * si.mem_unit / (1024 * 1024 * 1024); max_clients *= NFS4_CLIENTS_PER_GB; nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB); atomic_set(&nn->nfsd_courtesy_clients, 0); } enum rp_lock { RP_UNLOCKED, RP_LOCKED, RP_UNHASHED, }; static void init_nfs4_replay(struct nfs4_replay *rp) { rp->rp_status = nfserr_serverfault; rp->rp_buflen = 0; rp->rp_buf = rp->rp_ibuf; atomic_set(&rp->rp_locked, RP_UNLOCKED); } static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so) { if (!nfsd4_has_session(cstate)) { wait_var_event(&so->so_replay.rp_locked, atomic_cmpxchg(&so->so_replay.rp_locked, RP_UNLOCKED, RP_LOCKED) != RP_LOCKED); if (atomic_read(&so->so_replay.rp_locked) == RP_UNHASHED) return -EAGAIN; cstate->replay_owner = nfs4_get_stateowner(so); } return 0; } void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate) { struct nfs4_stateowner *so = cstate->replay_owner; if (so != NULL) { cstate->replay_owner = NULL; atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED); smp_mb__after_atomic(); wake_up_var(&so->so_replay.rp_locked); nfs4_put_stateowner(so); } } static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp) { struct nfs4_stateowner *sop; sop = kmem_cache_alloc(slab, GFP_KERNEL); if (!sop) return NULL; xdr_netobj_dup(&sop->so_owner, owner, GFP_KERNEL); if (!sop->so_owner.data) { kmem_cache_free(slab, sop); return NULL; } INIT_LIST_HEAD(&sop->so_stateids); sop->so_client = clp; init_nfs4_replay(&sop->so_replay); atomic_set(&sop->so_count, 1); return sop; } static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) { lockdep_assert_held(&clp->cl_lock); list_add(&oo->oo_owner.so_strhash, &clp->cl_ownerstr_hashtbl[strhashval]); list_add(&oo->oo_perclient, &clp->cl_openowners); } static void nfs4_unhash_openowner(struct nfs4_stateowner *so) { unhash_openowner_locked(openowner(so)); } static void nfs4_free_openowner(struct nfs4_stateowner *so) { struct nfs4_openowner *oo = openowner(so); kmem_cache_free(openowner_slab, oo); } static const struct nfs4_stateowner_operations openowner_ops = { .so_unhash = nfs4_unhash_openowner, .so_free = nfs4_free_openowner, }; static struct nfs4_ol_stateid * nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_ol_stateid *local, *ret = NULL; struct nfs4_openowner *oo = open->op_openowner; lockdep_assert_held(&fp->fi_lock); list_for_each_entry(local, &fp->fi_stateids, st_perfile) { /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; if (local->st_stateowner != &oo->oo_owner) continue; if (local->st_stid.sc_type == SC_TYPE_OPEN && !local->st_stid.sc_status) { ret = local; refcount_inc(&ret->st_stid.sc_count); break; } } return ret; } static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) __releases(&s->sc_client->cl_lock) { struct nfs4_client *cl = s->sc_client; LIST_HEAD(reaplist); struct nfs4_ol_stateid *stp; struct nfs4_delegation *dp; bool unhashed; switch (s->sc_type) { case SC_TYPE_OPEN: stp = openlockstateid(s); if (unhash_open_stateid(stp, &reaplist)) put_ol_stateid_locked(stp, &reaplist); spin_unlock(&cl->cl_lock); free_ol_stateid_reaplist(&reaplist); break; case SC_TYPE_LOCK: stp = openlockstateid(s); unhashed = unhash_lock_stateid(stp); spin_unlock(&cl->cl_lock); if (unhashed) nfs4_put_stid(s); break; case SC_TYPE_DELEG: dp = delegstateid(s); list_del_init(&dp->dl_recall_lru); spin_unlock(&cl->cl_lock); nfs4_put_stid(s); break; default: spin_unlock(&cl->cl_lock); } } static void nfsd40_drop_revoked_stid(struct nfs4_client *cl, stateid_t *stid) { /* NFSv4.0 has no way for the client to tell the server * that it can forget an admin-revoked stateid. * So we keep it around until the first time that the * client uses it, and drop it the first time * nfserr_admin_revoked is returned. * For v4.1 and later we wait until explicitly told * to free the stateid. */ if (cl->cl_minorversion == 0) { struct nfs4_stid *st; spin_lock(&cl->cl_lock); st = find_stateid_locked(cl, stid); if (st) nfsd4_drop_revoked_stid(st); else spin_unlock(&cl->cl_lock); } } static __be32 nfsd4_verify_open_stid(struct nfs4_stid *s) { __be32 ret = nfs_ok; if (s->sc_status & SC_STATUS_ADMIN_REVOKED) ret = nfserr_admin_revoked; else if (s->sc_status & SC_STATUS_REVOKED) ret = nfserr_deleg_revoked; else if (s->sc_status & SC_STATUS_CLOSED) ret = nfserr_bad_stateid; return ret; } /* Lock the stateid st_mutex, and deal with races with CLOSE */ static __be32 nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp) { __be32 ret; mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); ret = nfsd4_verify_open_stid(&stp->st_stid); if (ret == nfserr_admin_revoked) nfsd40_drop_revoked_stid(stp->st_stid.sc_client, &stp->st_stid.sc_stateid); if (ret != nfs_ok) mutex_unlock(&stp->st_mutex); return ret; } static struct nfs4_ol_stateid * nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_ol_stateid *stp; for (;;) { spin_lock(&fp->fi_lock); stp = nfsd4_find_existing_open(fp, open); spin_unlock(&fp->fi_lock); if (!stp || nfsd4_lock_ol_stateid(stp) == nfs_ok) break; nfs4_put_stid(&stp->st_stid); } return stp; } static struct nfs4_openowner * find_or_alloc_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, struct nfsd4_compound_state *cstate) { struct nfs4_client *clp = cstate->clp; struct nfs4_openowner *oo, *new = NULL; retry: spin_lock(&clp->cl_lock); oo = find_openstateowner_str(strhashval, open, clp); if (!oo && new) { hash_openowner(new, clp, strhashval); spin_unlock(&clp->cl_lock); return new; } spin_unlock(&clp->cl_lock); if (oo && !(oo->oo_flags & NFS4_OO_CONFIRMED)) { /* Replace unconfirmed owners without checking for replay. */ release_openowner(oo); oo = NULL; } if (oo) { if (new) nfs4_free_stateowner(&new->oo_owner); return oo; } new = alloc_stateowner(openowner_slab, &open->op_owner, clp); if (!new) return NULL; new->oo_owner.so_ops = &openowner_ops; new->oo_owner.so_is_open_owner = 1; new->oo_owner.so_seqid = open->op_seqid; new->oo_flags = 0; if (nfsd4_has_session(cstate)) new->oo_flags |= NFS4_OO_CONFIRMED; new->oo_time = 0; new->oo_last_closed_stid = NULL; INIT_LIST_HEAD(&new->oo_close_lru); goto retry; } static struct nfs4_ol_stateid * init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; struct nfs4_ol_stateid *retstp = NULL; struct nfs4_ol_stateid *stp; stp = open->op_stp; /* We are moving these outside of the spinlocks to avoid the warnings */ mutex_init(&stp->st_mutex); mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); retry: spin_lock(&oo->oo_owner.so_client->cl_lock); spin_lock(&fp->fi_lock); if (nfs4_openowner_unhashed(oo)) { mutex_unlock(&stp->st_mutex); stp = NULL; goto out_unlock; } retstp = nfsd4_find_existing_open(fp, open); if (retstp) goto out_unlock; open->op_stp = NULL; refcount_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = SC_TYPE_OPEN; INIT_LIST_HEAD(&stp->st_locks); stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; stp->st_openstp = NULL; list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&oo->oo_owner.so_client->cl_lock); if (retstp) { /* Handle races with CLOSE */ if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { nfs4_put_stid(&retstp->st_stid); goto retry; } /* To keep mutex tracking happy */ mutex_unlock(&stp->st_mutex); stp = retstp; } return stp; } /* * In the 4.0 case we need to keep the owners around a little while to handle * CLOSE replay. We still do need to release any file access that is held by * them before returning however. */ static void move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) { struct nfs4_ol_stateid *last; struct nfs4_openowner *oo = openowner(s->st_stateowner); struct nfsd_net *nn = net_generic(s->st_stid.sc_client->net, nfsd_net_id); dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); /* * We know that we hold one reference via nfsd4_close, and another * "persistent" reference for the client. If the refcount is higher * than 2, then there are still calls in progress that are using this * stateid. We can't put the sc_file reference until they are finished. * Wait for the refcount to drop to 2. Since it has been unhashed, * there should be no danger of the refcount going back up again at * this point. * Some threads with a reference might be waiting for rp_locked, * so tell them to stop waiting. */ atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED); smp_mb__after_atomic(); wake_up_var(&oo->oo_owner.so_replay.rp_locked); wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2); release_all_access(s); if (s->st_stid.sc_file) { put_nfs4_file(s->st_stid.sc_file); s->st_stid.sc_file = NULL; } spin_lock(&nn->client_lock); last = oo->oo_last_closed_stid; oo->oo_last_closed_stid = s; list_move_tail(&oo->oo_close_lru, &nn->close_lru); oo->oo_time = ktime_get_boottime_seconds(); spin_unlock(&nn->client_lock); if (last) nfs4_put_stid(&last->st_stid); } static noinline_for_stack struct nfs4_file * nfsd4_file_hash_lookup(const struct svc_fh *fhp) { struct inode *inode = d_inode(fhp->fh_dentry); struct rhlist_head *tmp, *list; struct nfs4_file *fi; rcu_read_lock(); list = rhltable_lookup(&nfs4_file_rhltable, &inode, nfs4_file_rhash_params); rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { if (refcount_inc_not_zero(&fi->fi_ref)) { rcu_read_unlock(); return fi; } } } rcu_read_unlock(); return NULL; } /* * On hash insertion, identify entries with the same inode but * distinct filehandles. They will all be on the list returned * by rhltable_lookup(). * * inode->i_lock prevents racing insertions from adding an entry * for the same inode/fhp pair twice. */ static noinline_for_stack struct nfs4_file * nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) { struct inode *inode = d_inode(fhp->fh_dentry); struct rhlist_head *tmp, *list; struct nfs4_file *ret = NULL; bool alias_found = false; struct nfs4_file *fi; int err; rcu_read_lock(); spin_lock(&inode->i_lock); list = rhltable_lookup(&nfs4_file_rhltable, &inode, nfs4_file_rhash_params); rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { if (refcount_inc_not_zero(&fi->fi_ref)) ret = fi; } else fi->fi_aliased = alias_found = true; } if (ret) goto out_unlock; nfsd4_file_init(fhp, new); err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist, nfs4_file_rhash_params); if (err) goto out_unlock; new->fi_aliased = alias_found; ret = new; out_unlock: spin_unlock(&inode->i_lock); rcu_read_unlock(); return ret; } static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) { rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist, nfs4_file_rhash_params); } /* * Called to check deny when READ with all zero stateid or * WRITE with all zero or all one stateid */ static __be32 nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) { struct nfs4_file *fp; __be32 ret = nfs_ok; fp = nfsd4_file_hash_lookup(current_fh); if (!fp) return ret; /* Check for conflicting share reservations */ spin_lock(&fp->fi_lock); if (fp->fi_share_deny & deny_type) ret = nfserr_locked; spin_unlock(&fp->fi_lock); put_nfs4_file(fp); return ret; } static bool nfsd4_deleg_present(const struct inode *inode) { struct file_lock_context *ctx = locks_inode_context(inode); return ctx && !list_empty_careful(&ctx->flc_lease); } /** * nfsd_wait_for_delegreturn - wait for delegations to be returned * @rqstp: the RPC transaction being executed * @inode: in-core inode of the file being waited for * * The timeout prevents deadlock if all nfsd threads happen to be * tied up waiting for returning delegations. * * Return values: * %true: delegation was returned * %false: timed out waiting for delegreturn */ bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode) { long __maybe_unused timeo; timeo = wait_var_event_timeout(inode, !nfsd4_deleg_present(inode), NFSD_DELEGRETURN_TIMEOUT); trace_nfsd_delegret_wakeup(rqstp, inode, timeo); return timeo > 0; } static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) { struct nfs4_delegation *dp = cb_to_delegation(cb); struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net, nfsd_net_id); block_delegations(&dp->dl_stid.sc_file->fi_fhandle); /* * We can't do this in nfsd_break_deleg_cb because it is * already holding inode->i_lock. * * If the dl_time != 0, then we know that it has already been * queued for a lease break. Don't queue it again. */ spin_lock(&state_lock); if (delegation_hashed(dp) && dp->dl_time == 0) { dp->dl_time = ktime_get_boottime_seconds(); list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); } spin_unlock(&state_lock); } static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, struct rpc_task *task) { struct nfs4_delegation *dp = cb_to_delegation(cb); trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task); if (dp->dl_stid.sc_status) /* CLOSED or REVOKED */ return 1; switch (task->tk_status) { case 0: return 1; case -NFS4ERR_DELAY: rpc_delay(task, 2 * HZ); return 0; case -EBADHANDLE: case -NFS4ERR_BAD_STATEID: /* * Race: client probably got cb_recall before open reply * granting delegation. */ if (dp->dl_retries--) { rpc_delay(task, 2 * HZ); return 0; } fallthrough; default: return 1; } } static void nfsd4_cb_recall_release(struct nfsd4_callback *cb) { struct nfs4_delegation *dp = cb_to_delegation(cb); nfs4_put_stid(&dp->dl_stid); } static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = { .prepare = nfsd4_cb_recall_prepare, .done = nfsd4_cb_recall_done, .release = nfsd4_cb_recall_release, .opcode = OP_CB_RECALL, }; static void nfsd_break_one_deleg(struct nfs4_delegation *dp) { /* * We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease * callback (and since the lease code is serialized by the * flc_lock) we know the server hasn't removed the lease yet, and * we know it's safe to take a reference. */ refcount_inc(&dp->dl_stid.sc_count); WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall)); } /* Called from break_lease() with flc_lock held. */ static bool nfsd_break_deleg_cb(struct file_lease *fl) { struct nfs4_delegation *dp = (struct nfs4_delegation *) fl->c.flc_owner; struct nfs4_file *fp = dp->dl_stid.sc_file; struct nfs4_client *clp = dp->dl_stid.sc_client; struct nfsd_net *nn; trace_nfsd_cb_recall(&dp->dl_stid); dp->dl_recalled = true; atomic_inc(&clp->cl_delegs_in_recall); if (try_to_expire_client(clp)) { nn = net_generic(clp->net, nfsd_net_id); mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); } /* * We don't want the locks code to timeout the lease for us; * we'll remove it ourself if a delegation isn't returned * in time: */ fl->fl_break_time = 0; fp->fi_had_conflict = true; nfsd_break_one_deleg(dp); return false; } /** * nfsd_breaker_owns_lease - Check if lease conflict was resolved * @fl: Lock state to check * * Return values: * %true: Lease conflict was resolved * %false: Lease conflict was not resolved. */ static bool nfsd_breaker_owns_lease(struct file_lease *fl) { struct nfs4_delegation *dl = fl->c.flc_owner; struct svc_rqst *rqst; struct nfs4_client *clp; rqst = nfsd_current_rqst(); if (!nfsd_v4client(rqst)) return false; clp = *(rqst->rq_lease_breaker); return dl->dl_stid.sc_client == clp; } static int nfsd_change_deleg_cb(struct file_lease *onlist, int arg, struct list_head *dispose) { struct nfs4_delegation *dp = (struct nfs4_delegation *) onlist->c.flc_owner; struct nfs4_client *clp = dp->dl_stid.sc_client; if (arg & F_UNLCK) { if (dp->dl_recalled) atomic_dec(&clp->cl_delegs_in_recall); return lease_modify(onlist, arg, dispose); } else return -EAGAIN; } static const struct lease_manager_operations nfsd_lease_mng_ops = { .lm_breaker_owns_lease = nfsd_breaker_owns_lease, .lm_break = nfsd_break_deleg_cb, .lm_change = nfsd_change_deleg_cb, }; static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid) { if (nfsd4_has_session(cstate)) return nfs_ok; if (seqid == so->so_seqid - 1) return nfserr_replay_me; if (seqid == so->so_seqid) return nfs_ok; return nfserr_bad_seqid; } static struct nfs4_client *lookup_clientid(clientid_t *clid, bool sessions, struct nfsd_net *nn) { struct nfs4_client *found; spin_lock(&nn->client_lock); found = find_confirmed_client(clid, sessions, nn); if (found) atomic_inc(&found->cl_rpc_users); spin_unlock(&nn->client_lock); return found; } static __be32 set_client(clientid_t *clid, struct nfsd4_compound_state *cstate, struct nfsd_net *nn) { if (cstate->clp) { if (!same_clid(&cstate->clp->cl_clientid, clid)) return nfserr_stale_clientid; return nfs_ok; } if (STALE_CLIENTID(clid, nn)) return nfserr_stale_clientid; /* * We're in the 4.0 case (otherwise the SEQUENCE op would have * set cstate->clp), so session = false: */ cstate->clp = lookup_clientid(clid, false, nn); if (!cstate->clp) return nfserr_expired; return nfs_ok; } __be32 nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct nfsd_net *nn) { clientid_t *clientid = &open->op_clientid; struct nfs4_client *clp = NULL; unsigned int strhashval; struct nfs4_openowner *oo = NULL; __be32 status; /* * In case we need it later, after we've already created the * file and don't want to risk a further failure: */ open->op_file = nfsd4_alloc_file(); if (open->op_file == NULL) return nfserr_jukebox; status = set_client(clientid, cstate, nn); if (status) return status; clp = cstate->clp; strhashval = ownerstr_hashval(&open->op_owner); retry: oo = find_or_alloc_open_stateowner(strhashval, open, cstate); open->op_openowner = oo; if (!oo) return nfserr_jukebox; if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) { nfs4_put_stateowner(&oo->oo_owner); goto retry; } status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); if (status) return status; open->op_stp = nfs4_alloc_open_stateid(clp); if (!open->op_stp) return nfserr_jukebox; if (nfsd4_has_session(cstate) && (cstate->current_fh.fh_export->ex_flags & NFSEXP_PNFS)) { open->op_odstate = alloc_clnt_odstate(clp); if (!open->op_odstate) return nfserr_jukebox; } return nfs_ok; } static inline __be32 nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) { if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ)) return nfserr_openmode; else return nfs_ok; } static int share_access_to_flags(u32 share_access) { return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) { struct nfs4_stid *ret; ret = find_stateid_by_type(cl, s, SC_TYPE_DELEG, SC_STATUS_REVOKED); if (!ret) return NULL; return delegstateid(ret); } static bool nfsd4_is_deleg_cur(struct nfsd4_open *open) { return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR || open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH; } static __be32 nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, struct nfs4_delegation **dp) { int flags; __be32 status = nfserr_bad_stateid; struct nfs4_delegation *deleg; deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; if (deleg->dl_stid.sc_status & SC_STATUS_ADMIN_REVOKED) { nfs4_put_stid(&deleg->dl_stid); status = nfserr_admin_revoked; goto out; } if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) { nfs4_put_stid(&deleg->dl_stid); nfsd40_drop_revoked_stid(cl, &open->op_delegate_stateid); status = nfserr_deleg_revoked; goto out; } flags = share_access_to_flags(open->op_share_access); status = nfs4_check_delegmode(deleg, flags); if (status) { nfs4_put_stid(&deleg->dl_stid); goto out; } *dp = deleg; out: if (!nfsd4_is_deleg_cur(open)) return nfs_ok; if (status) return status; open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; return nfs_ok; } static inline int nfs4_access_to_access(u32 nfs4_access) { int flags = 0; if (nfs4_access & NFS4_SHARE_ACCESS_READ) flags |= NFSD_MAY_READ; if (nfs4_access & NFS4_SHARE_ACCESS_WRITE) flags |= NFSD_MAY_WRITE; return flags; } static inline __be32 nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, struct nfsd4_open *open) { struct iattr iattr = { .ia_valid = ATTR_SIZE, .ia_size = 0, }; struct nfsd_attrs attrs = { .na_iattr = &iattr, }; if (!open->op_truncate) return 0; if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) return nfserr_inval; return nfsd_setattr(rqstp, fh, &attrs, NULL); } static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open, bool new_stp) { struct nfsd_file *nf = NULL; __be32 status; int oflag = nfs4_access_to_omode(open->op_share_access); int access = nfs4_access_to_access(open->op_share_access); unsigned char old_access_bmap, old_deny_bmap; spin_lock(&fp->fi_lock); /* * Are we trying to set a deny mode that would conflict with * current access? */ status = nfs4_file_check_deny(fp, open->op_share_deny); if (status != nfs_ok) { if (status != nfserr_share_denied) { spin_unlock(&fp->fi_lock); goto out; } if (nfs4_resolve_deny_conflicts_locked(fp, new_stp, stp, open->op_share_deny, false)) status = nfserr_jukebox; spin_unlock(&fp->fi_lock); goto out; } /* set access to the file */ status = nfs4_file_get_access(fp, open->op_share_access); if (status != nfs_ok) { if (status != nfserr_share_denied) { spin_unlock(&fp->fi_lock); goto out; } if (nfs4_resolve_deny_conflicts_locked(fp, new_stp, stp, open->op_share_access, true)) status = nfserr_jukebox; spin_unlock(&fp->fi_lock); goto out; } /* Set access bits in stateid */ old_access_bmap = stp->st_access_bmap; set_access(open->op_share_access, stp); /* Set new deny mask */ old_deny_bmap = stp->st_deny_bmap; set_deny(open->op_share_deny, stp); fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH); if (!fp->fi_fds[oflag]) { spin_unlock(&fp->fi_lock); status = nfsd_file_acquire_opened(rqstp, cur_fh, access, open->op_filp, &nf); if (status != nfs_ok) goto out_put_access; spin_lock(&fp->fi_lock); if (!fp->fi_fds[oflag]) { fp->fi_fds[oflag] = nf; nf = NULL; } } spin_unlock(&fp->fi_lock); if (nf) nfsd_file_put(nf); status = nfserrno(nfsd_open_break_lease(cur_fh->fh_dentry->d_inode, access)); if (status) goto out_put_access; status = nfsd4_truncate(rqstp, cur_fh, open); if (status) goto out_put_access; out: return status; out_put_access: stp->st_access_bmap = old_access_bmap; nfs4_file_put_access(fp, open->op_share_access); reset_union_bmap_deny(bmap_to_share_mode(old_deny_bmap), stp); goto out; } static __be32 nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { __be32 status; unsigned char old_deny_bmap = stp->st_deny_bmap; if (!test_access(open->op_share_access, stp)) return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open, false); /* test and set deny mode */ spin_lock(&fp->fi_lock); status = nfs4_file_check_deny(fp, open->op_share_deny); switch (status) { case nfs_ok: set_deny(open->op_share_deny, stp); fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH); break; case nfserr_share_denied: if (nfs4_resolve_deny_conflicts_locked(fp, false, stp, open->op_share_deny, false)) status = nfserr_jukebox; break; } spin_unlock(&fp->fi_lock); if (status != nfs_ok) return status; status = nfsd4_truncate(rqstp, cur_fh, open); if (status != nfs_ok) reset_union_bmap_deny(old_deny_bmap, stp); return status; } /* Should we give out recallable state?: */ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) { if (clp->cl_cb_state == NFSD4_CB_UP) return true; /* * In the sessions case, since we don't have to establish a * separate connection for callbacks, we assume it's OK * until we hear otherwise: */ return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; } static struct file_lease *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag) { struct file_lease *fl; fl = locks_alloc_lease(); if (!fl) return NULL; fl->fl_lmops = &nfsd_lease_mng_ops; fl->c.flc_flags = FL_DELEG; fl->c.flc_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; fl->c.flc_owner = (fl_owner_t)dp; fl->c.flc_pid = current->tgid; fl->c.flc_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; return fl; } static int nfsd4_check_conflicting_opens(struct nfs4_client *clp, struct nfs4_file *fp) { struct nfs4_ol_stateid *st; struct file *f = fp->fi_deleg_file->nf_file; struct inode *ino = file_inode(f); int writes; writes = atomic_read(&ino->i_writecount); if (!writes) return 0; /* * There could be multiple filehandles (hence multiple * nfs4_files) referencing this file, but that's not too * common; let's just give up in that case rather than * trying to go look up all the clients using that other * nfs4_file as well: */ if (fp->fi_aliased) return -EAGAIN; /* * If there's a close in progress, make sure that we see it * clear any fi_fds[] entries before we see it decrement * i_writecount: */ smp_mb__after_atomic(); if (fp->fi_fds[O_WRONLY]) writes--; if (fp->fi_fds[O_RDWR]) writes--; if (writes > 0) return -EAGAIN; /* There may be non-NFSv4 writers */ /* * It's possible there are non-NFSv4 write opens in progress, * but if they haven't incremented i_writecount yet then they * also haven't called break lease yet; so, they'll break this * lease soon enough. So, all that's left to check for is NFSv4 * opens: */ spin_lock(&fp->fi_lock); list_for_each_entry(st, &fp->fi_stateids, st_perfile) { if (st->st_openstp == NULL /* it's an open */ && access_permit_write(st) && st->st_stid.sc_client != clp) { spin_unlock(&fp->fi_lock); return -EAGAIN; } } spin_unlock(&fp->fi_lock); /* * There's a small chance that we could be racing with another * NFSv4 open. However, any open that hasn't added itself to * the fi_stateids list also hasn't called break_lease yet; so, * they'll break this lease soon enough. */ return 0; } /* * It's possible that between opening the dentry and setting the delegation, * that it has been renamed or unlinked. Redo the lookup to verify that this * hasn't happened. */ static int nfsd4_verify_deleg_dentry(struct nfsd4_open *open, struct nfs4_file *fp, struct svc_fh *parent) { struct svc_export *exp; struct dentry *child; __be32 err; err = nfsd_lookup_dentry(open->op_rqstp, parent, open->op_fname, open->op_fnamelen, &exp, &child); if (err) return -EAGAIN; exp_put(exp); dput(child); if (child != file_dentry(fp->fi_deleg_file->nf_file)) return -EAGAIN; return 0; } /* * We avoid breaking delegations held by a client due to its own activity, but * clearing setuid/setgid bits on a write is an implicit activity and the client * may not notice and continue using the old mode. Avoid giving out a delegation * on setuid/setgid files when the client is requesting an open for write. */ static int nfsd4_verify_setuid_write(struct nfsd4_open *open, struct nfsd_file *nf) { struct inode *inode = file_inode(nf->nf_file); if ((open->op_share_access & NFS4_SHARE_ACCESS_WRITE) && (inode->i_mode & (S_ISUID|S_ISGID))) return -EAGAIN; return 0; } static struct nfs4_delegation * nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *parent) { int status = 0; struct nfs4_client *clp = stp->st_stid.sc_client; struct nfs4_file *fp = stp->st_stid.sc_file; struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; struct nfs4_delegation *dp; struct nfsd_file *nf = NULL; struct file_lease *fl; u32 dl_type; /* * The fi_had_conflict and nfs_get_existing_delegation checks * here are just optimizations; we'll need to recheck them at * the end: */ if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); /* * Try for a write delegation first. RFC8881 section 10.4 says: * * "An OPEN_DELEGATE_WRITE delegation allows the client to handle, * on its own, all opens." * * Furthermore the client can use a write delegation for most READ * operations as well, so we require a O_RDWR file here. * * Offer a write delegation in the case of a BOTH open, and ensure * we get the O_RDWR descriptor. */ if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == NFS4_SHARE_ACCESS_BOTH) { nf = find_rw_file(fp); dl_type = NFS4_OPEN_DELEGATE_WRITE; } /* * If the file is being opened O_RDONLY or we couldn't get a O_RDWR * file for some reason, then try for a read delegation instead. */ if (!nf && (open->op_share_access & NFS4_SHARE_ACCESS_READ)) { nf = find_readable_file(fp); dl_type = NFS4_OPEN_DELEGATE_READ; } if (!nf) return ERR_PTR(-EAGAIN); spin_lock(&state_lock); spin_lock(&fp->fi_lock); if (nfs4_delegation_exists(clp, fp)) status = -EAGAIN; else if (nfsd4_verify_setuid_write(open, nf)) status = -EAGAIN; else if (!fp->fi_deleg_file) { fp->fi_deleg_file = nf; /* increment early to prevent fi_deleg_file from being * cleared */ fp->fi_delegees = 1; nf = NULL; } else fp->fi_delegees++; spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); if (nf) nfsd_file_put(nf); if (status) return ERR_PTR(status); status = -ENOMEM; dp = alloc_init_deleg(clp, fp, odstate, dl_type); if (!dp) goto out_delegees; fl = nfs4_alloc_init_lease(dp, dl_type); if (!fl) goto out_clnt_odstate; status = kernel_setlease(fp->fi_deleg_file->nf_file, fl->c.flc_type, &fl, NULL); if (fl) locks_free_lease(fl); if (status) goto out_clnt_odstate; if (parent) { status = nfsd4_verify_deleg_dentry(open, fp, parent); if (status) goto out_unlock; } status = nfsd4_check_conflicting_opens(clp, fp); if (status) goto out_unlock; /* * Now that the deleg is set, check again to ensure that nothing * raced in and changed the mode while we weren't looking. */ status = nfsd4_verify_setuid_write(open, fp->fi_deleg_file); if (status) goto out_unlock; status = -EAGAIN; if (fp->fi_had_conflict) goto out_unlock; spin_lock(&state_lock); spin_lock(&clp->cl_lock); spin_lock(&fp->fi_lock); status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&clp->cl_lock); spin_unlock(&state_lock); if (status) goto out_unlock; return dp; out_unlock: kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); out_clnt_odstate: put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_stid(&dp->dl_stid); out_delegees: put_deleg_file(fp); return ERR_PTR(status); } static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) { open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; if (status == -EAGAIN) open->op_why_no_deleg = WND4_CONTENTION; else { open->op_why_no_deleg = WND4_RESOURCE; switch (open->op_deleg_want) { case NFS4_SHARE_WANT_READ_DELEG: case NFS4_SHARE_WANT_WRITE_DELEG: case NFS4_SHARE_WANT_ANY_DELEG: break; case NFS4_SHARE_WANT_CANCEL: open->op_why_no_deleg = WND4_CANCELLED; break; case NFS4_SHARE_WANT_NO_DELEG: WARN_ON_ONCE(1); } } } static bool nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh, struct kstat *stat) { struct nfsd_file *nf = find_rw_file(dp->dl_stid.sc_file); struct path path; int rc; if (!nf) return false; path.mnt = currentfh->fh_export->ex_path.mnt; path.dentry = file_dentry(nf->nf_file); rc = vfs_getattr(&path, stat, (STATX_MODE | STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE), AT_STATX_SYNC_AS_STAT); nfsd_file_put(nf); return rc == 0; } /* * The Linux NFS server does not offer write delegations to NFSv4.0 * clients in order to avoid conflicts between write delegations and * GETATTRs requesting CHANGE or SIZE attributes. * * With NFSv4.1 and later minorversions, the SEQUENCE operation that * begins each COMPOUND contains a client ID. Delegation recall can * be avoided when the server recognizes the client sending a * GETATTR also holds write delegation it conflicts with. * * However, the NFSv4.0 protocol does not enable a server to * determine that a GETATTR originated from the client holding the * conflicting delegation versus coming from some other client. Per * RFC 7530 Section 16.7.5, the server must recall or send a * CB_GETATTR even when the GETATTR originates from the client that * holds the conflicting delegation. * * An NFSv4.0 client can trigger a pathological situation if it * always sends a DELEGRETURN preceded by a conflicting GETATTR in * the same COMPOUND. COMPOUND execution will always stop at the * GETATTR and the DELEGRETURN will never get executed. The server * eventually revokes the delegation, which can result in loss of * open or lock state. */ static void nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *currentfh) { struct nfs4_delegation *dp; struct nfs4_openowner *oo = openowner(stp->st_stateowner); struct nfs4_client *clp = stp->st_stid.sc_client; struct svc_fh *parent = NULL; int cb_up; int status = 0; struct kstat stat; cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); open->op_recall = false; switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_PREVIOUS: if (!cb_up) open->op_recall = true; break; case NFS4_OPEN_CLAIM_NULL: parent = currentfh; fallthrough; case NFS4_OPEN_CLAIM_FH: /* * Let's not give out any delegations till everyone's * had the chance to reclaim theirs, *and* until * NLM locks have all been reclaimed: */ if (locks_in_grace(clp->net)) goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out_no_deleg; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE && !clp->cl_minorversion) goto out_no_deleg; break; default: goto out_no_deleg; } dp = nfs4_set_delegation(open, stp, parent); if (IS_ERR(dp)) goto out_no_deleg; memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { if (!nfs4_delegation_stat(dp, currentfh, &stat)) { nfs4_put_stid(&dp->dl_stid); destroy_delegation(dp); goto out_no_deleg; } open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE; dp->dl_cb_fattr.ncf_cur_fsize = stat.size; dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat); trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); } else { open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); } nfs4_put_stid(&dp->dl_stid); return; out_no_deleg: open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) { dprintk("NFSD: WARNING: refusing delegation reclaim\n"); open->op_recall = true; } /* 4.1 client asking for a delegation? */ if (open->op_deleg_want) nfsd4_open_deleg_none_ext(open, status); return; } static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, struct nfs4_delegation *dp) { if (open->op_deleg_want == NFS4_SHARE_WANT_READ_DELEG && dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) { open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE; } else if (open->op_deleg_want == NFS4_SHARE_WANT_WRITE_DELEG && dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) { open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE; } /* Otherwise the client must be confused wanting a delegation * it already has, therefore we don't return * NFS4_OPEN_DELEGATE_NONE_EXT and reason. */ } /** * nfsd4_process_open2 - finish open processing * @rqstp: the RPC transaction being executed * @current_fh: NFSv4 COMPOUND's current filehandle * @open: OPEN arguments * * If successful, (1) truncate the file if open->op_truncate was * set, (2) set open->op_stateid, (3) set open->op_delegation. * * Returns %nfs_ok on success; otherwise an nfs4stat value in * network byte order is returned. */ __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; struct nfs4_file *fp = NULL; struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; bool new_stp = false; /* * Lookup file; if found, lookup stateid and check open request, * and check for delegations in the process of being recalled. * If not found, create the nfs4_file struct */ fp = nfsd4_file_hash_insert(open->op_file, current_fh); if (unlikely(!fp)) return nfserr_jukebox; if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; stp = nfsd4_find_and_lock_existing_open(fp, open); } else { open->op_file = NULL; status = nfserr_bad_stateid; if (nfsd4_is_deleg_cur(open)) goto out; } if (!stp) { stp = init_open_stateid(fp, open); if (!stp) { status = nfserr_jukebox; goto out; } if (!open->op_stp) new_stp = true; } /* * OPEN the file, or upgrade an existing OPEN. * If truncate fails, the OPEN fails. * * stp is already locked. */ if (!new_stp) { /* Stateid was found, this is an OPEN upgrade */ status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) { mutex_unlock(&stp->st_mutex); goto out; } } else { status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true); if (status) { release_open_stateid(stp); mutex_unlock(&stp->st_mutex); goto out; } stp->st_clnt_odstate = find_or_hash_clnt_odstate(fp, open->op_odstate); if (stp->st_clnt_odstate == open->op_odstate) open->op_odstate = NULL; } nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); mutex_unlock(&stp->st_mutex); if (nfsd4_has_session(&resp->cstate)) { if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_WANTED; goto nodeleg; } } /* * Attempt to hand out a delegation. No error return, because the * OPEN succeeds even if we fail. */ nfs4_open_delegation(open, stp, &resp->cstate.current_fh); nodeleg: status = nfs_ok; trace_nfsd_open(&stp->st_stid.sc_stateid); out: /* 4.1 client trying to upgrade/downgrade delegation? */ if (open->op_delegate_type == NFS4_OPEN_DELEGATE_NONE && dp && open->op_deleg_want) nfsd4_deleg_xgrade_none_ext(open, dp); if (fp) put_nfs4_file(fp); if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; /* * To finish the open response, we just need to set the rflags. */ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; if (nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK; else if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; if (dp) nfs4_put_stid(&dp->dl_stid); if (stp) nfs4_put_stid(&stp->st_stid); return status; } void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { if (open->op_openowner) nfs4_put_stateowner(&open->op_openowner->oo_owner); if (open->op_file) kmem_cache_free(file_slab, open->op_file); if (open->op_stp) nfs4_put_stid(&open->op_stp->st_stid); if (open->op_odstate) kmem_cache_free(odstate_slab, open->op_odstate); } __be32 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { clientid_t *clid = &u->renew; struct nfs4_client *clp; __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); trace_nfsd_clid_renew(clid); status = set_client(clid, cstate, nn); if (status) return status; clp = cstate->clp; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != NFSD4_CB_UP) return nfserr_cb_path_down; return nfs_ok; } void nfsd4_end_grace(struct nfsd_net *nn) { /* do nothing if grace period already ended */ if (nn->grace_ended) return; trace_nfsd_grace_complete(nn); nn->grace_ended = true; /* * If the server goes down again right now, an NFSv4 * client will still be allowed to reclaim after it comes back up, * even if it hasn't yet had a chance to reclaim state this time. * */ nfsd4_record_grace_done(nn); /* * At this point, NFSv4 clients can still reclaim. But if the * server crashes, any that have not yet reclaimed will be out * of luck on the next boot. * * (NFSv4.1+ clients are considered to have reclaimed once they * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to * have reclaimed after their first OPEN.) */ locks_end_grace(&nn->nfsd4_manager); /* * At this point, and once lockd and/or any other containers * exit their grace period, further reclaims will fail and * regular locking can resume. */ } /* * If we've waited a lease period but there are still clients trying to * reclaim, wait a little longer to give them a chance to finish. */ static bool clients_still_reclaiming(struct nfsd_net *nn) { time64_t double_grace_period_end = nn->boot_time + 2 * nn->nfsd4_lease; if (nn->track_reclaim_completes && atomic_read(&nn->nr_reclaim_complete) == nn->reclaim_str_hashtbl_size) return false; if (!nn->somebody_reclaimed) return false; nn->somebody_reclaimed = false; /* * If we've given them *two* lease times to reclaim, and they're * still not done, give up: */ if (ktime_get_boottime_seconds() > double_grace_period_end) return false; return true; } struct laundry_time { time64_t cutoff; time64_t new_timeo; }; static bool state_expired(struct laundry_time *lt, time64_t last_refresh) { time64_t time_remaining; if (last_refresh < lt->cutoff) return true; time_remaining = last_refresh - lt->cutoff; lt->new_timeo = min(lt->new_timeo, time_remaining); return false; } #ifdef CONFIG_NFSD_V4_2_INTER_SSC void nfsd4_ssc_init_umount_work(struct nfsd_net *nn) { spin_lock_init(&nn->nfsd_ssc_lock); INIT_LIST_HEAD(&nn->nfsd_ssc_mount_list); init_waitqueue_head(&nn->nfsd_ssc_waitq); } /* * This is called when nfsd is being shutdown, after all inter_ssc * cleanup were done, to destroy the ssc delayed unmount list. */ static void nfsd4_ssc_shutdown_umount(struct nfsd_net *nn) { struct nfsd4_ssc_umount_item *ni = NULL; struct nfsd4_ssc_umount_item *tmp; spin_lock(&nn->nfsd_ssc_lock); list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { list_del(&ni->nsui_list); spin_unlock(&nn->nfsd_ssc_lock); mntput(ni->nsui_vfsmount); kfree(ni); spin_lock(&nn->nfsd_ssc_lock); } spin_unlock(&nn->nfsd_ssc_lock); } static void nfsd4_ssc_expire_umount(struct nfsd_net *nn) { bool do_wakeup = false; struct nfsd4_ssc_umount_item *ni = NULL; struct nfsd4_ssc_umount_item *tmp; spin_lock(&nn->nfsd_ssc_lock); list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { if (time_after(jiffies, ni->nsui_expire)) { if (refcount_read(&ni->nsui_refcnt) > 1) continue; /* mark being unmount */ ni->nsui_busy = true; spin_unlock(&nn->nfsd_ssc_lock); mntput(ni->nsui_vfsmount); spin_lock(&nn->nfsd_ssc_lock); /* waiters need to start from begin of list */ list_del(&ni->nsui_list); kfree(ni); /* wakeup ssc_connect waiters */ do_wakeup = true; continue; } break; } if (do_wakeup) wake_up_all(&nn->nfsd_ssc_waitq); spin_unlock(&nn->nfsd_ssc_lock); } #endif /* Check if any lock belonging to this lockowner has any blockers */ static bool nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo) { struct file_lock_context *ctx; struct nfs4_ol_stateid *stp; struct nfs4_file *nf; list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { nf = stp->st_stid.sc_file; ctx = locks_inode_context(nf->fi_inode); if (!ctx) continue; if (locks_owner_has_blockers(ctx, lo)) return true; } return false; } static bool nfs4_anylock_blockers(struct nfs4_client *clp) { int i; struct nfs4_stateowner *so; struct nfs4_lockowner *lo; if (atomic_read(&clp->cl_delegs_in_recall)) return true; spin_lock(&clp->cl_lock); for (i = 0; i < OWNER_HASH_SIZE; i++) { list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[i], so_strhash) { if (so->so_is_open_owner) continue; lo = lockowner(so); if (nfs4_lockowner_has_blockers(lo)) { spin_unlock(&clp->cl_lock); return true; } } } spin_unlock(&clp->cl_lock); return false; } static void nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist, struct laundry_time *lt) { unsigned int maxreap, reapcnt = 0; struct list_head *pos, *next; struct nfs4_client *clp; maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ? NFSD_CLIENT_MAX_TRIM_PER_RUN : 0; INIT_LIST_HEAD(reaplist); spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (clp->cl_state == NFSD4_EXPIRABLE) goto exp_client; if (!state_expired(lt, clp->cl_time)) break; if (!atomic_read(&clp->cl_rpc_users)) { if (clp->cl_state == NFSD4_ACTIVE) atomic_inc(&nn->nfsd_courtesy_clients); clp->cl_state = NFSD4_COURTESY; } if (!client_has_state(clp)) goto exp_client; if (!nfs4_anylock_blockers(clp)) if (reapcnt >= maxreap) continue; exp_client: if (!mark_client_expired_locked(clp)) { list_add(&clp->cl_lru, reaplist); reapcnt++; } } spin_unlock(&nn->client_lock); } static void nfs4_get_courtesy_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist) { unsigned int maxreap = 0, reapcnt = 0; struct list_head *pos, *next; struct nfs4_client *clp; maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN; INIT_LIST_HEAD(reaplist); spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (clp->cl_state == NFSD4_ACTIVE) break; if (reapcnt >= maxreap) break; if (!mark_client_expired_locked(clp)) { list_add(&clp->cl_lru, reaplist); reapcnt++; } } spin_unlock(&nn->client_lock); } static void nfs4_process_client_reaplist(struct list_head *reaplist) { struct list_head *pos, *next; struct nfs4_client *clp; list_for_each_safe(pos, next, reaplist) { clp = list_entry(pos, struct nfs4_client, cl_lru); trace_nfsd_clid_purged(&clp->cl_clientid); list_del_init(&clp->cl_lru); expire_client(clp); } } static void nfs40_clean_admin_revoked(struct nfsd_net *nn, struct laundry_time *lt) { struct nfs4_client *clp; spin_lock(&nn->client_lock); if (nn->nfs40_last_revoke == 0 || nn->nfs40_last_revoke > lt->cutoff) { spin_unlock(&nn->client_lock); return; } nn->nfs40_last_revoke = 0; retry: list_for_each_entry(clp, &nn->client_lru, cl_lru) { unsigned long id, tmp; struct nfs4_stid *stid; if (atomic_read(&clp->cl_admin_revoked) == 0) continue; spin_lock(&clp->cl_lock); idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { refcount_inc(&stid->sc_count); spin_unlock(&nn->client_lock); /* this function drops ->cl_lock */ nfsd4_drop_revoked_stid(stid); nfs4_put_stid(stid); spin_lock(&nn->client_lock); goto retry; } spin_unlock(&clp->cl_lock); } spin_unlock(&nn->client_lock); } static time64_t nfs4_laundromat(struct nfsd_net *nn) { struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct nfs4_ol_stateid *stp; struct nfsd4_blocked_lock *nbl; struct list_head *pos, *next, reaplist; struct laundry_time lt = { .cutoff = ktime_get_boottime_seconds() - nn->nfsd4_lease, .new_timeo = nn->nfsd4_lease }; struct nfs4_cpntf_state *cps; copy_stateid_t *cps_t; int i; if (clients_still_reclaiming(nn)) { lt.new_timeo = 0; goto out; } nfsd4_end_grace(nn); spin_lock(&nn->s2s_cp_lock); idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) { cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid); if (cps->cp_stateid.cs_type == NFS4_COPYNOTIFY_STID && state_expired(&lt, cps->cpntf_time)) _free_cpntf_state_locked(nn, cps); } spin_unlock(&nn->s2s_cp_lock); nfsd4_async_copy_reaper(nn); nfs4_get_client_reaplist(nn, &reaplist, &lt); nfs4_process_client_reaplist(&reaplist); nfs40_clean_admin_revoked(nn, &lt); spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(&lt, dp->dl_time)) break; refcount_inc(&dp->dl_stid.sc_count); unhash_delegation_locked(dp, SC_STATUS_REVOKED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); while (!list_empty(&reaplist)) { dp = list_first_entry(&reaplist, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); revoke_delegation(dp); } spin_lock(&nn->client_lock); while (!list_empty(&nn->close_lru)) { oo = list_first_entry(&nn->close_lru, struct nfs4_openowner, oo_close_lru); if (!state_expired(&lt, oo->oo_time)) break; list_del_init(&oo->oo_close_lru); stp = oo->oo_last_closed_stid; oo->oo_last_closed_stid = NULL; spin_unlock(&nn->client_lock); nfs4_put_stid(&stp->st_stid); spin_lock(&nn->client_lock); } spin_unlock(&nn->client_lock); /* * It's possible for a client to try and acquire an already held lock * that is being held for a long time, and then lose interest in it. * So, we clean out any un-revisited request after a lease period * under the assumption that the client is no longer interested. * * RFC5661, sec. 9.6 states that the client must not rely on getting * notifications and must continue to poll for locks, even when the * server supports them. Thus this shouldn't lead to clients blocking * indefinitely once the lock does become free. */ BUG_ON(!list_empty(&reaplist)); spin_lock(&nn->blocked_locks_lock); while (!list_empty(&nn->blocked_locks_lru)) { nbl = list_first_entry(&nn->blocked_locks_lru, struct nfsd4_blocked_lock, nbl_lru); if (!state_expired(&lt, nbl->nbl_time)) break; list_move(&nbl->nbl_lru, &reaplist); list_del_init(&nbl->nbl_list); } spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); free_blocked_lock(nbl); } #ifdef CONFIG_NFSD_V4_2_INTER_SSC /* service the server-to-server copy delayed unmount list */ nfsd4_ssc_expire_umount(nn); #endif if (atomic_long_read(&num_delegations) >= max_delegations) deleg_reaper(nn); out: return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); } static void laundromat_main(struct work_struct *); static void laundromat_main(struct work_struct *laundry) { time64_t t; struct delayed_work *dwork = to_delayed_work(laundry); struct nfsd_net *nn = container_of(dwork, struct nfsd_net, laundromat_work); t = nfs4_laundromat(nn); queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ); } static void courtesy_client_reaper(struct nfsd_net *nn) { struct list_head reaplist; nfs4_get_courtesy_client_reaplist(nn, &reaplist); nfs4_process_client_reaplist(&reaplist); } static void deleg_reaper(struct nfsd_net *nn) { struct list_head *pos, *next; struct nfs4_client *clp; LIST_HEAD(cblist); spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (clp->cl_state != NFSD4_ACTIVE || list_empty(&clp->cl_delegations) || atomic_read(&clp->cl_delegs_in_recall) || test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) || (ktime_get_boottime_seconds() - clp->cl_ra_time < 5)) { continue; } list_add(&clp->cl_ra_cblist, &cblist); /* release in nfsd4_cb_recall_any_release */ kref_get(&clp->cl_nfsdfs.cl_ref); set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); clp->cl_ra_time = ktime_get_boottime_seconds(); } spin_unlock(&nn->client_lock); while (!list_empty(&cblist)) { clp = list_first_entry(&cblist, struct nfs4_client, cl_ra_cblist); list_del_init(&clp->cl_ra_cblist); clp->cl_ra->ra_keep = 0; clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) | BIT(RCA4_TYPE_MASK_WDATA_DLG); trace_nfsd_cb_recall_any(clp->cl_ra); nfsd4_run_cb(&clp->cl_ra->ra_cb); } } static void nfsd4_state_shrinker_worker(struct work_struct *work) { struct nfsd_net *nn = container_of(work, struct nfsd_net, nfsd_shrinker_work); courtesy_client_reaper(nn); deleg_reaper(nn); } static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) { if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) return nfserr_bad_stateid; return nfs_ok; } static __be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags) { __be32 status = nfserr_openmode; /* For lock stateid's, we test the parent open, not the lock: */ if (stp->st_openstp) stp = stp->st_openstp; if ((flags & WR_STATE) && !access_permit_write(stp)) goto out; if ((flags & RD_STATE) && !access_permit_read(stp)) goto out; status = nfs_ok; out: return status; } static inline __be32 check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, int flags) { if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; else if (opens_in_grace(net)) { /* Answer in remaining cases depends on existence of * conflicting state; so we must wait out the grace period. */ return nfserr_grace; } else if (flags & WR_STATE) return nfs4_share_conflict(current_fh, NFS4_SHARE_DENY_WRITE); else /* (flags & RD_STATE) && ZERO_STATEID(stateid) */ return nfs4_share_conflict(current_fh, NFS4_SHARE_DENY_READ); } static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* * When sessions are used the stateid generation number is ignored * when it is zero. */ if (has_session && in->si_generation == 0) return nfs_ok; if (in->si_generation == ref->si_generation) return nfs_ok; /* If the client sends us a stateid from the future, it's buggy: */ if (nfsd4_stateid_generation_after(in, ref)) return nfserr_bad_stateid; /* * However, we could see a stateid from the past, even from a * non-buggy client. For example, if the client sends a lock * while some IO is outstanding, the lock may bump si_generation * while the IO is still in flight. The client could avoid that * situation by waiting for responses on all the IO requests, * but better performance may result in retrying IO that * receives an old_stateid error if requests are rarely * reordered in flight: */ return nfserr_old_stateid; } static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_stid *s, bool has_session) { __be32 ret; spin_lock(&s->sc_lock); ret = nfsd4_verify_open_stid(s); if (ret == nfs_ok) ret = check_stateid_generation(in, &s->sc_stateid, has_session); spin_unlock(&s->sc_lock); if (ret == nfserr_admin_revoked) nfsd40_drop_revoked_stid(s->sc_client, &s->sc_stateid); return ret; } static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols) { if (ols->st_stateowner->so_is_open_owner && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) return nfserr_bad_stateid; return nfs_ok; } static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { struct nfs4_stid *s; __be32 status = nfserr_bad_stateid; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return status; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); if (!s) goto out_unlock; status = nfsd4_stid_check_stateid_generation(stateid, s, 1); if (status) goto out_unlock; status = nfsd4_verify_open_stid(s); if (status) goto out_unlock; switch (s->sc_type) { case SC_TYPE_DELEG: status = nfs_ok; break; case SC_TYPE_OPEN: case SC_TYPE_LOCK: status = nfsd4_check_openowner_confirmed(openlockstateid(s)); break; default: printk("unknown stateid type %x\n", s->sc_type); status = nfserr_bad_stateid; } out_unlock: spin_unlock(&cl->cl_lock); if (status == nfserr_admin_revoked) nfsd40_drop_revoked_stid(cl, stateid); return status; } __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned short typemask, unsigned short statusmask, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; struct nfs4_stid *stid; bool return_revoked = false; /* * only return revoked delegations if explicitly asked. * otherwise we report revoked or bad_stateid status. */ if (statusmask & SC_STATUS_REVOKED) return_revoked = true; if (typemask & SC_TYPE_DELEG) /* Always allow REVOKED for DELEG so we can * retturn the appropriate error. */ statusmask |= SC_STATUS_REVOKED; statusmask |= SC_STATUS_ADMIN_REVOKED; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return nfserr_bad_stateid; status = set_client(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) { if (cstate->session) return nfserr_bad_stateid; return nfserr_stale_stateid; } if (status) return status; stid = find_stateid_by_type(cstate->clp, stateid, typemask, statusmask); if (!stid) return nfserr_bad_stateid; if ((stid->sc_status & SC_STATUS_REVOKED) && !return_revoked) { nfs4_put_stid(stid); return nfserr_deleg_revoked; } if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { nfsd40_drop_revoked_stid(cstate->clp, stateid); nfs4_put_stid(stid); return nfserr_admin_revoked; } *s = stid; return nfs_ok; } static struct nfsd_file * nfs4_find_file(struct nfs4_stid *s, int flags) { struct nfsd_file *ret = NULL; if (!s || s->sc_status) return NULL; switch (s->sc_type) { case SC_TYPE_DELEG: spin_lock(&s->sc_file->fi_lock); ret = nfsd_file_get(s->sc_file->fi_deleg_file); spin_unlock(&s->sc_file->fi_lock); break; case SC_TYPE_OPEN: case SC_TYPE_LOCK: if (flags & RD_STATE) ret = find_readable_file(s->sc_file); else ret = find_writeable_file(s->sc_file); } return ret; } static __be32 nfs4_check_olstateid(struct nfs4_ol_stateid *ols, int flags) { __be32 status; status = nfsd4_check_openowner_confirmed(ols); if (status) return status; return nfs4_check_openmode(ols, flags); } static __be32 nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, struct nfsd_file **nfp, int flags) { int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE; struct nfsd_file *nf; __be32 status; nf = nfs4_find_file(s, flags); if (nf) { status = nfsd_permission(&rqstp->rq_cred, fhp->fh_export, fhp->fh_dentry, acc | NFSD_MAY_OWNER_OVERRIDE); if (status) { nfsd_file_put(nf); goto out; } } else { status = nfsd_file_acquire(rqstp, fhp, acc, &nf); if (status) return status; } *nfp = nf; out: return status; } static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps) { WARN_ON_ONCE(cps->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID); if (!refcount_dec_and_test(&cps->cp_stateid.cs_count)) return; list_del(&cps->cp_list); idr_remove(&nn->s2s_cp_stateids, cps->cp_stateid.cs_stid.si_opaque.so_id); kfree(cps); } /* * A READ from an inter server to server COPY will have a * copy stateid. Look up the copy notify stateid from the * idr structure and take a reference on it. */ __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st, struct nfs4_client *clp, struct nfs4_cpntf_state **cps) { copy_stateid_t *cps_t; struct nfs4_cpntf_state *state = NULL; if (st->si_opaque.so_clid.cl_id != nn->s2s_cp_cl_id) return nfserr_bad_stateid; spin_lock(&nn->s2s_cp_lock); cps_t = idr_find(&nn->s2s_cp_stateids, st->si_opaque.so_id); if (cps_t) { state = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid); if (state->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID) { state = NULL; goto unlock; } if (!clp) refcount_inc(&state->cp_stateid.cs_count); else _free_cpntf_state_locked(nn, state); } unlock: spin_unlock(&nn->s2s_cp_lock); if (!state) return nfserr_bad_stateid; if (!clp) *cps = state; return 0; } static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st, struct nfs4_stid **stid) { __be32 status; struct nfs4_cpntf_state *cps = NULL; struct nfs4_client *found; status = manage_cpntf_state(nn, st, NULL, &cps); if (status) return status; cps->cpntf_time = ktime_get_boottime_seconds(); status = nfserr_expired; found = lookup_clientid(&cps->cp_p_clid, true, nn); if (!found) goto out; *stid = find_stateid_by_type(found, &cps->cp_p_stateid, SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, 0); if (*stid) status = nfs_ok; else status = nfserr_bad_stateid; put_client_renew(found); out: nfs4_put_cpntf_state(nn, cps); return status; } void nfs4_put_cpntf_state(struct nfsd_net *nn, struct nfs4_cpntf_state *cps) { spin_lock(&nn->s2s_cp_lock); _free_cpntf_state_locked(nn, cps); spin_unlock(&nn->s2s_cp_lock); } /** * nfs4_preprocess_stateid_op - find and prep stateid for an operation * @rqstp: incoming request from client * @cstate: current compound state * @fhp: filehandle associated with requested stateid * @stateid: stateid (provided by client) * @flags: flags describing type of operation to be done * @nfp: optional nfsd_file return pointer (may be NULL) * @cstid: optional returned nfs4_stid pointer (may be NULL) * * Given info from the client, look up a nfs4_stid for the operation. On * success, it returns a reference to the nfs4_stid and/or the nfsd_file * associated with it. */ __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, stateid_t *stateid, int flags, struct nfsd_file **nfp, struct nfs4_stid **cstid) { struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct nfs4_stid *s = NULL; __be32 status; if (nfp) *nfp = NULL; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { status = check_special_stateids(net, fhp, stateid, flags); goto done; } status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, 0, &s, nn); if (status == nfserr_bad_stateid) status = find_cpntf_state(nn, stateid, &s); if (status) return status; status = nfsd4_stid_check_stateid_generation(stateid, s, nfsd4_has_session(cstate)); if (status) goto out; switch (s->sc_type) { case SC_TYPE_DELEG: status = nfs4_check_delegmode(delegstateid(s), flags); break; case SC_TYPE_OPEN: case SC_TYPE_LOCK: status = nfs4_check_olstateid(openlockstateid(s), flags); break; } if (status) goto out; status = nfs4_check_fh(fhp, s); done: if (status == nfs_ok && nfp) status = nfs4_check_file(rqstp, fhp, s, nfp, flags); out: if (s) { if (!status && cstid) *cstid = s; else nfs4_put_stid(s); } return status; } /* * Test if the stateid is valid */ __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct nfsd4_test_stateid_id *stateid; struct nfs4_client *cl = cstate->clp; list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) stateid->ts_id_status = nfsd4_validate_stateid(cl, &stateid->ts_id_stateid); return nfs_ok; } static __be32 nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) { struct nfs4_ol_stateid *stp = openlockstateid(s); __be32 ret; ret = nfsd4_lock_ol_stateid(stp); if (ret) goto out_put_stid; ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) goto out; ret = nfserr_locks_held; if (check_for_locks(stp->st_stid.sc_file, lockowner(stp->st_stateowner))) goto out; release_lock_stateid(stp); ret = nfs_ok; out: mutex_unlock(&stp->st_mutex); out_put_stid: nfs4_put_stid(s); return ret; } __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_free_stateid *free_stateid = &u->free_stateid; stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; struct nfs4_delegation *dp; struct nfs4_client *cl = cstate->clp; __be32 ret = nfserr_bad_stateid; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); if (!s || s->sc_status & SC_STATUS_CLOSED) goto out_unlock; if (s->sc_status & SC_STATUS_ADMIN_REVOKED) { nfsd4_drop_revoked_stid(s); ret = nfs_ok; goto out; } spin_lock(&s->sc_lock); switch (s->sc_type) { case SC_TYPE_DELEG: if (s->sc_status & SC_STATUS_REVOKED) { s->sc_status |= SC_STATUS_CLOSED; spin_unlock(&s->sc_lock); dp = delegstateid(s); if (s->sc_status & SC_STATUS_FREEABLE) list_del_init(&dp->dl_recall_lru); s->sc_status |= SC_STATUS_FREED; spin_unlock(&cl->cl_lock); nfs4_put_stid(s); ret = nfs_ok; goto out; } ret = nfserr_locks_held; break; case SC_TYPE_OPEN: ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) break; ret = nfserr_locks_held; break; case SC_TYPE_LOCK: spin_unlock(&s->sc_lock); refcount_inc(&s->sc_count); spin_unlock(&cl->cl_lock); ret = nfsd4_free_lock_stateid(stateid, s); goto out; } spin_unlock(&s->sc_lock); out_unlock: spin_unlock(&cl->cl_lock); out: return ret; } static inline int setlkflg (int type) { return (type == NFS4_READW_LT || type == NFS4_READ_LT) ? RD_STATE : WR_STATE; } static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp) { struct svc_fh *current_fh = &cstate->current_fh; struct nfs4_stateowner *sop = stp->st_stateowner; __be32 status; status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; status = nfsd4_lock_ol_stateid(stp); if (status != nfs_ok) return status; status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status == nfs_ok) status = nfs4_check_fh(current_fh, &stp->st_stid); if (status != nfs_ok) mutex_unlock(&stp->st_mutex); return status; } /** * nfs4_preprocess_seqid_op - find and prep an ol_stateid for a seqid-morphing op * @cstate: compund state * @seqid: seqid (provided by client) * @stateid: stateid (provided by client) * @typemask: mask of allowable types for this operation * @statusmask: mask of allowed states: 0 or STID_CLOSED * @stpp: return pointer for the stateid found * @nn: net namespace for request * * Given a stateid+seqid from a client, look up an nfs4_ol_stateid and * return it in @stpp. On a nfs_ok return, the returned stateid will * have its st_mutex locked. */ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, unsigned short typemask, unsigned short statusmask, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn) { __be32 status; struct nfs4_stid *s; struct nfs4_ol_stateid *stp = NULL; trace_nfsd_preprocess(seqid, stateid); *stpp = NULL; retry: status = nfsd4_lookup_stateid(cstate, stateid, typemask, statusmask, &s, nn); if (status) return status; stp = openlockstateid(s); if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) { nfs4_put_stateowner(stp->st_stateowner); goto retry; } status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp); if (!status) *stpp = stp; else nfs4_put_stid(&stp->st_stid); return status; } static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn) { __be32 status; struct nfs4_openowner *oo; struct nfs4_ol_stateid *stp; status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, SC_TYPE_OPEN, 0, &stp, nn); if (status) return status; oo = openowner(stp->st_stateowner); if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); return nfserr_bad_stateid; } *stpp = stp; return nfs_ok; } __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_open_confirm *oc = &u->open_confirm; __be32 status; struct nfs4_openowner *oo; struct nfs4_ol_stateid *stp; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); dprintk("NFSD: nfsd4_open_confirm on file %pd\n", cstate->current_fh.fh_dentry); status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); if (status) return status; status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, SC_TYPE_OPEN, 0, &stp, nn); if (status) goto out; oo = openowner(stp->st_stateowner); status = nfserr_bad_stateid; if (oo->oo_flags & NFS4_OO_CONFIRMED) { mutex_unlock(&stp->st_mutex); goto put_stateid; } oo->oo_flags |= NFS4_OO_CONFIRMED; nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid); mutex_unlock(&stp->st_mutex); trace_nfsd_open_confirm(oc->oc_seqid, &stp->st_stid.sc_stateid); nfsd4_client_record_create(oo->oo_owner.so_client); status = nfs_ok; put_stateid: nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); return status; } static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access) { if (!test_access(access, stp)) return; nfs4_file_put_access(stp->st_stid.sc_file, access); clear_access(access, stp); } static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access) { switch (to_access) { case NFS4_SHARE_ACCESS_READ: nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE); nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); break; case NFS4_SHARE_ACCESS_WRITE: nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ); nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); break; case NFS4_SHARE_ACCESS_BOTH: break; default: WARN_ON_ONCE(1); } } __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_open_downgrade *od = &u->open_downgrade; __be32 status; struct nfs4_ol_stateid *stp; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); dprintk("NFSD: nfsd4_open_downgrade on file %pd\n", cstate->current_fh.fh_dentry); /* We don't yet support WANT bits: */ if (od->od_deleg_want) dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__, od->od_deleg_want); status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, &od->od_stateid, &stp, nn); if (status) goto out; status = nfserr_inval; if (!test_access(od->od_share_access, stp)) { dprintk("NFSD: access not a subset of current bitmap: 0x%hhx, input access=%08x\n", stp->st_access_bmap, od->od_share_access); goto put_stateid; } if (!test_deny(od->od_share_deny, stp)) { dprintk("NFSD: deny not a subset of current bitmap: 0x%hhx, input deny=%08x\n", stp->st_deny_bmap, od->od_share_deny); goto put_stateid; } nfs4_stateid_downgrade(stp, od->od_share_access); reset_union_bmap_deny(od->od_share_deny, stp); nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid); status = nfs_ok; put_stateid: mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); return status; } static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) { struct nfs4_client *clp = s->st_stid.sc_client; bool unhashed; LIST_HEAD(reaplist); struct nfs4_ol_stateid *stp; spin_lock(&clp->cl_lock); unhashed = unhash_open_stateid(s, &reaplist); if (clp->cl_minorversion) { if (unhashed) put_ol_stateid_locked(s, &reaplist); spin_unlock(&clp->cl_lock); list_for_each_entry(stp, &reaplist, st_locks) nfs4_free_cpntf_statelist(clp->net, &stp->st_stid); free_ol_stateid_reaplist(&reaplist); return false; } else { spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); return unhashed; } } /* * nfs4_unlock_state() called after encode */ __be32 nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_close *close = &u->close; __be32 status; struct nfs4_ol_stateid *stp; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); bool need_move_to_close_list; dprintk("NFSD: nfsd4_close on file %pd\n", cstate->current_fh.fh_dentry); status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, &close->cl_stateid, SC_TYPE_OPEN, SC_STATUS_CLOSED, &stp, nn); nfsd4_bump_seqid(cstate, status); if (status) goto out; spin_lock(&stp->st_stid.sc_client->cl_lock); stp->st_stid.sc_status |= SC_STATUS_CLOSED; spin_unlock(&stp->st_stid.sc_client->cl_lock); /* * Technically we don't _really_ have to increment or copy it, since * it should just be gone after this operation and we clobber the * copied value below, but we continue to do so here just to ensure * that racing ops see that there was a state change. */ nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); need_move_to_close_list = nfsd4_close_open_stateid(stp); mutex_unlock(&stp->st_mutex); if (need_move_to_close_list) move_to_close_lru(stp, net); /* v4.1+ suggests that we send a special stateid in here, since the * clients should just ignore this anyway. Since this is not useful * for v4.0 clients either, we set it to the special close_stateid * universally. * * See RFC5661 section 18.2.4, and RFC7530 section 16.2.5 */ memcpy(&close->cl_stateid, &close_stateid, sizeof(close->cl_stateid)); /* put reference from nfs4_preprocess_seqid_op */ nfs4_put_stid(&stp->st_stid); out: return status; } __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_delegreturn *dr = &u->delegreturn; struct nfs4_delegation *dp; stateid_t *stateid = &dr->dr_stateid; struct nfs4_stid *s; __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, SC_STATUS_REVOKED | SC_STATUS_FREEABLE, &s, nn); if (status) goto out; dp = delegstateid(s); status = nfsd4_stid_check_stateid_generation(stateid, &dp->dl_stid, nfsd4_has_session(cstate)); if (status) goto put_stateid; trace_nfsd_deleg_return(stateid); destroy_delegation(dp); smp_mb__after_atomic(); wake_up_var(d_inode(cstate->current_fh.fh_dentry)); put_stateid: nfs4_put_stid(&dp->dl_stid); out: return status; } /* last octet in a range */ static inline u64 last_byte_offset(u64 start, u64 len) { u64 end; WARN_ON_ONCE(!len); end = start + len; return end > start ? end - 1: NFS4_MAX_UINT64; } /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that * we can't properly handle lock requests that go beyond the (2^63 - 1)-th * byte, because of sign extension problems. Since NFSv4 calls for 64-bit * locking, this prevents us from being completely protocol-compliant. The * real solution to this problem is to start using unsigned file offsets in * the VFS, but this is a very deep change! */ static inline void nfs4_transform_lock_offset(struct file_lock *lock) { if (lock->fl_start < 0) lock->fl_start = OFFSET_MAX; if (lock->fl_end < 0) lock->fl_end = OFFSET_MAX; } static fl_owner_t nfsd4_lm_get_owner(fl_owner_t owner) { struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; nfs4_get_stateowner(&lo->lo_owner); return owner; } static void nfsd4_lm_put_owner(fl_owner_t owner) { struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; if (lo) nfs4_put_stateowner(&lo->lo_owner); } /* return pointer to struct nfs4_client if client is expirable */ static bool nfsd4_lm_lock_expirable(struct file_lock *cfl) { struct nfs4_lockowner *lo = (struct nfs4_lockowner *) cfl->c.flc_owner; struct nfs4_client *clp = lo->lo_owner.so_client; struct nfsd_net *nn; if (try_to_expire_client(clp)) { nn = net_generic(clp->net, nfsd_net_id); mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); return true; } return false; } /* schedule laundromat to run immediately and wait for it to complete */ static void nfsd4_lm_expire_lock(void) { flush_workqueue(laundry_wq); } static void nfsd4_lm_notify(struct file_lock *fl) { struct nfs4_lockowner *lo = (struct nfs4_lockowner *) fl->c.flc_owner; struct net *net = lo->lo_owner.so_client->net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct nfsd4_blocked_lock *nbl = container_of(fl, struct nfsd4_blocked_lock, nbl_lock); bool queue = false; /* An empty list means that something else is going to be using it */ spin_lock(&nn->blocked_locks_lock); if (!list_empty(&nbl->nbl_list)) { list_del_init(&nbl->nbl_list); list_del_init(&nbl->nbl_lru); queue = true; } spin_unlock(&nn->blocked_locks_lock); if (queue) { trace_nfsd_cb_notify_lock(lo, nbl); nfsd4_run_cb(&nbl->nbl_cb); } } static const struct lock_manager_operations nfsd_posix_mng_ops = { .lm_mod_owner = THIS_MODULE, .lm_notify = nfsd4_lm_notify, .lm_get_owner = nfsd4_lm_get_owner, .lm_put_owner = nfsd4_lm_put_owner, .lm_lock_expirable = nfsd4_lm_lock_expirable, .lm_expire_lock = nfsd4_lm_expire_lock, }; static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { struct nfs4_lockowner *lo; if (fl->fl_lmops == &nfsd_posix_mng_ops) { lo = (struct nfs4_lockowner *) fl->c.flc_owner; xdr_netobj_dup(&deny->ld_owner, &lo->lo_owner.so_owner, GFP_KERNEL); if (!deny->ld_owner.data) /* We just don't care that much */ goto nevermind; deny->ld_clientid = lo->lo_owner.so_client->cl_clientid; } else { nevermind: deny->ld_owner.len = 0; deny->ld_owner.data = NULL; deny->ld_clientid.cl_boot = 0; deny->ld_clientid.cl_id = 0; } deny->ld_start = fl->fl_start; deny->ld_length = NFS4_MAX_UINT64; if (fl->fl_end != NFS4_MAX_UINT64) deny->ld_length = fl->fl_end - fl->fl_start + 1; deny->ld_type = NFS4_READ_LT; if (fl->c.flc_type != F_RDLCK) deny->ld_type = NFS4_WRITE_LT; } static struct nfs4_lockowner * find_lockowner_str_locked(struct nfs4_client *clp, struct xdr_netobj *owner) { unsigned int strhashval = ownerstr_hashval(owner); struct nfs4_stateowner *so; lockdep_assert_held(&clp->cl_lock); list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[strhashval], so_strhash) { if (so->so_is_open_owner) continue; if (same_owner_str(so, owner)) return lockowner(nfs4_get_stateowner(so)); } return NULL; } static struct nfs4_lockowner * find_lockowner_str(struct nfs4_client *clp, struct xdr_netobj *owner) { struct nfs4_lockowner *lo; spin_lock(&clp->cl_lock); lo = find_lockowner_str_locked(clp, owner); spin_unlock(&clp->cl_lock); return lo; } static void nfs4_unhash_lockowner(struct nfs4_stateowner *sop) { unhash_lockowner_locked(lockowner(sop)); } static void nfs4_free_lockowner(struct nfs4_stateowner *sop) { struct nfs4_lockowner *lo = lockowner(sop); kmem_cache_free(lockowner_slab, lo); } static const struct nfs4_stateowner_operations lockowner_ops = { .so_unhash = nfs4_unhash_lockowner, .so_free = nfs4_free_lockowner, }; /* * Alloc a lock owner structure. * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has * occurred. * * strhashval = ownerstr_hashval */ static struct nfs4_lockowner * alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) { struct nfs4_lockowner *lo, *ret; lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); if (!lo) return NULL; INIT_LIST_HEAD(&lo->lo_blocked); INIT_LIST_HEAD(&lo->lo_owner.so_stateids); lo->lo_owner.so_is_open_owner = 0; lo->lo_owner.so_seqid = lock->lk_new_lock_seqid; lo->lo_owner.so_ops = &lockowner_ops; spin_lock(&clp->cl_lock); ret = find_lockowner_str_locked(clp, &lock->lk_new_owner); if (ret == NULL) { list_add(&lo->lo_owner.so_strhash, &clp->cl_ownerstr_hashtbl[strhashval]); ret = lo; } else nfs4_free_stateowner(&lo->lo_owner); spin_unlock(&clp->cl_lock); return ret; } static struct nfs4_ol_stateid * find_lock_stateid(const struct nfs4_lockowner *lo, const struct nfs4_ol_stateid *ost) { struct nfs4_ol_stateid *lst; lockdep_assert_held(&ost->st_stid.sc_client->cl_lock); /* If ost is not hashed, ost->st_locks will not be valid */ if (!nfs4_ol_stateid_unhashed(ost)) list_for_each_entry(lst, &ost->st_locks, st_locks) { if (lst->st_stateowner == &lo->lo_owner) { refcount_inc(&lst->st_stid.sc_count); return lst; } } return NULL; } static struct nfs4_ol_stateid * init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, struct nfs4_file *fp, struct inode *inode, struct nfs4_ol_stateid *open_stp) { struct nfs4_client *clp = lo->lo_owner.so_client; struct nfs4_ol_stateid *retstp; mutex_init(&stp->st_mutex); mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); retry: spin_lock(&clp->cl_lock); if (nfs4_ol_stateid_unhashed(open_stp)) goto out_close; retstp = find_lock_stateid(lo, open_stp); if (retstp) goto out_found; refcount_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = SC_TYPE_LOCK; stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; spin_lock(&fp->fi_lock); list_add(&stp->st_locks, &open_stp->st_locks); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); spin_unlock(&fp->fi_lock); spin_unlock(&clp->cl_lock); return stp; out_found: spin_unlock(&clp->cl_lock); if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { nfs4_put_stid(&retstp->st_stid); goto retry; } /* To keep mutex tracking happy */ mutex_unlock(&stp->st_mutex); return retstp; out_close: spin_unlock(&clp->cl_lock); mutex_unlock(&stp->st_mutex); return NULL; } static struct nfs4_ol_stateid * find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, struct inode *inode, struct nfs4_ol_stateid *ost, bool *new) { struct nfs4_stid *ns = NULL; struct nfs4_ol_stateid *lst; struct nfs4_openowner *oo = openowner(ost->st_stateowner); struct nfs4_client *clp = oo->oo_owner.so_client; *new = false; spin_lock(&clp->cl_lock); lst = find_lock_stateid(lo, ost); spin_unlock(&clp->cl_lock); if (lst != NULL) { if (nfsd4_lock_ol_stateid(lst) == nfs_ok) goto out; nfs4_put_stid(&lst->st_stid); } ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); if (ns == NULL) return NULL; lst = init_lock_stateid(openlockstateid(ns), lo, fi, inode, ost); if (lst == openlockstateid(ns)) *new = true; else nfs4_put_stid(ns); out: return lst; } static int check_lock_length(u64 offset, u64 length) { return ((length == 0) || ((length != NFS4_MAX_UINT64) && (length > ~offset))); } static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) { struct nfs4_file *fp = lock_stp->st_stid.sc_file; lockdep_assert_held(&fp->fi_lock); if (test_access(access, lock_stp)) return; __nfs4_file_get_access(fp, access); set_access(access, lock_stp); } static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **plst, bool *new) { __be32 status; struct nfs4_file *fi = ost->st_stid.sc_file; struct nfs4_openowner *oo = openowner(ost->st_stateowner); struct nfs4_client *cl = oo->oo_owner.so_client; struct inode *inode = d_inode(cstate->current_fh.fh_dentry); struct nfs4_lockowner *lo; struct nfs4_ol_stateid *lst; unsigned int strhashval; lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { strhashval = ownerstr_hashval(&lock->lk_new_owner); lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); if (lo == NULL) return nfserr_jukebox; } else { /* with an existing lockowner, seqids must be the same */ status = nfserr_bad_seqid; if (!cstate->minorversion && lock->lk_new_lock_seqid != lo->lo_owner.so_seqid) goto out; } lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); if (lst == NULL) { status = nfserr_jukebox; goto out; } status = nfs_ok; *plst = lst; out: nfs4_put_stateowner(&lo->lo_owner); return status; } /* * LOCK operation */ __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_lock *lock = &u->lock; struct nfs4_openowner *open_sop = NULL; struct nfs4_lockowner *lock_sop = NULL; struct nfs4_ol_stateid *lock_stp = NULL; struct nfs4_ol_stateid *open_stp = NULL; struct nfs4_file *fp; struct nfsd_file *nf = NULL; struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; struct super_block *sb; __be32 status = 0; int lkflg; int err; bool new = false; unsigned char type; unsigned int flags = FL_POSIX; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", (long long) lock->lk_offset, (long long) lock->lk_length); if (check_lock_length(lock->lk_offset, lock->lk_length)) return nfserr_inval; status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); if (status != nfs_ok) return status; sb = cstate->current_fh.fh_dentry->d_sb; if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) /* See rfc 5661 18.10.3: given clientid is ignored: */ memcpy(&lock->lk_new_clientid, &cstate->clp->cl_clientid, sizeof(clientid_t)); /* validate and update open stateid and open seqid */ status = nfs4_preprocess_confirmed_seqid_op(cstate, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, &open_stp, nn); if (status) goto out; mutex_unlock(&open_stp->st_mutex); open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, &lock->lk_new_clientid)) goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new); } else { status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, SC_TYPE_LOCK, 0, &lock_stp, nn); } if (status) goto out; lock_sop = lockowner(lock_stp->st_stateowner); lkflg = setlkflg(lock->lk_type); status = nfs4_check_openmode(lock_stp, lkflg); if (status) goto out; status = nfserr_grace; if (locks_in_grace(net) && !lock->lk_reclaim) goto out; status = nfserr_no_grace; if (!locks_in_grace(net) && lock->lk_reclaim) goto out; if (lock->lk_reclaim) flags |= FL_RECLAIM; fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: fallthrough; case NFS4_READ_LT: spin_lock(&fp->fi_lock); nf = find_readable_file_locked(fp); if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); spin_unlock(&fp->fi_lock); type = F_RDLCK; break; case NFS4_WRITEW_LT: fallthrough; case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); nf = find_writeable_file_locked(fp); if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); spin_unlock(&fp->fi_lock); type = F_WRLCK; break; default: status = nfserr_inval; goto out; } if (!nf) { status = nfserr_openmode; goto out; } if (lock->lk_type & (NFS4_READW_LT | NFS4_WRITEW_LT) && nfsd4_has_session(cstate) && locks_can_async_lock(nf->nf_file->f_op)) flags |= FL_SLEEP; nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); if (!nbl) { dprintk("NFSD: %s: unable to allocate block!\n", __func__); status = nfserr_jukebox; goto out; } file_lock = &nbl->nbl_lock; file_lock->c.flc_type = type; file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); file_lock->c.flc_pid = current->tgid; file_lock->c.flc_file = nf->nf_file; file_lock->c.flc_flags = flags; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = lock->lk_offset; file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); nfs4_transform_lock_offset(file_lock); conflock = locks_alloc_lock(); if (!conflock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; goto out; } if (flags & FL_SLEEP) { nbl->nbl_time = ktime_get_boottime_seconds(); spin_lock(&nn->blocked_locks_lock); list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru); kref_get(&nbl->nbl_kref); spin_unlock(&nn->blocked_locks_lock); } err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, conflock); switch (err) { case 0: /* success! */ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); status = 0; if (lock->lk_reclaim) nn->somebody_reclaimed = true; break; case FILE_LOCK_DEFERRED: kref_put(&nbl->nbl_kref, free_nbl); nbl = NULL; fallthrough; case -EAGAIN: /* conflock holds conflicting lock */ status = nfserr_denied; dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); nfs4_set_lock_denied(conflock, &lock->lk_denied); break; case -EDEADLK: status = nfserr_deadlock; break; default: dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); status = nfserrno(err); break; } out: if (nbl) { /* dequeue it if we queued it before */ if (flags & FL_SLEEP) { spin_lock(&nn->blocked_locks_lock); if (!list_empty(&nbl->nbl_list) && !list_empty(&nbl->nbl_lru)) { list_del_init(&nbl->nbl_list); list_del_init(&nbl->nbl_lru); kref_put(&nbl->nbl_kref, free_nbl); } /* nbl can use one of lists to be linked to reaplist */ spin_unlock(&nn->blocked_locks_lock); } free_blocked_lock(nbl); } if (nf) nfsd_file_put(nf); if (lock_stp) { /* Bump seqid manually if the 4.0 replay owner is openowner */ if (cstate->replay_owner && cstate->replay_owner != &lock_sop->lo_owner && seqid_mutating_err(ntohl(status))) lock_sop->lo_owner.so_seqid++; /* * If this is a new, never-before-used stateid, and we are * returning an error, then just go ahead and release it. */ if (status && new) release_lock_stateid(lock_stp); mutex_unlock(&lock_stp->st_mutex); nfs4_put_stid(&lock_stp->st_stid); } if (open_stp) nfs4_put_stid(&open_stp->st_stid); nfsd4_bump_seqid(cstate, status); if (conflock) locks_free_lock(conflock); return status; } void nfsd4_lock_release(union nfsd4_op_u *u) { struct nfsd4_lock *lock = &u->lock; struct nfsd4_lock_denied *deny = &lock->lk_denied; kfree(deny->ld_owner.data); } /* * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, * so we do a temporary open here just to get an open file to pass to * vfs_test_lock. */ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { struct nfsd_file *nf; struct inode *inode; __be32 err; err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); if (err) return err; inode = fhp->fh_dentry->d_inode; inode_lock(inode); /* to block new leases till after test_lock: */ err = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (err) goto out; lock->c.flc_file = nf->nf_file; err = nfserrno(vfs_test_lock(nf->nf_file, lock)); lock->c.flc_file = NULL; out: inode_unlock(inode); nfsd_file_put(nf); return err; } /* * LOCKT operation */ __be32 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_lockt *lockt = &u->lockt; struct file_lock *file_lock = NULL; struct nfs4_lockowner *lo = NULL; __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (locks_in_grace(SVC_NET(rqstp))) return nfserr_grace; if (check_lock_length(lockt->lt_offset, lockt->lt_length)) return nfserr_inval; if (!nfsd4_has_session(cstate)) { status = set_client(&lockt->lt_clientid, cstate, nn); if (status) goto out; } if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) goto out; file_lock = locks_alloc_lock(); if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; goto out; } switch (lockt->lt_type) { case NFS4_READ_LT: case NFS4_READW_LT: file_lock->c.flc_type = F_RDLCK; break; case NFS4_WRITE_LT: case NFS4_WRITEW_LT: file_lock->c.flc_type = F_WRLCK; break; default: dprintk("NFSD: nfs4_lockt: bad lock type!\n"); status = nfserr_inval; goto out; } lo = find_lockowner_str(cstate->clp, &lockt->lt_owner); if (lo) file_lock->c.flc_owner = (fl_owner_t)lo; file_lock->c.flc_pid = current->tgid; file_lock->c.flc_flags = FL_POSIX; file_lock->fl_start = lockt->lt_offset; file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); nfs4_transform_lock_offset(file_lock); status = nfsd_test_lock(rqstp, &cstate->current_fh, file_lock); if (status) goto out; if (file_lock->c.flc_type != F_UNLCK) { status = nfserr_denied; nfs4_set_lock_denied(file_lock, &lockt->lt_denied); } out: if (lo) nfs4_put_stateowner(&lo->lo_owner); if (file_lock) locks_free_lock(file_lock); return status; } void nfsd4_lockt_release(union nfsd4_op_u *u) { struct nfsd4_lockt *lockt = &u->lockt; struct nfsd4_lock_denied *deny = &lockt->lt_denied; kfree(deny->ld_owner.data); } __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_locku *locku = &u->locku; struct nfs4_ol_stateid *stp; struct nfsd_file *nf = NULL; struct file_lock *file_lock = NULL; __be32 status; int err; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n", (long long) locku->lu_offset, (long long) locku->lu_length); if (check_lock_length(locku->lu_offset, locku->lu_length)) return nfserr_inval; status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, &locku->lu_stateid, SC_TYPE_LOCK, 0, &stp, nn); if (status) goto out; nf = find_any_file(stp->st_stid.sc_file); if (!nf) { status = nfserr_lock_range; goto put_stateid; } file_lock = locks_alloc_lock(); if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; goto put_file; } file_lock->c.flc_type = F_UNLCK; file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); file_lock->c.flc_pid = current->tgid; file_lock->c.flc_file = nf->nf_file; file_lock->c.flc_flags = FL_POSIX; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = locku->lu_offset; file_lock->fl_end = last_byte_offset(locku->lu_offset, locku->lu_length); nfs4_transform_lock_offset(file_lock); err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, NULL); if (err) { dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); goto out_nfserr; } nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid); put_file: nfsd_file_put(nf); put_stateid: mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); if (file_lock) locks_free_lock(file_lock); return status; out_nfserr: status = nfserrno(err); goto put_file; } /* * returns * true: locks held by lockowner * false: no locks held by lockowner */ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock *fl; int status = false; struct nfsd_file *nf; struct inode *inode; struct file_lock_context *flctx; spin_lock(&fp->fi_lock); nf = find_any_file_locked(fp); if (!nf) { /* Any valid lock stateid should have some sort of access */ WARN_ON_ONCE(1); goto out; } inode = file_inode(nf->nf_file); flctx = locks_inode_context(inode); if (flctx && !list_empty_careful(&flctx->flc_posix)) { spin_lock(&flctx->flc_lock); for_each_file_lock(fl, &flctx->flc_posix) { if (fl->c.flc_owner == (fl_owner_t)lowner) { status = true; break; } } spin_unlock(&flctx->flc_lock); } out: spin_unlock(&fp->fi_lock); return status; } /** * nfsd4_release_lockowner - process NFSv4.0 RELEASE_LOCKOWNER operations * @rqstp: RPC transaction * @cstate: NFSv4 COMPOUND state * @u: RELEASE_LOCKOWNER arguments * * Check if there are any locks still held and if not, free the lockowner * and any lock state that is owned. * * Return values: * %nfs_ok: lockowner released or not found * %nfserr_locks_held: lockowner still in use * %nfserr_stale_clientid: clientid no longer active * %nfserr_expired: clientid not recognized */ __be32 nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); clientid_t *clid = &rlockowner->rl_clientid; struct nfs4_ol_stateid *stp; struct nfs4_lockowner *lo; struct nfs4_client *clp; LIST_HEAD(reaplist); __be32 status; dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); status = set_client(clid, cstate, nn); if (status) return status; clp = cstate->clp; spin_lock(&clp->cl_lock); lo = find_lockowner_str_locked(clp, &rlockowner->rl_owner); if (!lo) { spin_unlock(&clp->cl_lock); return nfs_ok; } list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { if (check_for_locks(stp->st_stid.sc_file, lo)) { spin_unlock(&clp->cl_lock); nfs4_put_stateowner(&lo->lo_owner); return nfserr_locks_held; } } unhash_lockowner_locked(lo); while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); unhash_lock_stateid(stp); put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); remove_blocked_locks(lo); nfs4_put_stateowner(&lo->lo_owner); return nfs_ok; } static inline struct nfs4_client_reclaim * alloc_reclaim(void) { return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL); } bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn) { struct nfs4_client_reclaim *crp; crp = nfsd4_find_reclaim_client(name, nn); return (crp && crp->cr_clp); } /* * failure => all reset bets are off, nfserr_no_grace... * * The caller is responsible for freeing name.data if NULL is returned (it * will be freed in nfs4_remove_reclaim_record in the normal case). */ struct nfs4_client_reclaim * nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, struct nfsd_net *nn) { unsigned int strhashval; struct nfs4_client_reclaim *crp; crp = alloc_reclaim(); if (crp) { strhashval = clientstr_hashval(name); INIT_LIST_HEAD(&crp->cr_strhash); list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]); crp->cr_name.data = name.data; crp->cr_name.len = name.len; crp->cr_princhash.data = princhash.data; crp->cr_princhash.len = princhash.len; crp->cr_clp = NULL; nn->reclaim_str_hashtbl_size++; } return crp; } void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn) { list_del(&crp->cr_strhash); kfree(crp->cr_name.data); kfree(crp->cr_princhash.data); kfree(crp); nn->reclaim_str_hashtbl_size--; } void nfs4_release_reclaim(struct nfsd_net *nn) { struct nfs4_client_reclaim *crp = NULL; int i; for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&nn->reclaim_str_hashtbl[i])) { crp = list_entry(nn->reclaim_str_hashtbl[i].next, struct nfs4_client_reclaim, cr_strhash); nfs4_remove_reclaim_record(crp, nn); } } WARN_ON_ONCE(nn->reclaim_str_hashtbl_size); } /* * called from OPEN, CLAIM_PREVIOUS with a new clientid. */ struct nfs4_client_reclaim * nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn) { unsigned int strhashval; struct nfs4_client_reclaim *crp = NULL; strhashval = clientstr_hashval(name); list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) { if (compare_blob(&crp->cr_name, &name) == 0) { return crp; } } return NULL; } __be32 nfs4_check_open_reclaim(struct nfs4_client *clp) { if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags)) return nfserr_no_grace; if (nfsd4_client_record_check(clp)) return nfserr_reclaim_bad; return nfs_ok; } /* * Since the lifetime of a delegation isn't limited to that of an open, a * client may quite reasonably hang on to a delegation as long as it has * the inode cached. This becomes an obvious problem the first time a * client's inode cache approaches the size of the server's total memory. * * For now we avoid this problem by imposing a hard limit on the number * of delegations, which varies according to the server's memory size. */ static void set_max_delegations(void) { /* * Allow at most 4 delegations per megabyte of RAM. Quick * estimates suggest that in the worst case (where every delegation * is for a different inode), a delegation could take about 1.5K, * giving a worst case usage of about 6% of memory. */ max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT); } static int nfs4_state_create_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; nn->conf_id_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!nn->conf_id_hashtbl) goto err; nn->unconf_id_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!nn->unconf_id_hashtbl) goto err_unconf_id; nn->sessionid_hashtbl = kmalloc_array(SESSION_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!nn->sessionid_hashtbl) goto err_sessionid; for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]); } for (i = 0; i < SESSION_HASH_SIZE; i++) INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; nn->boot_time = ktime_get_real_seconds(); nn->grace_ended = false; nn->nfsd4_manager.block_opens = true; INIT_LIST_HEAD(&nn->nfsd4_manager.list); INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); spin_lock_init(&nn->client_lock); spin_lock_init(&nn->s2s_cp_lock); idr_init(&nn->s2s_cp_stateids); atomic_set(&nn->pending_async_copies, 0); spin_lock_init(&nn->blocked_locks_lock); INIT_LIST_HEAD(&nn->blocked_locks_lru); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); nn->nfsd_client_shrinker = shrinker_alloc(0, "nfsd-client"); if (!nn->nfsd_client_shrinker) goto err_shrinker; nn->nfsd_client_shrinker->scan_objects = nfsd4_state_shrinker_scan; nn->nfsd_client_shrinker->count_objects = nfsd4_state_shrinker_count; nn->nfsd_client_shrinker->private_data = nn; shrinker_register(nn->nfsd_client_shrinker); return 0; err_shrinker: put_net(net); kfree(nn->sessionid_hashtbl); err_sessionid: kfree(nn->unconf_id_hashtbl); err_unconf_id: kfree(nn->conf_id_hashtbl); err: return -ENOMEM; } static void nfs4_state_destroy_net(struct net *net) { int i; struct nfs4_client *clp = NULL; struct nfsd_net *nn = net_generic(net, nfsd_net_id); for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&nn->conf_id_hashtbl[i])) { clp = list_entry(nn->conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); destroy_client(clp); } } WARN_ON(!list_empty(&nn->blocked_locks_lru)); for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&nn->unconf_id_hashtbl[i])) { clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); destroy_client(clp); } } kfree(nn->sessionid_hashtbl); kfree(nn->unconf_id_hashtbl); kfree(nn->conf_id_hashtbl); put_net(net); } int nfs4_state_start_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; ret = nfs4_state_create_net(net); if (ret) return ret; locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0) goto skip_grace; printk(KERN_INFO "NFSD: starting %lld-second grace period (net %x)\n", nn->nfsd4_grace, net->ns.inum); trace_nfsd_grace_start(nn); queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); return 0; skip_grace: printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n", net->ns.inum); queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ); nfsd4_end_grace(nn); return 0; } /* initialization to perform when the nfsd service is started: */ int nfs4_state_start(void) { int ret; ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params); if (ret) return ret; set_max_delegations(); return 0; } void nfs4_state_shutdown_net(struct net *net) { struct nfs4_delegation *dp = NULL; struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); shrinker_free(nn->nfsd_client_shrinker); cancel_work_sync(&nn->nfsd_shrinker_work); cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); INIT_LIST_HEAD(&reaplist); spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); destroy_unhashed_deleg(dp); } nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_shutdown_umount(nn); #endif } void nfs4_state_shutdown(void) { rhltable_destroy(&nfs4_file_rhltable); } static void get_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid) { if (HAS_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG) && CURRENT_STATEID(stateid)) memcpy(stateid, &cstate->current_stateid, sizeof(stateid_t)); } static void put_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid) { if (cstate->minorversion) { memcpy(&cstate->current_stateid, stateid, sizeof(stateid_t)); SET_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG); } } void clear_current_stateid(struct nfsd4_compound_state *cstate) { CLEAR_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG); } /* * functions to set current state id */ void nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { put_stateid(cstate, &u->open_downgrade.od_stateid); } void nfsd4_set_openstateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { put_stateid(cstate, &u->open.op_stateid); } void nfsd4_set_closestateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { put_stateid(cstate, &u->close.cl_stateid); } void nfsd4_set_lockstateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { put_stateid(cstate, &u->lock.lk_resp_stateid); } /* * functions to consume current state id */ void nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->open_downgrade.od_stateid); } void nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->delegreturn.dr_stateid); } void nfsd4_get_freestateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->free_stateid.fr_stateid); } void nfsd4_get_setattrstateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->setattr.sa_stateid); } void nfsd4_get_closestateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->close.cl_stateid); } void nfsd4_get_lockustateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->locku.lu_stateid); } void nfsd4_get_readstateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->read.rd_stateid); } void nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->write.wr_stateid); } /** * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict * @rqstp: RPC transaction context * @dentry: dentry of inode to be checked for a conflict * @pdp: returned WRITE delegation, if one was found * * This function is called when there is a conflict between a write * delegation and a change/size GETATTR from another client. The server * must either use the CB_GETATTR to get the current values of the * attributes from the client that holds the delegation or recall the * delegation before replying to the GETATTR. See RFC 8881 section * 18.7.4. * * Returns 0 if there is no conflict; otherwise an nfs_stat * code is returned. If @pdp is set to a non-NULL value, then the * caller must put the reference. */ __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_delegation **pdp) { __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file_lock_context *ctx; struct nfs4_delegation *dp = NULL; struct file_lease *fl; struct iattr attrs; struct nfs4_cb_fattr *ncf; struct inode *inode = d_inode(dentry); ctx = locks_inode_context(inode); if (!ctx) return nfs_ok; #define NON_NFSD_LEASE ((void *)1) spin_lock(&ctx->flc_lock); for_each_file_lock(fl, &ctx->flc_lease) { if (fl->c.flc_flags == FL_LAYOUT) continue; if (fl->c.flc_type == F_WRLCK) { if (fl->fl_lmops == &nfsd_lease_mng_ops) dp = fl->c.flc_owner; else dp = NON_NFSD_LEASE; } break; } if (dp == NULL || dp == NON_NFSD_LEASE || dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { spin_unlock(&ctx->flc_lock); if (dp == NON_NFSD_LEASE) { status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (status != nfserr_jukebox || !nfsd_wait_for_delegreturn(rqstp, inode)) return status; } return 0; } nfsd_stats_wdeleg_getattr_inc(nn); refcount_inc(&dp->dl_stid.sc_count); ncf = &dp->dl_cb_fattr; nfs4_cb_getattr(&dp->dl_cb_fattr); spin_unlock(&ctx->flc_lock); wait_on_bit_timeout(&ncf->ncf_cb_flags, CB_GETATTR_BUSY, TASK_INTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT); if (ncf->ncf_cb_status) { /* Recall delegation only if client didn't respond */ status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (status != nfserr_jukebox || !nfsd_wait_for_delegreturn(rqstp, inode)) goto out_status; } if (!ncf->ncf_file_modified && (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) ncf->ncf_file_modified = true; if (ncf->ncf_file_modified) { int err; /* * Per section 10.4.3 of RFC 8881, the server would * not update the file's metadata with the client's * modified size */ attrs.ia_mtime = attrs.ia_ctime = current_time(inode); attrs.ia_valid = ATTR_MTIME | ATTR_CTIME | ATTR_DELEG; inode_lock(inode); err = notify_change(&nop_mnt_idmap, dentry, &attrs, NULL); inode_unlock(inode); if (err) { status = nfserrno(err); goto out_status; } ncf->ncf_cur_fsize = ncf->ncf_cb_fsize; *pdp = dp; return nfs_ok; } status = nfs_ok; out_status: nfs4_put_stid(&dp->dl_stid); return status; }
77 77 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 // SPDX-License-Identifier: GPL-2.0 /* Lock down the kernel * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. */ #include <linux/security.h> #include <linux/export.h> #include <linux/lsm_hooks.h> #include <uapi/linux/lsm.h> static enum lockdown_reason kernel_locked_down; static const enum lockdown_reason lockdown_levels[] = {LOCKDOWN_NONE, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX}; /* * Put the kernel into lock-down mode. */ static int lock_kernel_down(const char *where, enum lockdown_reason level) { if (kernel_locked_down >= level) return -EPERM; kernel_locked_down = level; pr_notice("Kernel is locked down from %s; see man kernel_lockdown.7\n", where); return 0; } static int __init lockdown_param(char *level) { if (!level) return -EINVAL; if (strcmp(level, "integrity") == 0) lock_kernel_down("command line", LOCKDOWN_INTEGRITY_MAX); else if (strcmp(level, "confidentiality") == 0) lock_kernel_down("command line", LOCKDOWN_CONFIDENTIALITY_MAX); else return -EINVAL; return 0; } early_param("lockdown", lockdown_param); /** * lockdown_is_locked_down - Find out if the kernel is locked down * @what: Tag to use in notice generated if lockdown is in effect */ static int lockdown_is_locked_down(enum lockdown_reason what) { if (WARN(what >= LOCKDOWN_CONFIDENTIALITY_MAX, "Invalid lockdown reason")) return -EPERM; if (kernel_locked_down >= what) { if (lockdown_reasons[what]) pr_notice_ratelimited("Lockdown: %s: %s is restricted; see man kernel_lockdown.7\n", current->comm, lockdown_reasons[what]); return -EPERM; } return 0; } static struct security_hook_list lockdown_hooks[] __ro_after_init = { LSM_HOOK_INIT(locked_down, lockdown_is_locked_down), }; static const struct lsm_id lockdown_lsmid = { .name = "lockdown", .id = LSM_ID_LOCKDOWN, }; static int __init lockdown_lsm_init(void) { #if defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY) lock_kernel_down("Kernel configuration", LOCKDOWN_INTEGRITY_MAX); #elif defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY) lock_kernel_down("Kernel configuration", LOCKDOWN_CONFIDENTIALITY_MAX); #endif security_add_hooks(lockdown_hooks, ARRAY_SIZE(lockdown_hooks), &lockdown_lsmid); return 0; } static ssize_t lockdown_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { char temp[80]; int i, offset = 0; for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) { enum lockdown_reason level = lockdown_levels[i]; if (lockdown_reasons[level]) { const char *label = lockdown_reasons[level]; if (kernel_locked_down == level) offset += sprintf(temp+offset, "[%s] ", label); else offset += sprintf(temp+offset, "%s ", label); } } /* Convert the last space to a newline if needed. */ if (offset > 0) temp[offset-1] = '\n'; return simple_read_from_buffer(buf, count, ppos, temp, strlen(temp)); } static ssize_t lockdown_write(struct file *file, const char __user *buf, size_t n, loff_t *ppos) { char *state; int i, len, err = -EINVAL; state = memdup_user_nul(buf, n); if (IS_ERR(state)) return PTR_ERR(state); len = strlen(state); if (len && state[len-1] == '\n') { state[len-1] = '\0'; len--; } for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) { enum lockdown_reason level = lockdown_levels[i]; const char *label = lockdown_reasons[level]; if (label && !strcmp(state, label)) err = lock_kernel_down("securityfs", level); } kfree(state); return err ? err : n; } static const struct file_operations lockdown_ops = { .read = lockdown_read, .write = lockdown_write, }; static int __init lockdown_secfs_init(void) { struct dentry *dentry; dentry = securityfs_create_file("lockdown", 0644, NULL, NULL, &lockdown_ops); return PTR_ERR_OR_ZERO(dentry); } core_initcall(lockdown_secfs_init); #ifdef CONFIG_SECURITY_LOCKDOWN_LSM_EARLY DEFINE_EARLY_LSM(lockdown) = { #else DEFINE_LSM(lockdown) = { #endif .name = "lockdown", .init = lockdown_lsm_init, };
32 7 128 15 14 50 7 50 91 41 100 26 16 12 99 100 35 111 35 111 111 111 9 41 41 41 89 89 102 83 111 111 30 111 41 41 41 41 13 28 37 5 31 3 28 5 23 20 28 33 33 33 20 1 24 20 14 6 33 47 47 47 47 3 47 47 27 27 6 21 18 27 89 89 89 57 8 54 89 111 40 75 41 41 5 36 89 89 61 89 48 49 43 7 33 33 48 97 32 59 59 59 47 59 45 12 59 136 135 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 /* * net/tipc/name_table.c: TIPC name table code * * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2008, 2010-2014, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <net/sock.h> #include <linux/list_sort.h> #include <linux/rbtree_augmented.h> #include "core.h" #include "netlink.h" #include "name_table.h" #include "name_distr.h" #include "subscr.h" #include "bcast.h" #include "addr.h" #include "node.h" #include "group.h" /** * struct service_range - container for all bindings of a service range * @lower: service range lower bound * @upper: service range upper bound * @tree_node: member of service range RB tree * @max: largest 'upper' in this node subtree * @local_publ: list of identical publications made from this node * Used by closest_first lookup and multicast lookup algorithm * @all_publ: all publications identical to this one, whatever node and scope * Used by round-robin lookup algorithm */ struct service_range { u32 lower; u32 upper; struct rb_node tree_node; u32 max; struct list_head local_publ; struct list_head all_publ; }; /** * struct tipc_service - container for all published instances of a service type * @type: 32 bit 'type' value for service * @publ_cnt: increasing counter for publications in this service * @ranges: rb tree containing all service ranges for this service * @service_list: links to adjacent name ranges in hash chain * @subscriptions: list of subscriptions for this service type * @lock: spinlock controlling access to pertaining service ranges/publications * @rcu: RCU callback head used for deferred freeing */ struct tipc_service { u32 type; u32 publ_cnt; struct rb_root ranges; struct hlist_node service_list; struct list_head subscriptions; spinlock_t lock; /* Covers service range list */ struct rcu_head rcu; }; #define service_range_upper(sr) ((sr)->upper) RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks, struct service_range, tree_node, u32, max, service_range_upper) #define service_range_entry(rbtree_node) \ (container_of(rbtree_node, struct service_range, tree_node)) #define service_range_overlap(sr, start, end) \ ((sr)->lower <= (end) && (sr)->upper >= (start)) /** * service_range_foreach_match - iterate over tipc service rbtree for each * range match * @sr: the service range pointer as a loop cursor * @sc: the pointer to tipc service which holds the service range rbtree * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching */ #define service_range_foreach_match(sr, sc, start, end) \ for (sr = service_range_match_first((sc)->ranges.rb_node, \ start, \ end); \ sr; \ sr = service_range_match_next(&(sr)->tree_node, \ start, \ end)) /** * service_range_match_first - find first service range matching a range * @n: the root node of service range rbtree for searching * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching * * Return: the leftmost service range node in the rbtree that overlaps the * specific range if any. Otherwise, returns NULL. */ static struct service_range *service_range_match_first(struct rb_node *n, u32 start, u32 end) { struct service_range *sr; struct rb_node *l, *r; /* Non overlaps in tree at all? */ if (!n || service_range_entry(n)->max < start) return NULL; while (n) { l = n->rb_left; if (l && service_range_entry(l)->max >= start) { /* A leftmost overlap range node must be one in the left * subtree. If not, it has lower > end, then nodes on * the right side cannot satisfy the condition either. */ n = l; continue; } /* No one in the left subtree can match, return if this node is * an overlap i.e. leftmost. */ sr = service_range_entry(n); if (service_range_overlap(sr, start, end)) return sr; /* Ok, try to lookup on the right side */ r = n->rb_right; if (sr->lower <= end && r && service_range_entry(r)->max >= start) { n = r; continue; } break; } return NULL; } /** * service_range_match_next - find next service range matching a range * @n: a node in service range rbtree from which the searching starts * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching * * Return: the next service range node to the given node in the rbtree that * overlaps the specific range if any. Otherwise, returns NULL. */ static struct service_range *service_range_match_next(struct rb_node *n, u32 start, u32 end) { struct service_range *sr; struct rb_node *p, *r; while (n) { r = n->rb_right; if (r && service_range_entry(r)->max >= start) /* A next overlap range node must be one in the right * subtree. If not, it has lower > end, then any next * successor (- an ancestor) of this node cannot * satisfy the condition either. */ return service_range_match_first(r, start, end); /* No one in the right subtree can match, go up to find an * ancestor of this node which is parent of a left-hand child. */ while ((p = rb_parent(n)) && n == p->rb_right) n = p; if (!p) break; /* Return if this ancestor is an overlap */ sr = service_range_entry(p); if (service_range_overlap(sr, start, end)) return sr; /* Ok, try to lookup more from this ancestor */ if (sr->lower <= end) { n = p; continue; } break; } return NULL; } static int hash(int x) { return x & (TIPC_NAMETBL_SIZE - 1); } /** * tipc_publ_create - create a publication structure * @ua: the service range the user is binding to * @sk: the address of the socket that is bound * @key: publication key */ static struct publication *tipc_publ_create(struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct publication *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return NULL; p->sr = ua->sr; p->sk = *sk; p->scope = ua->scope; p->key = key; INIT_LIST_HEAD(&p->binding_sock); INIT_LIST_HEAD(&p->binding_node); INIT_LIST_HEAD(&p->local_publ); INIT_LIST_HEAD(&p->all_publ); INIT_LIST_HEAD(&p->list); return p; } /** * tipc_service_create - create a service structure for the specified 'type' * @net: network namespace * @ua: address representing the service to be bound * * Allocates a single range structure and sets it to all 0's. */ static struct tipc_service *tipc_service_create(struct net *net, struct tipc_uaddr *ua) { struct name_table *nt = tipc_name_table(net); struct tipc_service *service; struct hlist_head *hd; service = kzalloc(sizeof(*service), GFP_ATOMIC); if (!service) { pr_warn("Service creation failed, no memory\n"); return NULL; } spin_lock_init(&service->lock); service->type = ua->sr.type; service->ranges = RB_ROOT; INIT_HLIST_NODE(&service->service_list); INIT_LIST_HEAD(&service->subscriptions); hd = &nt->services[hash(ua->sr.type)]; hlist_add_head_rcu(&service->service_list, hd); return service; } /* tipc_service_find_range - find service range matching publication parameters */ static struct service_range *tipc_service_find_range(struct tipc_service *sc, struct tipc_uaddr *ua) { struct service_range *sr; service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { /* Look for exact match */ if (sr->lower == ua->sr.lower && sr->upper == ua->sr.upper) return sr; } return NULL; } static struct service_range *tipc_service_create_range(struct tipc_service *sc, struct publication *p) { struct rb_node **n, *parent = NULL; struct service_range *sr; u32 lower = p->sr.lower; u32 upper = p->sr.upper; n = &sc->ranges.rb_node; while (*n) { parent = *n; sr = service_range_entry(parent); if (lower == sr->lower && upper == sr->upper) return sr; if (sr->max < upper) sr->max = upper; if (lower <= sr->lower) n = &parent->rb_left; else n = &parent->rb_right; } sr = kzalloc(sizeof(*sr), GFP_ATOMIC); if (!sr) return NULL; sr->lower = lower; sr->upper = upper; sr->max = upper; INIT_LIST_HEAD(&sr->local_publ); INIT_LIST_HEAD(&sr->all_publ); rb_link_node(&sr->tree_node, parent, n); rb_insert_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); return sr; } static bool tipc_service_insert_publ(struct net *net, struct tipc_service *sc, struct publication *p) { struct tipc_subscription *sub, *tmp; struct service_range *sr; struct publication *_p; u32 node = p->sk.node; bool first = false; bool res = false; u32 key = p->key; spin_lock_bh(&sc->lock); sr = tipc_service_create_range(sc, p); if (!sr) goto exit; first = list_empty(&sr->all_publ); /* Return if the publication already exists */ list_for_each_entry(_p, &sr->all_publ, all_publ) { if (_p->key == key && (!_p->sk.node || _p->sk.node == node)) { pr_debug("Failed to bind duplicate %u,%u,%u/%u:%u/%u\n", p->sr.type, p->sr.lower, p->sr.upper, node, p->sk.ref, key); goto exit; } } if (in_own_node(net, p->sk.node)) list_add(&p->local_publ, &sr->local_publ); list_add(&p->all_publ, &sr->all_publ); p->id = sc->publ_cnt++; /* Any subscriptions waiting for notification? */ list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, first); } res = true; exit: if (!res) pr_warn("Failed to bind to %u,%u,%u\n", p->sr.type, p->sr.lower, p->sr.upper); spin_unlock_bh(&sc->lock); return res; } /** * tipc_service_remove_publ - remove a publication from a service * @r: service_range to remove publication from * @sk: address publishing socket * @key: target publication key */ static struct publication *tipc_service_remove_publ(struct service_range *r, struct tipc_socket_addr *sk, u32 key) { struct publication *p; u32 node = sk->node; list_for_each_entry(p, &r->all_publ, all_publ) { if (p->key != key || (node && node != p->sk.node)) continue; list_del(&p->all_publ); list_del(&p->local_publ); return p; } return NULL; } /* * Code reused: time_after32() for the same purpose */ #define publication_after(pa, pb) time_after32((pa)->id, (pb)->id) static int tipc_publ_sort(void *priv, const struct list_head *a, const struct list_head *b) { struct publication *pa, *pb; pa = container_of(a, struct publication, list); pb = container_of(b, struct publication, list); return publication_after(pa, pb); } /** * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service * range overlapping with the requested range * @service: the tipc_service to attach the @sub to * @sub: the subscription to attach */ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { struct publication *p, *first, *tmp; struct list_head publ_list; struct service_range *sr; u32 filter, lower, upper; filter = sub->s.filter; lower = sub->s.seq.lower; upper = sub->s.seq.upper; tipc_sub_get(sub); list_add(&sub->service_list, &service->subscriptions); if (filter & TIPC_SUB_NO_STATUS) return; INIT_LIST_HEAD(&publ_list); service_range_foreach_match(sr, service, lower, upper) { first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { if (filter & TIPC_SUB_PORTS) list_add_tail(&p->list, &publ_list); else if (!first || publication_after(first, p)) /* Pick this range's *first* publication */ first = p; } if (first) list_add_tail(&first->list, &publ_list); } /* Sort the publications before reporting */ list_sort(NULL, &publ_list, tipc_publ_sort); list_for_each_entry_safe(p, tmp, &publ_list, list) { tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, true); list_del_init(&p->list); } } static struct tipc_service *tipc_service_find(struct net *net, struct tipc_uaddr *ua) { struct name_table *nt = tipc_name_table(net); struct hlist_head *service_head; struct tipc_service *service; service_head = &nt->services[hash(ua->sr.type)]; hlist_for_each_entry_rcu(service, service_head, service_list) { if (service->type == ua->sr.type) return service; } return NULL; }; struct publication *tipc_nametbl_insert_publ(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct tipc_service *sc; struct publication *p; p = tipc_publ_create(ua, sk, key); if (!p) return NULL; sc = tipc_service_find(net, ua); if (!sc) sc = tipc_service_create(net, ua); if (sc && tipc_service_insert_publ(net, sc, p)) return p; kfree(p); return NULL; } struct publication *tipc_nametbl_remove_publ(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct tipc_subscription *sub, *tmp; struct publication *p = NULL; struct service_range *sr; struct tipc_service *sc; bool last; sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); sr = tipc_service_find_range(sc, ua); if (!sr) goto unlock; p = tipc_service_remove_publ(sr, sk, key); if (!p) goto unlock; /* Notify any waiting subscriptions */ last = list_empty(&sr->all_publ); list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { tipc_sub_report_overlap(sub, p, TIPC_WITHDRAWN, last); } /* Remove service range item if this was its last publication */ if (list_empty(&sr->all_publ)) { rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } /* Delete service item if no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } unlock: spin_unlock_bh(&sc->lock); exit: if (!p) { pr_err("Failed to remove unknown binding: %u,%u,%u/%u:%u/%u\n", ua->sr.type, ua->sr.lower, ua->sr.upper, sk->node, sk->ref, key); } return p; } /** * tipc_nametbl_lookup_anycast - perform service instance to socket translation * @net: network namespace * @ua: service address to look up * @sk: address to socket we want to find * * On entry, a non-zero 'sk->node' indicates the node where we want lookup to be * performed, which may not be this one. * * On exit: * * - If lookup is deferred to another node, leave 'sk->node' unchanged and * return 'true'. * - If lookup is successful, set the 'sk->node' and 'sk->ref' (== portid) which * represent the bound socket and return 'true'. * - If lookup fails, return 'false' * * Note that for legacy users (node configured with Z.C.N address format) the * 'closest-first' lookup algorithm must be maintained, i.e., if sk.node is 0 * we must look in the local binding list first */ bool tipc_nametbl_lookup_anycast(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk) { struct tipc_net *tn = tipc_net(net); bool legacy = tn->legacy_addr_format; u32 self = tipc_own_addr(net); u32 inst = ua->sa.instance; struct service_range *r; struct tipc_service *sc; struct publication *p; struct list_head *l; bool res = false; if (!tipc_in_scope(legacy, sk->node, self)) return true; rcu_read_lock(); sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(r, sc, inst, inst) { /* Select lookup algo: local, closest-first or round-robin */ if (sk->node == self) { l = &r->local_publ; if (list_empty(l)) continue; p = list_first_entry(l, struct publication, local_publ); list_move_tail(&p->local_publ, &r->local_publ); } else if (legacy && !sk->node && !list_empty(&r->local_publ)) { l = &r->local_publ; p = list_first_entry(l, struct publication, local_publ); list_move_tail(&p->local_publ, &r->local_publ); } else { l = &r->all_publ; p = list_first_entry(l, struct publication, all_publ); list_move_tail(&p->all_publ, &r->all_publ); } *sk = p->sk; res = true; /* Todo: as for legacy, pick the first matching range only, a * "true" round-robin will be performed as needed. */ break; } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return res; } /* tipc_nametbl_lookup_group(): lookup destinaton(s) in a communication group * Returns a list of one (== group anycast) or more (== group multicast) * destination socket/node pairs matching the given address. * The requester may or may not want to exclude himself from the list. */ bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua, struct list_head *dsts, int *dstcnt, u32 exclude, bool mcast) { u32 self = tipc_own_addr(net); u32 inst = ua->sa.instance; struct service_range *sr; struct tipc_service *sc; struct publication *p; *dstcnt = 0; rcu_read_lock(); sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); /* Todo: a full search i.e. service_range_foreach_match() instead? */ sr = service_range_match_first(sc->ranges.rb_node, inst, inst); if (!sr) goto no_match; list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != ua->scope) continue; if (p->sk.ref == exclude && p->sk.node == self) continue; tipc_dest_push(dsts, p->sk.node, p->sk.ref); (*dstcnt)++; if (mcast) continue; list_move_tail(&p->all_publ, &sr->all_publ); break; } no_match: spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return !list_empty(dsts); } /* tipc_nametbl_lookup_mcast_sockets(): look up node local destinaton sockets * matching the given address * Used on nodes which have received a multicast/broadcast message * Returns a list of local sockets */ void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, struct list_head *dports) { struct service_range *sr; struct tipc_service *sc; struct publication *p; u8 scope = ua->scope; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->local_publ, local_publ) { if (scope == p->scope || scope == TIPC_ANY_SCOPE) tipc_dest_push(dports, 0, p->sk.ref); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_lookup_mcast_nodes(): look up all destination nodes matching * the given address. Used in sending node. * Used on nodes which are sending out a multicast/broadcast message * Returns a list of nodes, including own node if applicable */ void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua, struct tipc_nlist *nodes) { struct service_range *sr; struct tipc_service *sc; struct publication *p; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->all_publ, all_publ) { tipc_nlist_add(nodes, p->sk.node); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_build_group - build list of communication group members */ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, struct tipc_uaddr *ua) { struct service_range *sr; struct tipc_service *sc; struct publication *p; struct rb_node *n; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != ua->scope) continue; tipc_group_add_member(grp, p->sk.node, p->sk.ref, p->sr.lower); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_publish - add service binding to name table */ struct publication *tipc_nametbl_publish(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct publication *p = NULL; struct sk_buff *skb = NULL; u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); if (nt->local_publ_count >= TIPC_MAX_PUBL) { pr_warn("Bind failed, max limit %u reached\n", TIPC_MAX_PUBL); goto exit; } p = tipc_nametbl_insert_publ(net, ua, sk, key); if (p) { nt->local_publ_count++; skb = tipc_named_publish(net, p); } rc_dests = nt->rc_dests; exit: spin_unlock_bh(&tn->nametbl_lock); if (skb) tipc_node_broadcast(net, skb, rc_dests); return p; } /** * tipc_nametbl_withdraw - withdraw a service binding * @net: network namespace * @ua: service address/range being unbound * @sk: address of the socket being unbound from * @key: target publication key */ void tipc_nametbl_withdraw(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct sk_buff *skb = NULL; struct publication *p; u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); p = tipc_nametbl_remove_publ(net, ua, sk, key); if (p) { nt->local_publ_count--; skb = tipc_named_withdraw(net, p); list_del_init(&p->binding_sock); kfree_rcu(p, rcu); } rc_dests = nt->rc_dests; spin_unlock_bh(&tn->nametbl_lock); if (skb) tipc_node_broadcast(net, skb, rc_dests); } /** * tipc_nametbl_subscribe - add a subscription object to the name table * @sub: subscription to add */ bool tipc_nametbl_subscribe(struct tipc_subscription *sub) { struct tipc_net *tn = tipc_net(sub->net); u32 type = sub->s.seq.type; struct tipc_service *sc; struct tipc_uaddr ua; bool res = true; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, type, sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); sc = tipc_service_find(sub->net, &ua); if (!sc) sc = tipc_service_create(sub->net, &ua); if (sc) { spin_lock_bh(&sc->lock); tipc_service_subscribe(sc, sub); spin_unlock_bh(&sc->lock); } else { pr_warn("Failed to subscribe for {%u,%u,%u}\n", type, sub->s.seq.lower, sub->s.seq.upper); res = false; } spin_unlock_bh(&tn->nametbl_lock); return res; } /** * tipc_nametbl_unsubscribe - remove a subscription object from name table * @sub: subscription to remove */ void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) { struct tipc_net *tn = tipc_net(sub->net); struct tipc_service *sc; struct tipc_uaddr ua; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, sub->s.seq.type, sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); sc = tipc_service_find(sub->net, &ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); list_del_init(&sub->service_list); tipc_sub_put(sub); /* Delete service item if no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } spin_unlock_bh(&sc->lock); exit: spin_unlock_bh(&tn->nametbl_lock); } int tipc_nametbl_init(struct net *net) { struct tipc_net *tn = tipc_net(net); struct name_table *nt; int i; nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) return -ENOMEM; for (i = 0; i < TIPC_NAMETBL_SIZE; i++) INIT_HLIST_HEAD(&nt->services[i]); INIT_LIST_HEAD(&nt->node_scope); INIT_LIST_HEAD(&nt->cluster_scope); rwlock_init(&nt->cluster_scope_lock); tn->nametbl = nt; spin_lock_init(&tn->nametbl_lock); return 0; } /** * tipc_service_delete - purge all publications for a service and delete it * @net: the associated network namespace * @sc: tipc_service to delete */ static void tipc_service_delete(struct net *net, struct tipc_service *sc) { struct service_range *sr, *tmpr; struct publication *p, *tmp; spin_lock_bh(&sc->lock); rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) { list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) { tipc_service_remove_publ(sr, &p->sk, p->key); kfree_rcu(p, rcu); } rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } hlist_del_init_rcu(&sc->service_list); spin_unlock_bh(&sc->lock); kfree_rcu(sc, rcu); } void tipc_nametbl_stop(struct net *net) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct hlist_head *service_head; struct tipc_service *service; u32 i; /* Verify name table is empty and purge any lingering * publications, then release the name table */ spin_lock_bh(&tn->nametbl_lock); for (i = 0; i < TIPC_NAMETBL_SIZE; i++) { if (hlist_empty(&nt->services[i])) continue; service_head = &nt->services[i]; hlist_for_each_entry_rcu(service, service_head, service_list) { tipc_service_delete(net, service); } } spin_unlock_bh(&tn->nametbl_lock); synchronize_net(); kfree(nt); } static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, struct tipc_service *service, struct service_range *sr, u32 *last_key) { struct publication *p; struct nlattr *attrs; struct nlattr *b; void *hdr; if (*last_key) { list_for_each_entry(p, &sr->all_publ, all_publ) if (p->key == *last_key) break; if (list_entry_is_head(p, &sr->all_publ, all_publ)) return -EPIPE; } else { p = list_first_entry(&sr->all_publ, struct publication, all_publ); } list_for_each_entry_from(p, &sr->all_publ, all_publ) { *last_key = p->key; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NAME_TABLE_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE); if (!attrs) goto msg_full; b = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE_PUBL); if (!b) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, service->type)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sr->lower)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sr->upper)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->sk.node)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->sk.ref)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key)) goto publ_msg_full; nla_nest_end(msg->skb, b); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); } *last_key = 0; return 0; publ_msg_full: nla_nest_cancel(msg->skb, b); attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static int __tipc_nl_service_range_list(struct tipc_nl_msg *msg, struct tipc_service *sc, u32 *last_lower, u32 *last_key) { struct service_range *sr; struct rb_node *n; int err; for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); if (sr->lower < *last_lower) continue; err = __tipc_nl_add_nametable_publ(msg, sc, sr, last_key); if (err) { *last_lower = sr->lower; return err; } } *last_lower = 0; return 0; } static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg, u32 *last_type, u32 *last_lower, u32 *last_key) { struct tipc_net *tn = tipc_net(net); struct tipc_service *service = NULL; struct hlist_head *head; struct tipc_uaddr ua; int err; int i; if (*last_type) i = hash(*last_type); else i = 0; for (; i < TIPC_NAMETBL_SIZE; i++) { head = &tn->nametbl->services[i]; if (*last_type || (!i && *last_key && (*last_lower == *last_key))) { tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, *last_type, *last_lower, *last_lower); service = tipc_service_find(net, &ua); if (!service) return -EPIPE; } else { hlist_for_each_entry_rcu(service, head, service_list) break; if (!service) continue; } hlist_for_each_entry_from_rcu(service, service_list) { spin_lock_bh(&service->lock); err = __tipc_nl_service_range_list(msg, service, last_lower, last_key); if (err) { *last_type = service->type; spin_unlock_bh(&service->lock); return err; } spin_unlock_bh(&service->lock); } *last_type = 0; } return 0; } int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 last_type = cb->args[0]; u32 last_lower = cb->args[1]; u32 last_key = cb->args[2]; int done = cb->args[3]; struct tipc_nl_msg msg; int err; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); err = tipc_nl_service_list(net, &msg, &last_type, &last_lower, &last_key); if (!err) { done = 1; } else if (err != -EMSGSIZE) { /* We never set seq or call nl_dump_check_consistent() this * means that setting prev_seq here will cause the consistence * check to fail in the netlink callback handler. Resulting in * the NLMSG_DONE message having the NLM_F_DUMP_INTR flag set if * we got an error. */ cb->prev_seq = 1; } rcu_read_unlock(); cb->args[0] = last_type; cb->args[1] = last_lower; cb->args[2] = last_key; cb->args[3] = done; return skb->len; } struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; list_for_each_entry(dst, l, list) { if (dst->node == node && dst->port == port) return dst; } return NULL; } bool tipc_dest_push(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; if (tipc_dest_find(l, node, port)) return false; dst = kmalloc(sizeof(*dst), GFP_ATOMIC); if (unlikely(!dst)) return false; dst->node = node; dst->port = port; list_add(&dst->list, l); return true; } bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port) { struct tipc_dest *dst; if (list_empty(l)) return false; dst = list_first_entry(l, typeof(*dst), list); if (port) *port = dst->port; if (node) *node = dst->node; list_del(&dst->list); kfree(dst); return true; } bool tipc_dest_del(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; dst = tipc_dest_find(l, node, port); if (!dst) return false; list_del(&dst->list); kfree(dst); return true; } void tipc_dest_list_purge(struct list_head *l) { struct tipc_dest *dst, *tmp; list_for_each_entry_safe(dst, tmp, l, list) { list_del(&dst->list); kfree(dst); } }
239 239 16 5 12 9 4 3 4 16 16 8 16 165 165 165 165 134 134 133 134 16 16 16 16 16 16 10 10 16 14 10 16 16 16 16 16 16 16 10 16 10 16 16 16 16 16 16 13 4 9 8 1 8 8 8 3 3 7 5 5 5 8 7 1 7 3 3 4 4 3 2 1 1 1 3 36 36 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2020, Red Hat, Inc. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/inet.h> #include <linux/kernel.h> #include <net/inet_common.h> #include <net/netns/generic.h> #include <net/mptcp.h> #include "protocol.h" #include "mib.h" #include "mptcp_pm_gen.h" static int pm_nl_pernet_id; struct mptcp_pm_add_entry { struct list_head list; struct mptcp_addr_info addr; u8 retrans_times; struct timer_list add_timer; struct mptcp_sock *sock; }; struct pm_nl_pernet { /* protects pernet updates */ spinlock_t lock; struct list_head local_addr_list; unsigned int addrs; unsigned int stale_loss_cnt; unsigned int add_addr_signal_max; unsigned int add_addr_accept_max; unsigned int local_addr_max; unsigned int subflows_max; unsigned int next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); }; #define MPTCP_PM_ADDR_MAX 8 #define ADD_ADDR_RETRANS_MAX 3 static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net) { return net_generic(net, pm_nl_pernet_id); } static struct pm_nl_pernet * pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk) { return pm_nl_get_pernet(sock_net((struct sock *)msk)); } bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port) { bool addr_equals = false; if (a->family == b->family) { if (a->family == AF_INET) addr_equals = a->addr.s_addr == b->addr.s_addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6); } else if (a->family == AF_INET) { if (ipv6_addr_v4mapped(&b->addr6)) addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3]; } else if (b->family == AF_INET) { if (ipv6_addr_v4mapped(&a->addr6)) addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr; #endif } if (!addr_equals) return false; if (!use_port) return true; return a->port == b->port; } void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr) { addr->family = skc->skc_family; addr->port = htons(skc->skc_num); if (addr->family == AF_INET) addr->addr.s_addr = skc->skc_rcv_saddr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) addr->addr6 = skc->skc_v6_rcv_saddr; #endif } static void remote_address(const struct sock_common *skc, struct mptcp_addr_info *addr) { addr->family = skc->skc_family; addr->port = skc->skc_dport; if (addr->family == AF_INET) addr->addr.s_addr = skc->skc_daddr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) addr->addr6 = skc->skc_v6_daddr; #endif } static bool lookup_subflow_by_saddr(const struct list_head *list, const struct mptcp_addr_info *saddr) { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; struct sock_common *skc; list_for_each_entry(subflow, list, node) { skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); mptcp_local_address(skc, &cur); if (mptcp_addresses_equal(&cur, saddr, saddr->port)) return true; } return false; } static bool lookup_subflow_by_daddr(const struct list_head *list, const struct mptcp_addr_info *daddr) { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; list_for_each_entry(subflow, list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV))) continue; remote_address((struct sock_common *)ssk, &cur); if (mptcp_addresses_equal(&cur, daddr, daddr->port)) return true; } return false; } static bool select_local_address(const struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, struct mptcp_pm_local *new_local) { struct mptcp_pm_addr_entry *entry; bool found = false; msk_owned_by_me(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) continue; new_local->addr = entry->addr; new_local->flags = entry->flags; new_local->ifindex = entry->ifindex; found = true; break; } rcu_read_unlock(); return found; } static bool select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, struct mptcp_pm_local *new_local) { struct mptcp_pm_addr_entry *entry; bool found = false; rcu_read_lock(); /* do not keep any additional per socket state, just signal * the address list in order. * Note: removal from the local address list during the msk life-cycle * can lead to additional addresses not being announced. */ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) continue; if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) continue; new_local->addr = entry->addr; new_local->flags = entry->flags; new_local->ifindex = entry->ifindex; found = true; break; } rcu_read_unlock(); return found; } unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) { const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); return READ_ONCE(pernet->add_addr_signal_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); return READ_ONCE(pernet->add_addr_accept_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); return READ_ONCE(pernet->subflows_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); return READ_ONCE(pernet->local_addr_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); return false; } return true; } struct mptcp_pm_add_entry * mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; lockdep_assert_held(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { if (mptcp_addresses_equal(&entry->addr, addr, true)) return entry; } return NULL; } bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) { struct mptcp_pm_add_entry *entry; struct mptcp_addr_info saddr; bool ret = false; mptcp_local_address((struct sock_common *)sk, &saddr); spin_lock_bh(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { if (mptcp_addresses_equal(&entry->addr, &saddr, true)) { ret = true; goto out; } } out: spin_unlock_bh(&msk->pm.lock); return ret; } static void mptcp_pm_add_timer(struct timer_list *timer) { struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; pr_debug("msk=%p\n", msk); if (!msk) return; if (inet_sk_state_load(sk) == TCP_CLOSE) return; if (!entry->addr.id) return; if (mptcp_pm_should_add_signal_addr(msk)) { sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); goto out; } spin_lock_bh(&msk->pm.lock); if (!mptcp_pm_should_add_signal_addr(msk)) { pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id); mptcp_pm_announce_addr(msk, &entry->addr, false); mptcp_pm_add_addr_send_ack(msk); entry->retrans_times++; } if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) sk_reset_timer(sk, timer, jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); spin_unlock_bh(&msk->pm.lock); if (entry->retrans_times == ADD_ADDR_RETRANS_MAX) mptcp_pm_subflow_established(msk); out: __sock_put(sk); } struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool check_id) { struct mptcp_pm_add_entry *entry; struct sock *sk = (struct sock *)msk; struct timer_list *add_timer = NULL; spin_lock_bh(&msk->pm.lock); entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (entry && (!check_id || entry->addr.id == addr->id)) { entry->retrans_times = ADD_ADDR_RETRANS_MAX; add_timer = &entry->add_timer; } if (!check_id && entry) list_del(&entry->list); spin_unlock_bh(&msk->pm.lock); /* no lock, because sk_stop_timer_sync() is calling del_timer_sync() */ if (add_timer) sk_stop_timer_sync(sk, add_timer); return entry; } bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; struct net *net = sock_net(sk); lockdep_assert_held(&msk->pm.lock); add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (add_entry) { if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) return false; sk_reset_timer(sk, &add_entry->add_timer, jiffies + mptcp_get_add_addr_timeout(net)); return true; } add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); if (!add_entry) return false; list_add(&add_entry->list, &msk->pm.anno_list); add_entry->addr = *addr; add_entry->sock = msk; add_entry->retrans_times = 0; timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); sk_reset_timer(sk, &add_entry->add_timer, jiffies + mptcp_get_add_addr_timeout(net)); return true; } void mptcp_pm_free_anno_list(struct mptcp_sock *msk) { struct mptcp_pm_add_entry *entry, *tmp; struct sock *sk = (struct sock *)msk; LIST_HEAD(free_list); pr_debug("msk=%p\n", msk); spin_lock_bh(&msk->pm.lock); list_splice_init(&msk->pm.anno_list, &free_list); spin_unlock_bh(&msk->pm.lock); list_for_each_entry_safe(entry, tmp, &free_list, list) { sk_stop_timer_sync(sk, &entry->add_timer); kfree(entry); } } /* Fill all the remote addresses into the array addrs[], * and return the array size. */ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *local, bool fullmesh, struct mptcp_addr_info *addrs) { bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); struct sock *sk = (struct sock *)msk, *ssk; struct mptcp_subflow_context *subflow; struct mptcp_addr_info remote = { 0 }; unsigned int subflows_max; int i = 0; subflows_max = mptcp_pm_get_subflows_max(msk); remote_address((struct sock_common *)sk, &remote); /* Non-fullmesh endpoint, fill in the single entry * corresponding to the primary MPC subflow remote address */ if (!fullmesh) { if (deny_id0) return 0; if (!mptcp_pm_addr_families_match(sk, local, &remote)) return 0; msk->pm.subflows++; addrs[i++] = remote; } else { DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); /* Forbid creation of new subflows matching existing * ones, possibly already created by incoming ADD_ADDR */ bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); mptcp_for_each_subflow(msk, subflow) if (READ_ONCE(subflow->local_id) == local->id) __set_bit(subflow->remote_id, unavail_id); mptcp_for_each_subflow(msk, subflow) { ssk = mptcp_subflow_tcp_sock(subflow); remote_address((struct sock_common *)ssk, &addrs[i]); addrs[i].id = READ_ONCE(subflow->remote_id); if (deny_id0 && !addrs[i].id) continue; if (test_bit(addrs[i].id, unavail_id)) continue; if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) continue; if (msk->pm.subflows < subflows_max) { /* forbid creating multiple address towards * this id */ __set_bit(addrs[i].id, unavail_id); msk->pm.subflows++; i++; } } } return i; } static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, bool prio, bool backup) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow; pr_debug("send ack for %s\n", prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr")); slow = lock_sock_fast(ssk); if (prio) { subflow->send_mp_prio = 1; subflow->request_bkup = backup; } __mptcp_subflow_send_ack(ssk); unlock_sock_fast(ssk, slow); } static void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, bool prio, bool backup) { spin_unlock_bh(&msk->pm.lock); __mptcp_pm_send_ack(msk, subflow, prio, backup); spin_lock_bh(&msk->pm.lock); } static struct mptcp_pm_addr_entry * __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) { struct mptcp_pm_addr_entry *entry; list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, lockdep_is_held(&pernet->lock)) { if (entry->addr.id == id) return entry; } return NULL; } static struct mptcp_pm_addr_entry * __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) { struct mptcp_pm_addr_entry *entry; list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, lockdep_is_held(&pernet->lock)) { if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) return entry; } return NULL; } static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; unsigned int add_addr_signal_max; bool signal_and_subflow = false; unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; unsigned int subflows_max; pernet = pm_nl_get_pernet(sock_net(sk)); add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk); /* do lazy endpoint usage accounting for the MPC subflows */ if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); struct mptcp_pm_addr_entry *entry; struct mptcp_addr_info mpc_addr; bool backup = false; mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); rcu_read_lock(); entry = __lookup_addr(pernet, &mpc_addr); if (entry) { __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); msk->mpc_endpoint_id = entry->addr.id; backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); } rcu_read_unlock(); if (backup) mptcp_pm_send_ack(msk, subflow, true, backup); msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); } pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, msk->pm.add_addr_signaled, add_addr_signal_max, msk->pm.subflows, subflows_max); /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { /* due to racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the * corresponding id will be marked as used. * Instead let the PM machinery reschedule us when the * current address announce will be completed. */ if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) return; if (!select_signal_address(pernet, msk, &local)) goto subflow; /* If the alloc fails, we are on memory pressure, not worth * continuing, and trying to create subflows. */ if (!mptcp_pm_alloc_anno_list(msk, &local.addr)) return; __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled++; /* Special case for ID0: set the correct ID */ if (local.addr.id == msk->mpc_endpoint_id) local.addr.id = 0; mptcp_pm_announce_addr(msk, &local.addr, false); mptcp_pm_nl_addr_send_ack(msk); if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) signal_and_subflow = true; } subflow: /* check if should create a new subflow */ while (msk->pm.local_addr_used < local_addr_max && msk->pm.subflows < subflows_max) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; int i, nr; if (signal_and_subflow) signal_and_subflow = false; else if (!select_local_address(pernet, msk, &local)) break; fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH); __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); /* Special case for ID0: set the correct ID */ if (local.addr.id == msk->mpc_endpoint_id) local.addr.id = 0; else /* local_addr_used is not decr for ID 0 */ msk->pm.local_addr_used++; nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs); if (nr == 0) continue; spin_unlock_bh(&msk->pm.lock); for (i = 0; i < nr; i++) __mptcp_subflow_connect(sk, &local, &addrs[i]); spin_lock_bh(&msk->pm.lock); } mptcp_pm_nl_check_work_pending(msk); } static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) { mptcp_pm_create_subflow_or_signal_addr(msk); } static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) { mptcp_pm_create_subflow_or_signal_addr(msk); } /* Fill all the local addresses into the array addrs[], * and return the array size. */ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote, struct mptcp_pm_local *locals) { struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; struct mptcp_addr_info mpc_addr; struct pm_nl_pernet *pernet; unsigned int subflows_max; int i = 0; pernet = pm_nl_get_pernet_from_msk(msk); subflows_max = mptcp_pm_get_subflows_max(msk); mptcp_local_address((struct sock_common *)msk, &mpc_addr); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) continue; if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) continue; if (msk->pm.subflows < subflows_max) { locals[i].addr = entry->addr; locals[i].flags = entry->flags; locals[i].ifindex = entry->ifindex; /* Special case for ID0: set the correct ID */ if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port)) locals[i].addr.id = 0; msk->pm.subflows++; i++; } } rcu_read_unlock(); /* If the array is empty, fill in the single * 'IPADDRANY' local address */ if (!i) { memset(&locals[i], 0, sizeof(locals[i])); locals[i].addr.family = #if IS_ENABLED(CONFIG_MPTCP_IPV6) remote->family == AF_INET6 && ipv6_addr_v4mapped(&remote->addr6) ? AF_INET : #endif remote->family; if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote)) return 0; msk->pm.subflows++; i++; } return i; } static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) { struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; unsigned int add_addr_accept_max; struct mptcp_addr_info remote; unsigned int subflows_max; bool sf_created = false; int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk); pr_debug("accepted %d:%d remote family %d\n", msk->pm.add_addr_accepted, add_addr_accept_max, msk->pm.remote.family); remote = msk->pm.remote; mptcp_pm_announce_addr(msk, &remote, true); mptcp_pm_nl_addr_send_ack(msk); if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) return; /* pick id 0 port, if none is provided the remote address */ if (!remote.port) remote.port = sk->sk_dport; /* connect to the specified remote address, using whatever * local address the routing configuration will pick. */ nr = fill_local_addresses_vec(msk, &remote, locals); if (nr == 0) return; spin_unlock_bh(&msk->pm.lock); for (i = 0; i < nr; i++) if (__mptcp_subflow_connect(sk, &locals[i], &remote) == 0) sf_created = true; spin_lock_bh(&msk->pm.lock); if (sf_created) { /* add_addr_accepted is not decr for ID 0 */ if (remote.id) msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || msk->pm.subflows >= subflows_max) WRITE_ONCE(msk->pm.accept_addr, false); } } bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *remote) { struct mptcp_addr_info mpc_remote; remote_address((struct sock_common *)msk, &mpc_remote); return mptcp_addresses_equal(&mpc_remote, remote, remote->port); } void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *alt = NULL; msk_owned_by_me(msk); lockdep_assert_held(&msk->pm.lock); if (!mptcp_pm_should_add_signal(msk) && !mptcp_pm_should_rm_signal(msk)) return; mptcp_for_each_subflow(msk, subflow) { if (__mptcp_subflow_active(subflow)) { if (!subflow->stale) { mptcp_pm_send_ack(msk, subflow, false, false); return; } if (!alt) alt = subflow; } } if (alt) mptcp_pm_send_ack(msk, alt, false, false); } int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info *addr, struct mptcp_addr_info *rem, u8 bkup) { struct mptcp_subflow_context *subflow; pr_debug("bkup=%d\n", bkup); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct mptcp_addr_info local, remote; mptcp_local_address((struct sock_common *)ssk, &local); if (!mptcp_addresses_equal(&local, addr, addr->port)) continue; if (rem && rem->family != AF_UNSPEC) { remote_address((struct sock_common *)ssk, &remote); if (!mptcp_addresses_equal(&remote, rem, rem->port)) continue; } __mptcp_pm_send_ack(msk, subflow, true, bkup); return 0; } return -EINVAL; } static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list, enum linux_mptcp_mib_field rm_type) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; u8 i; pr_debug("%s rm_list_nr %d\n", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr); msk_owned_by_me(msk); if (sk->sk_state == TCP_LISTEN) return; if (!rm_list->nr) return; if (list_empty(&msk->conn_list)) return; for (i = 0; i < rm_list->nr; i++) { u8 rm_id = rm_list->ids[i]; bool removed = false; mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); u8 remote_id = READ_ONCE(subflow->remote_id); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; u8 id = subflow_get_local_id(subflow); if ((1 << inet_sk_state_load(ssk)) & (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE)) continue; if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) continue; if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id) continue; pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", i, rm_id, id, remote_id, msk->mpc_endpoint_id); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); removed |= subflow->request_join; /* the following takes care of updating the subflows counter */ mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); if (rm_type == MPTCP_MIB_RMSUBFLOW) __MPTCP_INC_STATS(sock_net(sk), rm_type); } if (rm_type == MPTCP_MIB_RMADDR) __MPTCP_INC_STATS(sock_net(sk), rm_type); if (!removed) continue; if (!mptcp_pm_is_kernel(msk)) continue; if (rm_type == MPTCP_MIB_RMADDR && rm_id && !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { /* Note: if the subflow has been closed before, this * add_addr_accepted counter will not be decremented. */ if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) WRITE_ONCE(msk->pm.accept_addr, true); } } } static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) { mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); } static void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); } void mptcp_pm_nl_work(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; msk_owned_by_me(msk); if (!(pm->status & MPTCP_PM_WORK_MASK)) return; spin_lock_bh(&msk->pm.lock); pr_debug("msk=%p status=%x\n", msk, pm->status); if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); mptcp_pm_nl_add_addr_received(msk); } if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); mptcp_pm_nl_addr_send_ack(msk); } if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); mptcp_pm_nl_rm_addr_received(msk); } if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); mptcp_pm_nl_fully_established(msk); } if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); mptcp_pm_nl_subflow_established(msk); } spin_unlock_bh(&msk->pm.lock); } static bool address_use_port(struct mptcp_pm_addr_entry *entry) { return (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == MPTCP_PM_ADDR_FLAG_SIGNAL; } /* caller must ensure the RCU grace period is already elapsed */ static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) { if (entry->lsk) sock_release(entry->lsk); kfree(entry); } static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry, bool needs_id) { struct mptcp_pm_addr_entry *cur, *del_entry = NULL; unsigned int addr_max; int ret = -EINVAL; spin_lock_bh(&pernet->lock); /* to keep the code simple, don't do IDR-like allocation for address ID, * just bail when we exceed limits */ if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) pernet->next_id = 1; if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { ret = -ERANGE; goto out; } if (test_bit(entry->addr.id, pernet->id_bitmap)) { ret = -EBUSY; goto out; } /* do not insert duplicate address, differentiate on port only * singled addresses */ if (!address_use_port(entry)) entry->addr.port = 0; list_for_each_entry(cur, &pernet->local_addr_list, list) { if (mptcp_addresses_equal(&cur->addr, &entry->addr, cur->addr.port || entry->addr.port)) { /* allow replacing the exiting endpoint only if such * endpoint is an implicit one and the user-space * did not provide an endpoint id */ if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) { ret = -EEXIST; goto out; } if (entry->addr.id) goto out; pernet->addrs--; entry->addr.id = cur->addr.id; list_del_rcu(&cur->list); del_entry = cur; break; } } if (!entry->addr.id && needs_id) { find_next: entry->addr.id = find_next_zero_bit(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, pernet->next_id); if (!entry->addr.id && pernet->next_id != 1) { pernet->next_id = 1; goto find_next; } } if (!entry->addr.id && needs_id) goto out; __set_bit(entry->addr.id, pernet->id_bitmap); if (entry->addr.id > pernet->next_id) pernet->next_id = entry->addr.id; if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { addr_max = pernet->add_addr_signal_max; WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; WRITE_ONCE(pernet->local_addr_max, addr_max + 1); } pernet->addrs++; if (!entry->addr.port) list_add_tail_rcu(&entry->list, &pernet->local_addr_list); else list_add_rcu(&entry->list, &pernet->local_addr_list); ret = entry->addr.id; out: spin_unlock_bh(&pernet->lock); /* just replaced an existing entry, free it */ if (del_entry) { synchronize_rcu(); __mptcp_pm_release_addr_entry(del_entry); } return ret; } static struct lock_class_key mptcp_slock_keys[2]; static struct lock_class_key mptcp_keys[2]; static int mptcp_pm_nl_create_listen_socket(struct sock *sk, struct mptcp_pm_addr_entry *entry) { bool is_ipv6 = sk->sk_family == AF_INET6; int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; struct sock *newsk, *ssk; int backlog = 1024; int err; err = sock_create_kern(sock_net(sk), entry->addr.family, SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk); if (err) return err; newsk = entry->lsk->sk; if (!newsk) return -EINVAL; /* The subflow socket lock is acquired in a nested to the msk one * in several places, even by the TCP stack, and this msk is a kernel * socket: lockdep complains. Instead of propagating the _nested * modifiers in several places, re-init the lock class for the msk * socket to an mptcp specific one. */ sock_lock_init_class_and_name(newsk, is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET", &mptcp_slock_keys[is_ipv6], is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET", &mptcp_keys[is_ipv6]); lock_sock(newsk); ssk = __mptcp_nmpc_sk(mptcp_sk(newsk)); release_sock(newsk); if (IS_ERR(ssk)) return PTR_ERR(ssk); mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (entry->addr.family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif if (ssk->sk_family == AF_INET) err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ssk->sk_family == AF_INET6) err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); #endif if (err) return err; /* We don't use mptcp_set_state() here because it needs to be called * under the msk socket lock. For the moment, that will not bring * anything more than only calling inet_sk_state_store(), because the * old status is known (TCP_CLOSE). */ inet_sk_state_store(newsk, TCP_LISTEN); lock_sock(ssk); WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true); err = __inet_listen_sk(ssk, backlog); if (!err) mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); release_sock(ssk); return err; } int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; int ret; pernet = pm_nl_get_pernet_from_msk(msk); rcu_read_lock(); entry = __lookup_addr(pernet, skc); ret = entry ? entry->addr.id : -1; rcu_read_unlock(); if (ret >= 0) return ret; /* address not found, add to local list */ entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; entry->addr = *skc; entry->addr.id = 0; entry->addr.port = 0; entry->ifindex = 0; entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; entry->lsk = NULL; ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true); if (ret < 0) kfree(entry); return ret; } bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); struct mptcp_pm_addr_entry *entry; bool backup; rcu_read_lock(); entry = __lookup_addr(pernet, skc); backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); rcu_read_unlock(); return backup; } #define MPTCP_PM_CMD_GRP_OFFSET 0 #define MPTCP_PM_EV_GRP_OFFSET 1 static const struct genl_multicast_group mptcp_pm_mcgrps[] = { [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, }, [MPTCP_PM_EV_GRP_OFFSET] = { .name = MPTCP_PM_EV_GRP_NAME, .flags = GENL_MCAST_CAP_NET_ADMIN, }, }; void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; unsigned int active_max_loss_cnt; struct net *net = sock_net(sk); unsigned int stale_loss_cnt; bool slow; stale_loss_cnt = mptcp_stale_loss_cnt(net); if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt) return; /* look for another available subflow not in loss state */ active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1); mptcp_for_each_subflow(msk, iter) { if (iter != subflow && mptcp_subflow_active(iter) && iter->stale_count < active_max_loss_cnt) { /* we have some alternatives, try to mark this subflow as idle ...*/ slow = lock_sock_fast(ssk); if (!tcp_rtx_and_write_queues_empty(ssk)) { subflow->stale = 1; __mptcp_retransmit_pending_data(sk); MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE); } unlock_sock_fast(ssk, slow); /* always try to push the pending data regardless of re-injections: * we can possibly use backup subflows now, and subflow selection * is cheap under the msk socket lock */ __mptcp_push_pending(sk, 0); return; } } } static int mptcp_pm_family_to_addr(int family) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (family == AF_INET6) return MPTCP_PM_ADDR_ATTR_ADDR6; #endif return MPTCP_PM_ADDR_ATTR_ADDR4; } static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], const struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr, bool require_family) { int err, addr_addr; if (!attr) { GENL_SET_ERR_MSG(info, "missing address info"); return -EINVAL; } /* no validation needed - was already done via nested policy */ err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, mptcp_pm_address_nl_policy, info->extack); if (err) return err; if (tb[MPTCP_PM_ADDR_ATTR_ID]) addr->id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) { if (!require_family) return 0; NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing family"); return -EINVAL; } addr->family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); if (addr->family != AF_INET #if IS_ENABLED(CONFIG_MPTCP_IPV6) && addr->family != AF_INET6 #endif ) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "unknown address family"); return -EINVAL; } addr_addr = mptcp_pm_family_to_addr(addr->family); if (!tb[addr_addr]) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing address data"); return -EINVAL; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr->family == AF_INET6) addr->addr6 = nla_get_in6_addr(tb[addr_addr]); else #endif addr->addr.s_addr = nla_get_in_addr(tb[addr_addr]); if (tb[MPTCP_PM_ADDR_ATTR_PORT]) addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); return 0; } int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr) { struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; memset(addr, 0, sizeof(*addr)); return mptcp_pm_parse_pm_addr_attr(tb, attr, info, addr, true); } int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, bool require_family, struct mptcp_pm_addr_entry *entry) { struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; int err; memset(entry, 0, sizeof(*entry)); err = mptcp_pm_parse_pm_addr_attr(tb, attr, info, &entry->addr, require_family); if (err) return err; if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); entry->ifindex = val; } if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); if (tb[MPTCP_PM_ADDR_ATTR_PORT]) entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); return 0; } static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) { return pm_nl_get_pernet(genl_info_net(info)); } static int mptcp_nl_add_subflow_or_signal_addr(struct net *net, struct mptcp_addr_info *addr) { struct mptcp_sock *msk; long s_slot = 0, s_num = 0; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; struct mptcp_addr_info mpc_addr; if (!READ_ONCE(msk->fully_established) || mptcp_pm_is_userspace(msk)) goto next; /* if the endp linked to the init sf is re-added with a != ID */ mptcp_local_address((struct sock_common *)msk, &mpc_addr); lock_sock(sk); spin_lock_bh(&msk->pm.lock); if (mptcp_addresses_equal(addr, &mpc_addr, addr->port)) msk->mpc_endpoint_id = addr->id; mptcp_pm_create_subflow_or_signal_addr(msk); spin_unlock_bh(&msk->pm.lock); release_sock(sk); next: sock_put(sk); cond_resched(); } return 0; } static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr, struct genl_info *info) { struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, mptcp_pm_address_nl_policy, info->extack) && tb[MPTCP_PM_ADDR_ATTR_ID]) return true; return false; } int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; int ret; ret = mptcp_pm_parse_entry(attr, info, true, &addr); if (ret < 0) return ret; if (addr.addr.port && !address_use_port(&addr)) { GENL_SET_ERR_MSG(info, "flags must have signal and not subflow when using port"); return -EINVAL; } if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL && addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { GENL_SET_ERR_MSG(info, "flags mustn't have both signal and fullmesh"); return -EINVAL; } if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint"); return -EINVAL; } entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT); if (!entry) { GENL_SET_ERR_MSG(info, "can't allocate addr"); return -ENOMEM; } *entry = addr; if (entry->addr.port) { ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry); if (ret) { GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret); goto out_free; } } ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, !mptcp_pm_has_addr_attr_id(attr, info)); if (ret < 0) { GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); goto out_free; } mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr); return 0; out_free: __mptcp_pm_release_addr_entry(entry); return ret; } static bool remove_anno_list_by_saddr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; entry = mptcp_pm_del_add_timer(msk, addr, false); if (entry) { kfree(entry); return true; } return false; } static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { return msk->mpc_endpoint_id == addr->id ? 0 : addr->id; } static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool force) { struct mptcp_rm_list list = { .nr = 0 }; bool ret; list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); ret = remove_anno_list_by_saddr(msk, addr); if (ret || force) { spin_lock_bh(&msk->pm.lock); if (ret) { __set_bit(addr->id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled--; } mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); } return ret; } static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id) { /* If it was marked as used, and not ID 0, decrement local_addr_used */ if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) && id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0)) msk->pm.local_addr_used--; } static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, const struct mptcp_pm_addr_entry *entry) { const struct mptcp_addr_info *addr = &entry->addr; struct mptcp_rm_list list = { .nr = 1 }; long s_slot = 0, s_num = 0; struct mptcp_sock *msk; pr_debug("remove_id=%d\n", addr->id); while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; bool remove_subflow; if (mptcp_pm_is_userspace(msk)) goto next; if (list_empty(&msk->conn_list)) { mptcp_pm_remove_anno_addr(msk, addr, false); goto next; } lock_sock(sk); remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr); mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); list.ids[0] = mptcp_endp_get_local_id(msk, addr); if (remove_subflow) { spin_lock_bh(&msk->pm.lock); mptcp_pm_nl_rm_subflow_received(msk, &list); spin_unlock_bh(&msk->pm.lock); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { spin_lock_bh(&msk->pm.lock); __mark_subflow_endp_available(msk, list.ids[0]); spin_unlock_bh(&msk->pm.lock); } if (msk->mpc_endpoint_id == entry->addr.id) msk->mpc_endpoint_id = 0; release_sock(sk); next: sock_put(sk); cond_resched(); } return 0; } static int mptcp_nl_remove_id_zero_address(struct net *net, struct mptcp_addr_info *addr) { struct mptcp_rm_list list = { .nr = 0 }; long s_slot = 0, s_num = 0; struct mptcp_sock *msk; list.ids[list.nr++] = 0; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; struct mptcp_addr_info msk_local; if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) goto next; mptcp_local_address((struct sock_common *)msk, &msk_local); if (!mptcp_addresses_equal(&msk_local, addr, addr->port)) goto next; lock_sock(sk); spin_lock_bh(&msk->pm.lock); mptcp_pm_remove_addr(msk, &list); mptcp_pm_nl_rm_subflow_received(msk, &list); __mark_subflow_endp_available(msk, 0); spin_unlock_bh(&msk->pm.lock); release_sock(sk); next: sock_put(sk); cond_resched(); } return 0; } int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; unsigned int addr_max; int ret; ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; /* the zero id address is special: the first address used by the msk * always gets such an id, so different subflows can have different zero * id addresses. Additionally zero id is not accounted for in id_bitmap. * Let's use an 'mptcp_rm_list' instead of the common remove code. */ if (addr.addr.id == 0) return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr); spin_lock_bh(&pernet->lock); entry = __lookup_addr_by_id(pernet, addr.addr.id); if (!entry) { GENL_SET_ERR_MSG(info, "address not found"); spin_unlock_bh(&pernet->lock); return -EINVAL; } if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { addr_max = pernet->add_addr_signal_max; WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; WRITE_ONCE(pernet->local_addr_max, addr_max - 1); } pernet->addrs--; list_del_rcu(&entry->list); __clear_bit(entry->addr.id, pernet->id_bitmap); spin_unlock_bh(&pernet->lock); mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry); synchronize_rcu(); __mptcp_pm_release_addr_entry(entry); return ret; } /* Called from the userspace PM only */ void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }; struct mptcp_pm_addr_entry *entry; int anno_nr = 0; list_for_each_entry(entry, rm_list, list) { if (alist.nr >= MPTCP_RM_IDS_MAX) break; /* only delete if either announced or matching a subflow */ if (remove_anno_list_by_saddr(msk, &entry->addr)) anno_nr++; else if (!lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) continue; alist.ids[alist.nr++] = entry->addr.id; } if (alist.nr) { spin_lock_bh(&msk->pm.lock); msk->pm.add_addr_signaled -= anno_nr; mptcp_pm_remove_addr(msk, &alist); spin_unlock_bh(&msk->pm.lock); } } /* Called from the in-kernel PM only */ static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, rm_list, list) { if (slist.nr < MPTCP_RM_IDS_MAX && lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); if (alist.nr < MPTCP_RM_IDS_MAX && remove_anno_list_by_saddr(msk, &entry->addr)) alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); } spin_lock_bh(&msk->pm.lock); if (alist.nr) { msk->pm.add_addr_signaled -= alist.nr; mptcp_pm_remove_addr(msk, &alist); } if (slist.nr) mptcp_pm_nl_rm_subflow_received(msk, &slist); /* Reset counters: maybe some subflows have been removed before */ bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); msk->pm.local_addr_used = 0; spin_unlock_bh(&msk->pm.lock); } static void mptcp_nl_flush_addrs_list(struct net *net, struct list_head *rm_list) { long s_slot = 0, s_num = 0; struct mptcp_sock *msk; if (list_empty(rm_list)) return; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { lock_sock(sk); mptcp_pm_flush_addrs_and_subflows(msk, rm_list); release_sock(sk); } sock_put(sk); cond_resched(); } } /* caller must ensure the RCU grace period is already elapsed */ static void __flush_addrs(struct list_head *list) { while (!list_empty(list)) { struct mptcp_pm_addr_entry *cur; cur = list_entry(list->next, struct mptcp_pm_addr_entry, list); list_del_rcu(&cur->list); __mptcp_pm_release_addr_entry(cur); } } static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->add_addr_signal_max, 0); WRITE_ONCE(pernet->add_addr_accept_max, 0); WRITE_ONCE(pernet->local_addr_max, 0); pernet->addrs = 0; } int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); LIST_HEAD(free_list); spin_lock_bh(&pernet->lock); list_splice_init(&pernet->local_addr_list, &free_list); __reset_counters(pernet); pernet->next_id = 1; bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); spin_unlock_bh(&pernet->lock); mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list); synchronize_rcu(); __flush_addrs(&free_list); return 0; } int mptcp_nl_fill_addr(struct sk_buff *skb, struct mptcp_pm_addr_entry *entry) { struct mptcp_addr_info *addr = &entry->addr; struct nlattr *attr; attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR); if (!attr) return -EMSGSIZE; if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family)) goto nla_put_failure; if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_PORT, ntohs(addr->port))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id)) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags)) goto nla_put_failure; if (entry->ifindex && nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex)) goto nla_put_failure; if (addr->family == AF_INET && nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4, addr->addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6 && nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6)) goto nla_put_failure; #endif nla_nest_end(skb, attr); return 0; nla_put_failure: nla_nest_cancel(skb, attr); return -EMSGSIZE; } int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; struct sk_buff *msg; void *reply; int ret; ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, info->genlhdr->cmd); if (!reply) { GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); ret = -EMSGSIZE; goto fail; } rcu_read_lock(); entry = __lookup_addr_by_id(pernet, addr.addr.id); if (!entry) { GENL_SET_ERR_MSG(info, "address not found"); ret = -EINVAL; goto unlock_fail; } ret = mptcp_nl_fill_addr(msg, entry); if (ret) goto unlock_fail; genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); rcu_read_unlock(); return ret; unlock_fail: rcu_read_unlock(); fail: nlmsg_free(msg); return ret; } int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) { return mptcp_pm_get_addr(skb, info); } int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) { struct net *net = sock_net(msg->sk); struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; int id = cb->args[0]; void *hdr; int i; pernet = pm_nl_get_pernet(net); rcu_read_lock(); for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { if (test_bit(i, pernet->id_bitmap)) { entry = __lookup_addr_by_id(pernet, i); if (!entry) break; if (entry->addr.id <= id) continue; hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &mptcp_genl_family, NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); if (!hdr) break; if (mptcp_nl_fill_addr(msg, entry) < 0) { genlmsg_cancel(msg, hdr); break; } id = entry->addr.id; genlmsg_end(msg, hdr); } } rcu_read_unlock(); cb->args[0] = id; return msg->len; } int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { return mptcp_pm_dump_addr(msg, cb); } static int parse_limit(struct genl_info *info, int id, unsigned int *limit) { struct nlattr *attr = info->attrs[id]; if (!attr) return 0; *limit = nla_get_u32(attr); if (*limit > MPTCP_PM_ADDR_MAX) { GENL_SET_ERR_MSG(info, "limit greater than maximum"); return -EINVAL; } return 0; } int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); unsigned int rcv_addrs, subflows; int ret; spin_lock_bh(&pernet->lock); rcv_addrs = pernet->add_addr_accept_max; ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); if (ret) goto unlock; subflows = pernet->subflows_max; ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); if (ret) goto unlock; WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); WRITE_ONCE(pernet->subflows_max, subflows); unlock: spin_unlock_bh(&pernet->lock); return ret; } int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct sk_buff *msg; void *reply; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, MPTCP_PM_CMD_GET_LIMITS); if (!reply) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, READ_ONCE(pernet->add_addr_accept_max))) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, READ_ONCE(pernet->subflows_max))) goto fail; genlmsg_end(msg, reply); return genlmsg_reply(msg, info); fail: GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); nlmsg_free(msg); return -EMSGSIZE; } static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, struct mptcp_addr_info *addr) { struct mptcp_rm_list list = { .nr = 0 }; list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); spin_lock_bh(&msk->pm.lock); mptcp_pm_nl_rm_subflow_received(msk, &list); __mark_subflow_endp_available(msk, list.ids[0]); mptcp_pm_create_subflow_or_signal_addr(msk); spin_unlock_bh(&msk->pm.lock); } static int mptcp_nl_set_flags(struct net *net, struct mptcp_addr_info *addr, u8 bkup, u8 changed) { long s_slot = 0, s_num = 0; struct mptcp_sock *msk; int ret = -EINVAL; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) goto next; lock_sock(sk); if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup); if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH) mptcp_pm_nl_fullmesh(msk, addr); release_sock(sk); next: sock_put(sk); cond_resched(); } return ret; } int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) { struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }; struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | MPTCP_PM_ADDR_FLAG_FULLMESH; struct net *net = sock_net(skb->sk); struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; u8 lookup_by_id = 0; u8 bkup = 0; int ret; pernet = pm_nl_get_pernet(net); ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; if (addr.addr.family == AF_UNSPEC) { lookup_by_id = 1; if (!addr.addr.id) { GENL_SET_ERR_MSG(info, "missing required inputs"); return -EOPNOTSUPP; } } if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; spin_lock_bh(&pernet->lock); entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) : __lookup_addr(pernet, &addr.addr); if (!entry) { spin_unlock_bh(&pernet->lock); GENL_SET_ERR_MSG(info, "address not found"); return -EINVAL; } if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { spin_unlock_bh(&pernet->lock); GENL_SET_ERR_MSG(info, "invalid addr flags"); return -EINVAL; } changed = (addr.flags ^ entry->flags) & mask; entry->flags = (entry->flags & ~mask) | (addr.flags & mask); addr = *entry; spin_unlock_bh(&pernet->lock); mptcp_nl_set_flags(net, &addr.addr, bkup, changed); return 0; } int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info) { return mptcp_pm_set_flags(skb, info); } static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) { genlmsg_multicast_netns(&mptcp_genl_family, net, nlskb, 0, MPTCP_PM_EV_GRP_OFFSET, gfp); } bool mptcp_userspace_pm_active(const struct mptcp_sock *msk) { return genl_has_listeners(&mptcp_genl_family, sock_net((const struct sock *)msk), MPTCP_PM_EV_GRP_OFFSET); } static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) { const struct inet_sock *issk = inet_sk(ssk); const struct mptcp_subflow_context *sf; if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) return -EMSGSIZE; switch (ssk->sk_family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr)) return -EMSGSIZE; if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, issk->inet_daddr)) return -EMSGSIZE; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { const struct ipv6_pinfo *np = inet6_sk(ssk); if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) return -EMSGSIZE; if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr)) return -EMSGSIZE; break; } #endif default: WARN_ON_ONCE(1); return -EMSGSIZE; } if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) return -EMSGSIZE; if (nla_put_be16(skb, MPTCP_ATTR_DPORT, issk->inet_dport)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (WARN_ON_ONCE(!sf)) return -EINVAL; if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, subflow_get_local_id(sf))) return -EMSGSIZE; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, sf->remote_id)) return -EMSGSIZE; return 0; } static int mptcp_event_put_token_and_ssk(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { const struct sock *sk = (const struct sock *)msk; const struct mptcp_subflow_context *sf; u8 sk_err; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) return -EMSGSIZE; if (mptcp_event_add_subflow(skb, ssk)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (WARN_ON_ONCE(!sf)) return -EINVAL; if (nla_put_u8(skb, MPTCP_ATTR_BACKUP, sf->backup)) return -EMSGSIZE; if (ssk->sk_bound_dev_if && nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if)) return -EMSGSIZE; sk_err = READ_ONCE(ssk->sk_err); if (sk_err && sk->sk_state == TCP_ESTABLISHED && nla_put_u8(skb, MPTCP_ATTR_ERROR, sk_err)) return -EMSGSIZE; return 0; } static int mptcp_event_sub_established(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { return mptcp_event_put_token_and_ssk(skb, msk, ssk); } static int mptcp_event_sub_closed(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { const struct mptcp_subflow_context *sf; if (mptcp_event_put_token_and_ssk(skb, msk, ssk)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (!sf->reset_seen) return 0; if (nla_put_u32(skb, MPTCP_ATTR_RESET_REASON, sf->reset_reason)) return -EMSGSIZE; if (nla_put_u32(skb, MPTCP_ATTR_RESET_FLAGS, sf->reset_transient)) return -EMSGSIZE; return 0; } static int mptcp_event_created(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); if (err) return err; if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) return -EMSGSIZE; return mptcp_event_add_subflow(skb, ssk); } void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id) { struct net *net = sock_net((const struct sock *)msk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_REMOVED); if (!nlh) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id)) goto nla_put_failure; genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct net *net = sock_net(ssk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_ANNOUNCED); if (!nlh) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id)) goto nla_put_failure; if (nla_put_be16(skb, MPTCP_ATTR_DPORT, info->port == 0 ? inet_sk(ssk)->inet_dport : info->port)) goto nla_put_failure; switch (info->family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, info->addr.s_addr)) goto nla_put_failure; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &info->addr6)) goto nla_put_failure; break; #endif default: WARN_ON_ONCE(1); goto nla_put_failure; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event_pm_listener(const struct sock *ssk, enum mptcp_event_type event) { const struct inet_sock *issk = inet_sk(ssk); struct net *net = sock_net(ssk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, event); if (!nlh) goto nla_put_failure; if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) goto nla_put_failure; if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) goto nla_put_failure; switch (ssk->sk_family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr)) goto nla_put_failure; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { const struct ipv6_pinfo *np = inet6_sk(ssk); if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) goto nla_put_failure; break; } #endif default: WARN_ON_ONCE(1); goto nla_put_failure; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_KERNEL); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp) { struct net *net = sock_net((const struct sock *)msk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, type); if (!nlh) goto nla_put_failure; switch (type) { case MPTCP_EVENT_UNSPEC: WARN_ON_ONCE(1); break; case MPTCP_EVENT_CREATED: case MPTCP_EVENT_ESTABLISHED: if (mptcp_event_created(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_CLOSED: if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)) < 0) goto nla_put_failure; break; case MPTCP_EVENT_ANNOUNCED: case MPTCP_EVENT_REMOVED: /* call mptcp_event_addr_announced()/removed instead */ WARN_ON_ONCE(1); break; case MPTCP_EVENT_SUB_ESTABLISHED: case MPTCP_EVENT_SUB_PRIORITY: if (mptcp_event_sub_established(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_SUB_CLOSED: if (mptcp_event_sub_closed(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_LISTENER_CREATED: case MPTCP_EVENT_LISTENER_CLOSED: break; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, gfp); return; nla_put_failure: nlmsg_free(skb); } struct genl_family mptcp_genl_family __ro_after_init = { .name = MPTCP_PM_NAME, .version = MPTCP_PM_VER, .netnsok = true, .module = THIS_MODULE, .ops = mptcp_pm_nl_ops, .n_ops = ARRAY_SIZE(mptcp_pm_nl_ops), .resv_start_op = MPTCP_PM_CMD_SUBFLOW_DESTROY + 1, .mcgrps = mptcp_pm_mcgrps, .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps), }; static int __net_init pm_nl_init_net(struct net *net) { struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); INIT_LIST_HEAD_RCU(&pernet->local_addr_list); /* Cit. 2 subflows ought to be enough for anybody. */ pernet->subflows_max = 2; pernet->next_id = 1; pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); /* No need to initialize other pernet fields, the struct is zeroed at * allocation time. */ return 0; } static void __net_exit pm_nl_exit_net(struct list_head *net_list) { struct net *net; list_for_each_entry(net, net_list, exit_list) { struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); /* net is removed from namespace list, can't race with * other modifiers, also netns core already waited for a * RCU grace period. */ __flush_addrs(&pernet->local_addr_list); } } static struct pernet_operations mptcp_pm_pernet_ops = { .init = pm_nl_init_net, .exit_batch = pm_nl_exit_net, .id = &pm_nl_pernet_id, .size = sizeof(struct pm_nl_pernet), }; void __init mptcp_pm_nl_init(void) { if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) panic("Failed to register MPTCP PM pernet subsystem.\n"); if (genl_register_family(&mptcp_genl_family)) panic("Failed to register MPTCP PM netlink family\n"); }
292 1 294 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2017 Intel Corporation. All rights reserved. * * This code is based in part on work published here: * * https://github.com/IAIK/KAISER * * The original work was written by and signed off by for the Linux * kernel by: * * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> * * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com> * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and * Andy Lutomirsky <luto@amacapital.net> */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/cpu.h> #include <asm/cpufeature.h> #include <asm/hypervisor.h> #include <asm/vsyscall.h> #include <asm/cmdline.h> #include <asm/pti.h> #include <asm/tlbflush.h> #include <asm/desc.h> #include <asm/sections.h> #include <asm/set_memory.h> #undef pr_fmt #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt /* Backporting helper */ #ifndef __GFP_NOTRACK #define __GFP_NOTRACK 0 #endif /* * Define the page-table levels we clone for user-space on 32 * and 64 bit. */ #ifdef CONFIG_X86_64 #define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD #else #define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE #endif static void __init pti_print_if_insecure(const char *reason) { if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) pr_info("%s\n", reason); } static void __init pti_print_if_secure(const char *reason) { if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) pr_info("%s\n", reason); } /* Assume mode is auto unless overridden via cmdline below. */ static enum pti_mode { PTI_AUTO = 0, PTI_FORCE_OFF, PTI_FORCE_ON } pti_mode; void __init pti_check_boottime_disable(void) { if (hypervisor_is_type(X86_HYPER_XEN_PV)) { pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on XEN PV."); return; } if (cpu_mitigations_off()) pti_mode = PTI_FORCE_OFF; if (pti_mode == PTI_FORCE_OFF) { pti_print_if_insecure("disabled on command line."); return; } if (pti_mode == PTI_FORCE_ON) pti_print_if_secure("force enabled on command line."); if (pti_mode == PTI_AUTO && !boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) return; setup_force_cpu_cap(X86_FEATURE_PTI); } static int __init pti_parse_cmdline(char *arg) { if (!strcmp(arg, "off")) pti_mode = PTI_FORCE_OFF; else if (!strcmp(arg, "on")) pti_mode = PTI_FORCE_ON; else if (!strcmp(arg, "auto")) pti_mode = PTI_AUTO; else return -EINVAL; return 0; } early_param("pti", pti_parse_cmdline); static int __init pti_parse_cmdline_nopti(char *arg) { pti_mode = PTI_FORCE_OFF; return 0; } early_param("nopti", pti_parse_cmdline_nopti); pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { /* * Changes to the high (kernel) portion of the kernelmode page * tables are not automatically propagated to the usermode tables. * * Users should keep in mind that, unlike the kernelmode tables, * there is no vmalloc_fault equivalent for the usermode tables. * Top-level entries added to init_mm's usermode pgd after boot * will not be automatically propagated to other mms. */ if (!pgdp_maps_userspace(pgdp) || (pgd.pgd & _PAGE_NOPTISHADOW)) return pgd; /* * The user page tables get the full PGD, accessible from * userspace: */ kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; /* * If this is normal user memory, make it NX in the kernel * pagetables so that, if we somehow screw up and return to * usermode with the kernel CR3 loaded, we'll get a page fault * instead of allowing user code to execute with the wrong CR3. * * As exceptions, we don't set NX if: * - _PAGE_USER is not set. This could be an executable * EFI runtime mapping or something similar, and the kernel * may execute from it * - we don't have NX support * - we're clearing the PGD (i.e. the new pgd is not present). */ if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && (__supported_pte_mask & _PAGE_NX)) pgd.pgd |= _PAGE_NX; /* return the copy of the PGD we want the kernel to use: */ return pgd; } /* * Walk the user copy of the page tables (optionally) trying to allocate * page table pages on the way down. * * Returns a pointer to a P4D on success, or NULL on failure. */ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) { pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); if (address < PAGE_OFFSET) { WARN_ONCE(1, "attempt to walk user address\n"); return NULL; } if (pgd_none(*pgd)) { unsigned long new_p4d_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_p4d_page)) return NULL; set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); } BUILD_BUG_ON(pgd_leaf(*pgd) != 0); return p4d_offset(pgd, address); } /* * Walk the user copy of the page tables (optionally) trying to allocate * page table pages on the way down. * * Returns a pointer to a PMD on success, or NULL on failure. */ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); p4d_t *p4d; pud_t *pud; p4d = pti_user_pagetable_walk_p4d(address); if (!p4d) return NULL; BUILD_BUG_ON(p4d_leaf(*p4d) != 0); if (p4d_none(*p4d)) { unsigned long new_pud_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_pud_page)) return NULL; set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); } pud = pud_offset(p4d, address); /* The user page tables do not use large mappings: */ if (pud_leaf(*pud)) { WARN_ON(1); return NULL; } if (pud_none(*pud)) { unsigned long new_pmd_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_pmd_page)) return NULL; set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); } return pmd_offset(pud, address); } /* * Walk the shadow copy of the page tables (optionally) trying to allocate * page table pages on the way down. Does not support large pages. * * Note: this is only used when mapping *new* kernel data into the * user/shadow page tables. It is never used for userspace data. * * Returns a pointer to a PTE on success, or NULL on failure. */ static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); pmd_t *pmd; pte_t *pte; pmd = pti_user_pagetable_walk_pmd(address); if (!pmd) return NULL; /* Large PMD mapping found */ if (pmd_leaf(*pmd)) { /* Clear the PMD if we hit a large mapping from the first round */ if (late_text) { set_pmd(pmd, __pmd(0)); } else { WARN_ON_ONCE(1); return NULL; } } if (pmd_none(*pmd)) { unsigned long new_pte_page = __get_free_page(gfp); if (!new_pte_page) return NULL; set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); } pte = pte_offset_kernel(pmd, address); if (pte_flags(*pte) & _PAGE_USER) { WARN_ONCE(1, "attempt to walk to user pte\n"); return NULL; } return pte; } #ifdef CONFIG_X86_VSYSCALL_EMULATION static void __init pti_setup_vsyscall(void) { pte_t *pte, *target_pte; unsigned int level; pte = lookup_address(VSYSCALL_ADDR, &level); if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) return; target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false); if (WARN_ON(!target_pte)) return; *target_pte = *pte; set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); } #else static void __init pti_setup_vsyscall(void) { } #endif enum pti_clone_level { PTI_CLONE_PMD, PTI_CLONE_PTE, }; static void pti_clone_pgtable(unsigned long start, unsigned long end, enum pti_clone_level level, bool late_text) { unsigned long addr; /* * Clone the populated PMDs which cover start to end. These PMD areas * can have holes. */ for (addr = start; addr < end;) { pte_t *pte, *target_pte; pmd_t *pmd, *target_pmd; pgd_t *pgd; p4d_t *p4d; pud_t *pud; /* Overflow check */ if (addr < start) break; pgd = pgd_offset_k(addr); if (WARN_ON(pgd_none(*pgd))) return; p4d = p4d_offset(pgd, addr); if (WARN_ON(p4d_none(*p4d))) return; pud = pud_offset(p4d, addr); if (pud_none(*pud)) { WARN_ON_ONCE(addr & ~PUD_MASK); addr = round_up(addr + 1, PUD_SIZE); continue; } pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { WARN_ON_ONCE(addr & ~PMD_MASK); addr = round_up(addr + 1, PMD_SIZE); continue; } if (pmd_leaf(*pmd) || level == PTI_CLONE_PMD) { target_pmd = pti_user_pagetable_walk_pmd(addr); if (WARN_ON(!target_pmd)) return; /* * Only clone present PMDs. This ensures only setting * _PAGE_GLOBAL on present PMDs. This should only be * called on well-known addresses anyway, so a non- * present PMD would be a surprise. */ if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) return; /* * Setting 'target_pmd' below creates a mapping in both * the user and kernel page tables. It is effectively * global, so set it as global in both copies. Note: * the X86_FEATURE_PGE check is not _required_ because * the CPU ignores _PAGE_GLOBAL when PGE is not * supported. The check keeps consistency with * code that only set this bit when supported. */ if (boot_cpu_has(X86_FEATURE_PGE)) *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); /* * Copy the PMD. That is, the kernelmode and usermode * tables will share the last-level page tables of this * address range */ *target_pmd = *pmd; addr = round_up(addr + 1, PMD_SIZE); } else if (level == PTI_CLONE_PTE) { /* Walk the page-table down to the pte level */ pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { addr = round_up(addr + 1, PAGE_SIZE); continue; } /* Only clone present PTEs */ if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT))) return; /* Allocate PTE in the user page-table */ target_pte = pti_user_pagetable_walk_pte(addr, late_text); if (WARN_ON(!target_pte)) return; /* Set GLOBAL bit in both PTEs */ if (boot_cpu_has(X86_FEATURE_PGE)) *pte = pte_set_flags(*pte, _PAGE_GLOBAL); /* Clone the PTE */ *target_pte = *pte; addr = round_up(addr + 1, PAGE_SIZE); } else { BUG(); } } } #ifdef CONFIG_X86_64 /* * Clone a single p4d (i.e. a top-level entry on 4-level systems and a * next-level entry on 5-level systems. */ static void __init pti_clone_p4d(unsigned long addr) { p4d_t *kernel_p4d, *user_p4d; pgd_t *kernel_pgd; user_p4d = pti_user_pagetable_walk_p4d(addr); if (!user_p4d) return; kernel_pgd = pgd_offset_k(addr); kernel_p4d = p4d_offset(kernel_pgd, addr); *user_p4d = *kernel_p4d; } /* * Clone the CPU_ENTRY_AREA and associated data into the user space visible * page table. */ static void __init pti_clone_user_shared(void) { unsigned int cpu; pti_clone_p4d(CPU_ENTRY_AREA_BASE); for_each_possible_cpu(cpu) { /* * The SYSCALL64 entry code needs one word of scratch space * in which to spill a register. It lives in the sp2 slot * of the CPU's TSS. * * This is done for all possible CPUs during boot to ensure * that it's propagated to all mms. */ unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu); phys_addr_t pa = per_cpu_ptr_to_phys((void *)va); pte_t *target_pte; target_pte = pti_user_pagetable_walk_pte(va, false); if (WARN_ON(!target_pte)) return; *target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL); } } #else /* CONFIG_X86_64 */ /* * On 32 bit PAE systems with 1GB of Kernel address space there is only * one pgd/p4d for the whole kernel. Cloning that would map the whole * address space into the user page-tables, making PTI useless. So clone * the page-table on the PMD level to prevent that. */ static void __init pti_clone_user_shared(void) { unsigned long start, end; start = CPU_ENTRY_AREA_BASE; end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); pti_clone_pgtable(start, end, PTI_CLONE_PMD, false); } #endif /* CONFIG_X86_64 */ /* * Clone the ESPFIX P4D into the user space visible page table */ static void __init pti_setup_espfix64(void) { #ifdef CONFIG_X86_ESPFIX64 pti_clone_p4d(ESPFIX_BASE_ADDR); #endif } /* * Clone the populated PMDs of the entry text and force it RO. */ static void pti_clone_entry_text(bool late) { pti_clone_pgtable((unsigned long) __entry_text_start, (unsigned long) __entry_text_end, PTI_LEVEL_KERNEL_IMAGE, late); } /* * Global pages and PCIDs are both ways to make kernel TLB entries * live longer, reduce TLB misses and improve kernel performance. * But, leaving all kernel text Global makes it potentially accessible * to Meltdown-style attacks which make it trivial to find gadgets or * defeat KASLR. * * Only use global pages when it is really worth it. */ static inline bool pti_kernel_image_global_ok(void) { /* * Systems with PCIDs get little benefit from global * kernel text and are not worth the downsides. */ if (cpu_feature_enabled(X86_FEATURE_PCID)) return false; /* * Only do global kernel image for pti=auto. Do the most * secure thing (not global) if pti=on specified. */ if (pti_mode != PTI_AUTO) return false; /* * K8 may not tolerate the cleared _PAGE_RW on the userspace * global kernel image pages. Do the safe thing (disable * global kernel image). This is unlikely to ever be * noticed because PTI is disabled by default on AMD CPUs. */ if (boot_cpu_has(X86_FEATURE_K8)) return false; /* * RANDSTRUCT derives its hardening benefits from the * attacker's lack of knowledge about the layout of kernel * data structures. Keep the kernel image non-global in * cases where RANDSTRUCT is in use to help keep the layout a * secret. */ if (IS_ENABLED(CONFIG_RANDSTRUCT)) return false; return true; } /* * For some configurations, map all of kernel text into the user page * tables. This reduces TLB misses, especially on non-PCID systems. */ static void pti_clone_kernel_text(void) { /* * rodata is part of the kernel image and is normally * readable on the filesystem or on the web. But, do not * clone the areas past rodata, they might contain secrets. */ unsigned long start = PFN_ALIGN(_text); unsigned long end_clone = (unsigned long)__end_rodata_aligned; unsigned long end_global = PFN_ALIGN((unsigned long)_etext); if (!pti_kernel_image_global_ok()) return; pr_debug("mapping partial kernel image into user address space\n"); /* * Note that this will undo _some_ of the work that * pti_set_kernel_image_nonglobal() did to clear the * global bit. */ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false); /* * pti_clone_pgtable() will set the global bit in any PMDs * that it clones, but we also need to get any PTEs in * the last level for areas that are not huge-page-aligned. */ /* Set the global bit for normal non-__init kernel text: */ set_memory_global(start, (end_global - start) >> PAGE_SHIFT); } static void pti_set_kernel_image_nonglobal(void) { /* * The identity map is created with PMDs, regardless of the * actual length of the kernel. We need to clear * _PAGE_GLOBAL up to a PMD boundary, not just to the end * of the image. */ unsigned long start = PFN_ALIGN(_text); unsigned long end = ALIGN((unsigned long)_end, PMD_SIZE); /* * This clears _PAGE_GLOBAL from the entire kernel image. * pti_clone_kernel_text() map put _PAGE_GLOBAL back for * areas that are mapped to userspace. */ set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT); } /* * Initialize kernel page table isolation */ void __init pti_init(void) { if (!boot_cpu_has(X86_FEATURE_PTI)) return; pr_info("enabled\n"); #ifdef CONFIG_X86_32 /* * We check for X86_FEATURE_PCID here. But the init-code will * clear the feature flag on 32 bit because the feature is not * supported on 32 bit anyway. To print the warning we need to * check with cpuid directly again. */ if (cpuid_ecx(0x1) & BIT(17)) { /* Use printk to work around pr_fmt() */ printk(KERN_WARNING "\n"); printk(KERN_WARNING "************************************************************\n"); printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); printk(KERN_WARNING "** **\n"); printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n"); printk(KERN_WARNING "** Your performance will increase dramatically if you **\n"); printk(KERN_WARNING "** switch to a 64-bit kernel! **\n"); printk(KERN_WARNING "** **\n"); printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); printk(KERN_WARNING "************************************************************\n"); } #endif pti_clone_user_shared(); /* Undo all global bits from the init pagetables in head_64.S: */ pti_set_kernel_image_nonglobal(); /* Replace some of the global bits just for shared entry text: */ /* * This is very early in boot. Device and Late initcalls can do * modprobe before free_initmem() and mark_readonly(). This * pti_clone_entry_text() allows those user-mode-helpers to function, * but notably the text is still RW. */ pti_clone_entry_text(false); pti_setup_espfix64(); pti_setup_vsyscall(); } /* * Finalize the kernel mappings in the userspace page-table. Some of the * mappings for the kernel image might have changed since pti_init() * cloned them. This is because parts of the kernel image have been * mapped RO and/or NX. These changes need to be cloned again to the * userspace page-table. */ void pti_finalize(void) { if (!boot_cpu_has(X86_FEATURE_PTI)) return; /* * This is after free_initmem() (all initcalls are done) and we've done * mark_readonly(). Text is now NX which might've split some PMDs * relative to the early clone. */ pti_clone_entry_text(true); pti_clone_kernel_text(); debug_checkwx_user(); }
110 8 118 108 118 118 48 47 48 48 48 48 48 17 48 5 31 31 31 48 48 48 48 31 17 17 17 48 48 48 48 48 48 48 48 47 48 47 47 48 48 48 48 48 48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/attr.c * * Copyright (C) 1991, 1992 Linus Torvalds * changes by Thomas Schoebel-Theuer */ #include <linux/export.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/sched/signal.h> #include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> #include <linux/filelock.h> #include <linux/security.h> /** * setattr_should_drop_sgid - determine whether the setgid bit needs to be * removed * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the setgid bit needs to be removed. * We retain backwards compatibility and require setgid bit to be removed * unconditionally if S_IXGRP is set. Otherwise we have the exact same * requirements as setattr_prepare() and setattr_copy(). * * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. */ int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode) { umode_t mode = inode->i_mode; if (!(mode & S_ISGID)) return 0; if (mode & S_IXGRP) return ATTR_KILL_SGID; if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) return ATTR_KILL_SGID; return 0; } EXPORT_SYMBOL(setattr_should_drop_sgid); /** * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to * be dropped * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the set{g,u}id bits need to be removed. * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both * set{g,u}id bits need to be removed the corresponding mask of both flags is * returned. * * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits * to remove, 0 otherwise. */ int setattr_should_drop_suidgid(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; int kill = 0; /* suid always must be killed */ if (unlikely(mode & S_ISUID)) kill = ATTR_KILL_SUID; kill |= setattr_should_drop_sgid(idmap, inode); if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) return kill; return 0; } EXPORT_SYMBOL(setattr_should_drop_suidgid); /** * chown_ok - verify permissions to chown inode * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsuid: uid to chown @inode to * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ static bool chown_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsuid_t ia_vfsuid) { vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid()) && vfsuid_eq(ia_vfsuid, vfsuid)) return true; if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsuid_valid(vfsuid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; } /** * chgrp_ok - verify permissions to chgrp inode * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsgid: gid to chown @inode to * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ static bool chgrp_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t ia_vfsgid) { vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) { if (vfsgid_eq(ia_vfsgid, vfsgid)) return true; if (vfsgid_in_group_p(ia_vfsgid)) return true; } if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsgid_valid(vfsgid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; } /** * setattr_prepare - check if attribute changes to a dentry are allowed * @idmap: idmap of the mount the inode was found from * @dentry: dentry to check * @attr: attributes to change * * Check if we are allowed to change the attributes contained in @attr * in the given dentry. This includes the normal unix access permission * checks, as well as checks for rlimits and others. The function also clears * SGID bit from mode if user is not allowed to set it. Also file capabilities * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. */ int setattr_prepare(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; /* * First check size constraints. These can't be overriden using * ATTR_FORCE. */ if (ia_valid & ATTR_SIZE) { int error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; } /* If force is set do it anyway. */ if (ia_valid & ATTR_FORCE) goto kill_priv; /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && !chown_ok(idmap, inode, attr->ia_vfsuid)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && !chgrp_ok(idmap, inode, attr->ia_vfsgid)) return -EPERM; /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { vfsgid_t vfsgid; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (ia_valid & ATTR_GID) vfsgid = attr->ia_vfsgid; else vfsgid = i_gid_into_vfsgid(idmap, inode); /* Also check the setgid bit! */ if (!in_group_or_capable(idmap, inode, vfsgid)) attr->ia_mode &= ~S_ISGID; } /* Check for setting the inode time. */ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { if (!inode_owner_or_capable(idmap, inode)) return -EPERM; } kill_priv: /* User has permission for the change */ if (ia_valid & ATTR_KILL_PRIV) { int error; error = security_inode_killpriv(idmap, dentry); if (error) return error; } return 0; } EXPORT_SYMBOL(setattr_prepare); /** * inode_newsize_ok - may this inode be truncated to a given size * @inode: the inode to be truncated * @offset: the new size to assign to the inode * * inode_newsize_ok must be called with i_mutex held. * * inode_newsize_ok will check filesystem limits and ulimits to check that the * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ * when necessary. Caller must not proceed with inode size change if failure is * returned. @inode must be a file (not directory), with appropriate * permissions to allow truncate (inode_newsize_ok does NOT check these * conditions). * * Return: 0 on success, -ve errno on failure */ int inode_newsize_ok(const struct inode *inode, loff_t offset) { if (offset < 0) return -EINVAL; if (inode->i_size < offset) { unsigned long limit; limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > inode->i_sb->s_maxbytes) goto out_big; } else { /* * truncation of in-use swapfiles is disallowed - it would * cause subsequent swapout to scribble on the now-freed * blocks. */ if (IS_SWAPFILE(inode)) return -ETXTBSY; } return 0; out_sig: send_sig(SIGXFSZ, current, 0); out_big: return -EFBIG; } EXPORT_SYMBOL(inode_newsize_ok); /** * setattr_copy_mgtime - update timestamps for mgtime inodes * @inode: inode timestamps to be updated * @attr: attrs for the update * * With multigrain timestamps, take more care to prevent races when * updating the ctime. Always update the ctime to the very latest using * the standard mechanism, and use that to populate the atime and mtime * appropriately (unless those are being set to specific values). */ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; struct timespec64 now; if (ia_valid & ATTR_CTIME) { /* * In the case of an update for a write delegation, we must respect * the value in ia_ctime and not use the current time. */ if (ia_valid & ATTR_DELEG) now = inode_set_ctime_deleg(inode, attr->ia_ctime); else now = inode_set_ctime_current(inode); } else { /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. */ WARN_ON_ONCE(ia_valid & ATTR_MTIME); now = current_time(inode); } if (ia_valid & ATTR_ATIME_SET) inode_set_atime_to_ts(inode, attr->ia_atime); else if (ia_valid & ATTR_ATIME) inode_set_atime_to_ts(inode, now); if (ia_valid & ATTR_MTIME_SET) inode_set_mtime_to_ts(inode, attr->ia_mtime); else if (ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, now); } /** * setattr_copy - copy simple metadata updates into the generic inode * @idmap: idmap of the mount the inode was found from * @inode: the inode to be updated * @attr: the new attributes * * setattr_copy must be called with i_mutex held. * * setattr_copy updates the inode's metadata with that specified * in attr on idmapped mounts. Necessary permission checks to determine * whether or not the S_ISGID property needs to be removed are performed with * the correct idmapped mount permission helpers. * Noticeably missing is inode size update, which is more complex * as it requires pagecache updates. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * The inode is not marked as dirty after this operation. The rationale is * that for "simple" filesystems, the struct inode is the inode storage. * The caller is free to mark the inode dirty afterwards if needed. */ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; inode->i_mode = mode; } if (is_mgtime(inode)) return setattr_copy_mgtime(inode, attr); if (ia_valid & ATTR_ATIME) inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, attr->ia_mtime); if (ia_valid & ATTR_CTIME) { if (ia_valid & ATTR_DELEG) inode_set_ctime_deleg(inode, attr->ia_ctime); else inode_set_ctime_to_ts(inode, attr->ia_ctime); } } EXPORT_SYMBOL(setattr_copy); int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid) { int error; if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; } /* * If utimes(2) and friends are called with times == NULL (or both * times are UTIME_NOW), then we need to check for write permission */ if (ia_valid & ATTR_TOUCH) { if (IS_IMMUTABLE(inode)) return -EPERM; if (!inode_owner_or_capable(idmap, inode)) { error = inode_permission(idmap, inode, MAY_WRITE); if (error) return error; } } return 0; } EXPORT_SYMBOL(may_setattr); /** * notify_change - modify attributes of a filesystem object * @idmap: idmap of the mount the inode was found from * @dentry: object affected * @attr: new attributes * @delegated_inode: returns inode, if the inode is delegated * * The caller must hold the i_mutex on the affected object. * * If notify_change discovers a delegation in need of breaking, * it will return -EWOULDBLOCK and return a reference to the inode in * delegated_inode. The caller should then break the delegation and * retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. Also, passing NULL is fine for callers holding * the file open for write, as there can be no conflicting delegation in * that case. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; int error; struct timespec64 now; unsigned int ia_valid = attr->ia_valid; WARN_ON_ONCE(!inode_is_locked(inode)); error = may_setattr(idmap, inode, ia_valid); if (error) return error; if ((ia_valid & ATTR_MODE)) { /* * Don't allow changing the mode of symlinks: * * (1) The vfs doesn't take the mode of symlinks into account * during permission checking. * (2) This has never worked correctly. Most major filesystems * did return EOPNOTSUPP due to interactions with POSIX ACLs * but did still updated the mode of the symlink. * This inconsistency led system call wrapper providers such * as libc to block changing the mode of symlinks with * EOPNOTSUPP already. * (3) To even do this in the first place one would have to use * specific file descriptors and quite some effort. */ if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; /* Flag setting protected by i_mutex */ if (is_sxid(attr->ia_mode)) inode->i_flags &= ~S_NOSEC; } now = current_time(inode); attr->ia_ctime = now; if (!(ia_valid & ATTR_ATIME_SET)) attr->ia_atime = now; else attr->ia_atime = timestamp_truncate(attr->ia_atime, inode); if (!(ia_valid & ATTR_MTIME_SET)) attr->ia_mtime = now; else attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode); if (ia_valid & ATTR_KILL_PRIV) { error = security_inode_need_killpriv(dentry); if (error < 0) return error; if (error == 0) ia_valid = attr->ia_valid &= ~ATTR_KILL_PRIV; } /* * We now pass ATTR_KILL_S*ID to the lower level setattr function so * that the function has the ability to reinterpret a mode change * that's due to these bits. This adds an implicit restriction that * no function will ever call notify_change with both ATTR_MODE and * ATTR_KILL_S*ID set. */ if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) && (ia_valid & ATTR_MODE)) BUG(); if (ia_valid & ATTR_KILL_SUID) { if (mode & S_ISUID) { ia_valid = attr->ia_valid |= ATTR_MODE; attr->ia_mode = (inode->i_mode & ~S_ISUID); } } if (ia_valid & ATTR_KILL_SGID) { if (mode & S_ISGID) { if (!(ia_valid & ATTR_MODE)) { ia_valid = attr->ia_valid |= ATTR_MODE; attr->ia_mode = inode->i_mode; } attr->ia_mode &= ~S_ISGID; } } if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID))) return 0; /* * Verify that uid/gid changes are valid in the target * namespace of the superblock. */ if (ia_valid & ATTR_UID && !vfsuid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsuid)) return -EOVERFLOW; if (ia_valid & ATTR_GID && !vfsgid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsgid)) return -EOVERFLOW; /* Don't allow modifications of files with invalid uids or * gids unless those uids & gids are being made valid. */ if (!(ia_valid & ATTR_UID) && !vfsuid_valid(i_uid_into_vfsuid(idmap, inode))) return -EOVERFLOW; if (!(ia_valid & ATTR_GID) && !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; error = security_inode_setattr(idmap, dentry, attr); if (error) return error; /* * If ATTR_DELEG is set, then these attributes are being set on * behalf of the holder of a write delegation. We want to avoid * breaking the delegation in this case. */ if (!(ia_valid & ATTR_DELEG)) { error = try_break_deleg(inode, delegated_inode); if (error) return error; } if (inode->i_op->setattr) error = inode->i_op->setattr(idmap, dentry, attr); else error = simple_setattr(idmap, dentry, attr); if (!error) { fsnotify_change(dentry, ia_valid); security_inode_post_setattr(idmap, dentry, ia_valid); } return error; } EXPORT_SYMBOL(notify_change);
21 1 4 18 20 10 10 3 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 /* * Routines to compress and uncompress tcp packets (for transmission * over low speed serial lines). * * Copyright (c) 1989 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms are permitted * provided that the above copyright notice and this paragraph are * duplicated in all such forms and that any documentation, * advertising materials, and other materials related to such * distribution and use acknowledge that the software was developed * by the University of California, Berkeley. The name of the * University may not be used to endorse or promote products derived * from this software without specific prior written permission. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * Van Jacobson (van@helios.ee.lbl.gov), Dec 31, 1989: * - Initial distribution. * * * modified for KA9Q Internet Software Package by * Katie Stevens (dkstevens@ucdavis.edu) * University of California, Davis * Computing Services * - 01-31-90 initial adaptation (from 1.19) * PPP.05 02-15-90 [ks] * PPP.08 05-02-90 [ks] use PPP protocol field to signal compression * PPP.15 09-90 [ks] improve mbuf handling * PPP.16 11-02 [karn] substantially rewritten to use NOS facilities * * - Feb 1991 Bill_Simpson@um.cc.umich.edu * variable number of conversation slots * allow zero or one slots * separate routines * status display * - Jul 1994 Dmitry Gorodchanin * Fixes for memory leaks. * - Oct 1994 Dmitry Gorodchanin * Modularization. * - Jan 1995 Bjorn Ekwall * Use ip_fast_csum from ip.h * - July 1995 Christos A. Polyzols * Spotted bug in tcp option checking * * * This module is a difficult issue. It's clearly inet code but it's also clearly * driver code belonging close to PPP and SLIP */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> #include <net/slhc_vj.h> #ifdef CONFIG_INET /* Entire module is for IP only */ #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/termios.h> #include <linux/in.h> #include <linux/fcntl.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> #include <net/icmp.h> #include <net/tcp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <net/checksum.h> #include <linux/unaligned.h> static unsigned char *encode(unsigned char *cp, unsigned short n); static long decode(unsigned char **cpp); static unsigned char * put16(unsigned char *cp, unsigned short x); static unsigned short pull16(unsigned char **cpp); /* Allocate compression data structure * slots must be in range 0 to 255 (zero meaning no compression) * Returns pointer to structure or ERR_PTR() on error. */ struct slcompress * slhc_init(int rslots, int tslots) { short i; struct cstate *ts; struct slcompress *comp; if (rslots < 0 || rslots > 255 || tslots < 0 || tslots > 255) return ERR_PTR(-EINVAL); comp = kzalloc(sizeof(struct slcompress), GFP_KERNEL); if (! comp) goto out_fail; if (rslots > 0) { size_t rsize = rslots * sizeof(struct cstate); comp->rstate = kzalloc(rsize, GFP_KERNEL); if (! comp->rstate) goto out_free; comp->rslot_limit = rslots - 1; } if (tslots > 0) { size_t tsize = tslots * sizeof(struct cstate); comp->tstate = kzalloc(tsize, GFP_KERNEL); if (! comp->tstate) goto out_free2; comp->tslot_limit = tslots - 1; } comp->xmit_oldest = 0; comp->xmit_current = 255; comp->recv_current = 255; /* * don't accept any packets with implicit index until we get * one with an explicit index. Otherwise the uncompress code * will try to use connection 255, which is almost certainly * out of range */ comp->flags |= SLF_TOSS; if ( tslots > 0 ) { ts = comp->tstate; for(i = comp->tslot_limit; i > 0; --i){ ts[i].cs_this = i; ts[i].next = &(ts[i - 1]); } ts[0].next = &(ts[comp->tslot_limit]); ts[0].cs_this = 0; } return comp; out_free2: kfree(comp->rstate); out_free: kfree(comp); out_fail: return ERR_PTR(-ENOMEM); } /* Free a compression data structure */ void slhc_free(struct slcompress *comp) { if ( IS_ERR_OR_NULL(comp) ) return; if ( comp->tstate != NULLSLSTATE ) kfree( comp->tstate ); if ( comp->rstate != NULLSLSTATE ) kfree( comp->rstate ); kfree( comp ); } /* Put a short in host order into a char array in network order */ static inline unsigned char * put16(unsigned char *cp, unsigned short x) { *cp++ = x >> 8; *cp++ = x; return cp; } /* Encode a number */ static unsigned char * encode(unsigned char *cp, unsigned short n) { if(n >= 256 || n == 0){ *cp++ = 0; cp = put16(cp,n); } else { *cp++ = n; } return cp; } /* Pull a 16-bit integer in host order from buffer in network byte order */ static unsigned short pull16(unsigned char **cpp) { short rval; rval = *(*cpp)++; rval <<= 8; rval |= *(*cpp)++; return rval; } /* Decode a number */ static long decode(unsigned char **cpp) { int x; x = *(*cpp)++; if(x == 0){ return pull16(cpp) & 0xffff; /* pull16 returns -1 on error */ } else { return x & 0xff; /* -1 if PULLCHAR returned error */ } } /* * icp and isize are the original packet. * ocp is a place to put a copy if necessary. * cpp is initially a pointer to icp. If the copy is used, * change it to ocp. */ int slhc_compress(struct slcompress *comp, unsigned char *icp, int isize, unsigned char *ocp, unsigned char **cpp, int compress_cid) { struct cstate *ocs = &(comp->tstate[comp->xmit_oldest]); struct cstate *lcs = ocs; struct cstate *cs = lcs->next; unsigned long deltaS, deltaA; short changes = 0; int nlen, hlen; unsigned char new_seq[16]; unsigned char *cp = new_seq; struct iphdr *ip; struct tcphdr *th, *oth; __sum16 csum; /* * Don't play with runt packets. */ if(isize<sizeof(struct iphdr)) return isize; ip = (struct iphdr *) icp; if (ip->version != 4 || ip->ihl < 5) return isize; /* Bail if this packet isn't TCP, or is an IP fragment */ if (ip->protocol != IPPROTO_TCP || (ntohs(ip->frag_off) & 0x3fff)) { /* Send as regular IP */ if(ip->protocol != IPPROTO_TCP) comp->sls_o_nontcp++; else comp->sls_o_tcp++; return isize; } nlen = ip->ihl * 4; if (isize < nlen + sizeof(*th)) return isize; th = (struct tcphdr *)(icp + nlen); if (th->doff < sizeof(struct tcphdr) / 4) return isize; hlen = nlen + th->doff * 4; /* Bail if the TCP packet isn't `compressible' (i.e., ACK isn't set or * some other control bit is set). Also uncompressible if * it's a runt. */ if(hlen > isize || th->syn || th->fin || th->rst || ! (th->ack)){ /* TCP connection stuff; send as regular IP */ comp->sls_o_tcp++; return isize; } /* * Packet is compressible -- we're going to send either a * COMPRESSED_TCP or UNCOMPRESSED_TCP packet. Either way, * we need to locate (or create) the connection state. * * States are kept in a circularly linked list with * xmit_oldest pointing to the end of the list. The * list is kept in lru order by moving a state to the * head of the list whenever it is referenced. Since * the list is short and, empirically, the connection * we want is almost always near the front, we locate * states via linear search. If we don't find a state * for the datagram, the oldest state is (re-)used. */ for ( ; ; ) { if( ip->saddr == cs->cs_ip.saddr && ip->daddr == cs->cs_ip.daddr && th->source == cs->cs_tcp.source && th->dest == cs->cs_tcp.dest) goto found; /* if current equal oldest, at end of list */ if ( cs == ocs ) break; lcs = cs; cs = cs->next; comp->sls_o_searches++; } /* * Didn't find it -- re-use oldest cstate. Send an * uncompressed packet that tells the other side what * connection number we're using for this conversation. * * Note that since the state list is circular, the oldest * state points to the newest and we only need to set * xmit_oldest to update the lru linkage. */ comp->sls_o_misses++; comp->xmit_oldest = lcs->cs_this; goto uncompressed; found: /* * Found it -- move to the front on the connection list. */ if(lcs == ocs) { /* found at most recently used */ } else if (cs == ocs) { /* found at least recently used */ comp->xmit_oldest = lcs->cs_this; } else { /* more than 2 elements */ lcs->next = cs->next; cs->next = ocs->next; ocs->next = cs; } /* * Make sure that only what we expect to change changed. * Check the following: * IP protocol version, header length & type of service. * The "Don't fragment" bit. * The time-to-live field. * The TCP header length. * IP options, if any. * TCP options, if any. * If any of these things are different between the previous & * current datagram, we send the current datagram `uncompressed'. */ oth = &cs->cs_tcp; if(ip->version != cs->cs_ip.version || ip->ihl != cs->cs_ip.ihl || ip->tos != cs->cs_ip.tos || (ip->frag_off & htons(0x4000)) != (cs->cs_ip.frag_off & htons(0x4000)) || ip->ttl != cs->cs_ip.ttl || th->doff != cs->cs_tcp.doff || (ip->ihl > 5 && memcmp(ip+1,cs->cs_ipopt,((ip->ihl)-5)*4) != 0) || (th->doff > 5 && memcmp(th+1,cs->cs_tcpopt,((th->doff)-5)*4) != 0)){ goto uncompressed; } /* * Figure out which of the changing fields changed. The * receiver expects changes in the order: urgent, window, * ack, seq (the order minimizes the number of temporaries * needed in this section of code). */ if(th->urg){ deltaS = ntohs(th->urg_ptr); cp = encode(cp,deltaS); changes |= NEW_U; } else if(th->urg_ptr != oth->urg_ptr){ /* argh! URG not set but urp changed -- a sensible * implementation should never do this but RFC793 * doesn't prohibit the change so we have to deal * with it. */ goto uncompressed; } if((deltaS = ntohs(th->window) - ntohs(oth->window)) != 0){ cp = encode(cp,deltaS); changes |= NEW_W; } if((deltaA = ntohl(th->ack_seq) - ntohl(oth->ack_seq)) != 0L){ if(deltaA > 0x0000ffff) goto uncompressed; cp = encode(cp,deltaA); changes |= NEW_A; } if((deltaS = ntohl(th->seq) - ntohl(oth->seq)) != 0L){ if(deltaS > 0x0000ffff) goto uncompressed; cp = encode(cp,deltaS); changes |= NEW_S; } switch(changes){ case 0: /* Nothing changed. If this packet contains data and the * last one didn't, this is probably a data packet following * an ack (normal on an interactive connection) and we send * it compressed. Otherwise it's probably a retransmit, * retransmitted ack or window probe. Send it uncompressed * in case the other side missed the compressed version. */ if(ip->tot_len != cs->cs_ip.tot_len && ntohs(cs->cs_ip.tot_len) == hlen) break; goto uncompressed; case SPECIAL_I: case SPECIAL_D: /* actual changes match one of our special case encodings -- * send packet uncompressed. */ goto uncompressed; case NEW_S|NEW_A: if(deltaS == deltaA && deltaS == ntohs(cs->cs_ip.tot_len) - hlen){ /* special case for echoed terminal traffic */ changes = SPECIAL_I; cp = new_seq; } break; case NEW_S: if(deltaS == ntohs(cs->cs_ip.tot_len) - hlen){ /* special case for data xfer */ changes = SPECIAL_D; cp = new_seq; } break; } deltaS = ntohs(ip->id) - ntohs(cs->cs_ip.id); if(deltaS != 1){ cp = encode(cp,deltaS); changes |= NEW_I; } if(th->psh) changes |= TCP_PUSH_BIT; /* Grab the cksum before we overwrite it below. Then update our * state with this packet's header. */ csum = th->check; memcpy(&cs->cs_ip,ip,20); memcpy(&cs->cs_tcp,th,20); /* We want to use the original packet as our compressed packet. * (cp - new_seq) is the number of bytes we need for compressed * sequence numbers. In addition we need one byte for the change * mask, one for the connection id and two for the tcp checksum. * So, (cp - new_seq) + 4 bytes of header are needed. */ deltaS = cp - new_seq; if(compress_cid == 0 || comp->xmit_current != cs->cs_this){ cp = ocp; *cpp = ocp; *cp++ = changes | NEW_C; *cp++ = cs->cs_this; comp->xmit_current = cs->cs_this; } else { cp = ocp; *cpp = ocp; *cp++ = changes; } *(__sum16 *)cp = csum; cp += 2; /* deltaS is now the size of the change section of the compressed header */ memcpy(cp,new_seq,deltaS); /* Write list of deltas */ memcpy(cp+deltaS,icp+hlen,isize-hlen); comp->sls_o_compressed++; ocp[0] |= SL_TYPE_COMPRESSED_TCP; return isize - hlen + deltaS + (cp - ocp); /* Update connection state cs & send uncompressed packet (i.e., * a regular ip/tcp packet but with the 'conversation id' we hope * to use on future compressed packets in the protocol field). */ uncompressed: memcpy(&cs->cs_ip,ip,20); memcpy(&cs->cs_tcp,th,20); if (ip->ihl > 5) memcpy(cs->cs_ipopt, ip+1, ((ip->ihl) - 5) * 4); if (th->doff > 5) memcpy(cs->cs_tcpopt, th+1, ((th->doff) - 5) * 4); comp->xmit_current = cs->cs_this; comp->sls_o_uncompressed++; memcpy(ocp, icp, isize); *cpp = ocp; ocp[9] = cs->cs_this; ocp[0] |= SL_TYPE_UNCOMPRESSED_TCP; return isize; } int slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize) { int changes; long x; struct tcphdr *thp; struct iphdr *ip; struct cstate *cs; int len, hdrlen; unsigned char *cp = icp; /* We've got a compressed packet; read the change byte */ comp->sls_i_compressed++; if(isize < 3){ comp->sls_i_error++; return 0; } changes = *cp++; if(changes & NEW_C){ /* Make sure the state index is in range, then grab the state. * If we have a good state index, clear the 'discard' flag. */ x = *cp++; /* Read conn index */ if(x < 0 || x > comp->rslot_limit) goto bad; /* Check if the cstate is initialized */ if (!comp->rstate[x].initialized) goto bad; comp->flags &=~ SLF_TOSS; comp->recv_current = x; } else { /* this packet has an implicit state index. If we've * had a line error since the last time we got an * explicit state index, we have to toss the packet. */ if(comp->flags & SLF_TOSS){ comp->sls_i_tossed++; return 0; } } cs = &comp->rstate[comp->recv_current]; thp = &cs->cs_tcp; ip = &cs->cs_ip; thp->check = *(__sum16 *)cp; cp += 2; thp->psh = (changes & TCP_PUSH_BIT) ? 1 : 0; /* * we can use the same number for the length of the saved header and * the current one, because the packet wouldn't have been sent * as compressed unless the options were the same as the previous one */ hdrlen = ip->ihl * 4 + thp->doff * 4; switch(changes & SPECIALS_MASK){ case SPECIAL_I: /* Echoed terminal traffic */ { short i; i = ntohs(ip->tot_len) - hdrlen; thp->ack_seq = htonl( ntohl(thp->ack_seq) + i); thp->seq = htonl( ntohl(thp->seq) + i); } break; case SPECIAL_D: /* Unidirectional data */ thp->seq = htonl( ntohl(thp->seq) + ntohs(ip->tot_len) - hdrlen); break; default: if(changes & NEW_U){ thp->urg = 1; if((x = decode(&cp)) == -1) { goto bad; } thp->urg_ptr = htons(x); } else thp->urg = 0; if(changes & NEW_W){ if((x = decode(&cp)) == -1) { goto bad; } thp->window = htons( ntohs(thp->window) + x); } if(changes & NEW_A){ if((x = decode(&cp)) == -1) { goto bad; } thp->ack_seq = htonl( ntohl(thp->ack_seq) + x); } if(changes & NEW_S){ if((x = decode(&cp)) == -1) { goto bad; } thp->seq = htonl( ntohl(thp->seq) + x); } break; } if(changes & NEW_I){ if((x = decode(&cp)) == -1) { goto bad; } ip->id = htons (ntohs (ip->id) + x); } else ip->id = htons (ntohs (ip->id) + 1); /* * At this point, cp points to the first byte of data in the * packet. Put the reconstructed TCP and IP headers back on the * packet. Recalculate IP checksum (but not TCP checksum). */ len = isize - (cp - icp); if (len < 0) goto bad; len += hdrlen; ip->tot_len = htons(len); ip->check = 0; memmove(icp + hdrlen, cp, len - hdrlen); cp = icp; memcpy(cp, ip, 20); cp += 20; if (ip->ihl > 5) { memcpy(cp, cs->cs_ipopt, (ip->ihl - 5) * 4); cp += (ip->ihl - 5) * 4; } put_unaligned(ip_fast_csum(icp, ip->ihl), &((struct iphdr *)icp)->check); memcpy(cp, thp, 20); cp += 20; if (thp->doff > 5) { memcpy(cp, cs->cs_tcpopt, ((thp->doff) - 5) * 4); cp += ((thp->doff) - 5) * 4; } return len; bad: comp->sls_i_error++; return slhc_toss( comp ); } int slhc_remember(struct slcompress *comp, unsigned char *icp, int isize) { const struct tcphdr *th; unsigned char index; struct iphdr *iph; struct cstate *cs; unsigned int ihl; /* The packet is shorter than a legal IP header. * Also make sure isize is positive. */ if (isize < (int)sizeof(struct iphdr)) { runt: comp->sls_i_runt++; return slhc_toss(comp); } iph = (struct iphdr *)icp; /* Peek at the IP header's IHL field to find its length */ ihl = iph->ihl; /* The IP header length field is too small, * or packet is shorter than the IP header followed * by minimal tcp header. */ if (ihl < 5 || isize < ihl * 4 + sizeof(struct tcphdr)) goto runt; index = iph->protocol; iph->protocol = IPPROTO_TCP; if (ip_fast_csum(icp, ihl)) { /* Bad IP header checksum; discard */ comp->sls_i_badcheck++; return slhc_toss(comp); } if (index > comp->rslot_limit) { comp->sls_i_error++; return slhc_toss(comp); } th = (struct tcphdr *)(icp + ihl * 4); if (th->doff < sizeof(struct tcphdr) / 4) goto runt; if (isize < ihl * 4 + th->doff * 4) goto runt; /* Update local state */ cs = &comp->rstate[comp->recv_current = index]; comp->flags &=~ SLF_TOSS; memcpy(&cs->cs_ip, iph, sizeof(*iph)); memcpy(&cs->cs_tcp, th, sizeof(*th)); if (ihl > 5) memcpy(cs->cs_ipopt, &iph[1], (ihl - 5) * 4); if (th->doff > 5) memcpy(cs->cs_tcpopt, &th[1], (th->doff - 5) * 4); cs->cs_hsize = ihl*2 + th->doff*2; cs->initialized = true; /* Put headers back on packet * Neither header checksum is recalculated */ comp->sls_i_uncompressed++; return isize; } int slhc_toss(struct slcompress *comp) { if ( comp == NULLSLCOMPR ) return 0; comp->flags |= SLF_TOSS; return 0; } #else /* CONFIG_INET */ int slhc_toss(struct slcompress *comp) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_toss"); return -EINVAL; } int slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_uncompress"); return -EINVAL; } int slhc_compress(struct slcompress *comp, unsigned char *icp, int isize, unsigned char *ocp, unsigned char **cpp, int compress_cid) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_compress"); return -EINVAL; } int slhc_remember(struct slcompress *comp, unsigned char *icp, int isize) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_remember"); return -EINVAL; } void slhc_free(struct slcompress *comp) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_free"); } struct slcompress * slhc_init(int rslots, int tslots) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_init"); return NULL; } #endif /* CONFIG_INET */ /* VJ header compression */ EXPORT_SYMBOL(slhc_init); EXPORT_SYMBOL(slhc_free); EXPORT_SYMBOL(slhc_remember); EXPORT_SYMBOL(slhc_compress); EXPORT_SYMBOL(slhc_uncompress); EXPORT_SYMBOL(slhc_toss); MODULE_DESCRIPTION("Compression helpers for SLIP (serial line)"); MODULE_LICENSE("Dual BSD/GPL");
3244 6976 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _KERNEL_EVENTS_INTERNAL_H #define _KERNEL_EVENTS_INTERNAL_H #include <linux/hardirq.h> #include <linux/uaccess.h> #include <linux/refcount.h> /* Buffer handling */ #define RING_BUFFER_WRITABLE 0x01 struct perf_buffer { refcount_t refcount; struct rcu_head rcu_head; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ #endif int nr_pages; /* nr of data pages */ int overwrite; /* can overwrite itself */ int paused; /* can write into ring buffer */ atomic_t poll; /* POLL_ for wakeups */ local_t head; /* write position */ unsigned int nest; /* nested writers */ local_t events; /* event limit */ local_t wakeup; /* wakeup stamp */ local_t lost; /* nr records lost */ long watermark; /* wakeup watermark */ long aux_watermark; /* poll crap */ spinlock_t event_lock; struct list_head event_list; atomic_t mmap_count; unsigned long mmap_locked; struct user_struct *mmap_user; /* AUX area */ struct mutex aux_mutex; long aux_head; unsigned int aux_nest; long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; atomic_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; int aux_in_sampling; int aux_in_pause_resume; void **aux_pages; void *aux_priv; struct perf_event_mmap_page *user_page; void *data_pages[]; }; extern void rb_free(struct perf_buffer *rb); static inline void rb_free_rcu(struct rcu_head *rcu_head) { struct perf_buffer *rb; rb = container_of(rcu_head, struct perf_buffer, rcu_head); rb_free(rb); } static inline void rb_toggle_paused(struct perf_buffer *rb, bool pause) { if (!pause && rb->nr_pages) rb->paused = 0; else rb->paused = 1; } extern struct perf_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); extern int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags); extern void rb_free_aux(struct perf_buffer *rb); extern struct perf_buffer *ring_buffer_get(struct perf_event *event); extern void ring_buffer_put(struct perf_buffer *rb); static inline bool rb_has_aux(struct perf_buffer *rb) { return !!rb->aux_nr_pages; } void perf_event_aux_event(struct perf_event *event, unsigned long head, unsigned long size, u64 flags); extern struct page * perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff); #ifdef CONFIG_PERF_USE_VMALLOC /* * Back perf_mmap() with vmalloc memory. * * Required for architectures that have d-cache aliasing issues. */ static inline int page_order(struct perf_buffer *rb) { return rb->page_order; } #else static inline int page_order(struct perf_buffer *rb) { return 0; } #endif static inline int data_page_nr(struct perf_buffer *rb) { return rb->nr_pages << page_order(rb); } static inline unsigned long perf_data_size(struct perf_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } static inline unsigned long perf_aux_size(struct perf_buffer *rb) { return (unsigned long)rb->aux_nr_pages << PAGE_SHIFT; } #define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \ { \ unsigned long size, written; \ \ do { \ size = min(handle->size, len); \ written = memcpy_func(__VA_ARGS__); \ written = size - written; \ \ len -= written; \ handle->addr += written; \ if (advance_buf) \ buf += written; \ handle->size -= written; \ if (!handle->size) { \ struct perf_buffer *rb = handle->rb; \ \ handle->page++; \ handle->page &= rb->nr_pages - 1; \ handle->addr = rb->data_pages[handle->page]; \ handle->size = PAGE_SIZE << page_order(rb); \ } \ } while (len && written == size); \ \ return len; \ } #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ static inline unsigned long \ func_name(struct perf_output_handle *handle, \ const void *buf, unsigned long len) \ __DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size) static inline unsigned long __output_custom(struct perf_output_handle *handle, perf_copy_f copy_func, const void *buf, unsigned long len) { unsigned long orig_len = len; __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf, orig_len - len, size) } static inline unsigned long memcpy_common(void *dst, const void *src, unsigned long n) { memcpy(dst, src, n); return 0; } DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) static inline unsigned long memcpy_skip(void *dst, const void *src, unsigned long n) { return 0; } DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip) #ifndef arch_perf_out_copy_user #define arch_perf_out_copy_user arch_perf_out_copy_user static inline unsigned long arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) { unsigned long ret; pagefault_disable(); ret = __copy_from_user_inatomic(dst, src, n); pagefault_enable(); return ret; } #endif DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) static inline int get_recursion_context(u8 *recursion) { unsigned char rctx = interrupt_context_level(); if (recursion[rctx]) return -1; recursion[rctx]++; barrier(); return rctx; } static inline void put_recursion_context(u8 *recursion, unsigned char rctx) { barrier(); recursion[rctx]--; } #ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP static inline bool arch_perf_have_user_stack_dump(void) { return true; } #define perf_user_stack_pointer(regs) user_stack_pointer(regs) #else static inline bool arch_perf_have_user_stack_dump(void) { return false; } #define perf_user_stack_pointer(regs) 0 #endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */ #endif /* _KERNEL_EVENTS_INTERNAL_H */
13 191 16 5 5 5 61 62 31 63 64 64 64 8 125 34 34 58 28 5 1 15 15 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #ifndef _LINUX_SKMSG_H #define _LINUX_SKMSG_H #include <linux/bpf.h> #include <linux/filter.h> #include <linux/scatterlist.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/tcp.h> #include <net/strparser.h> #define MAX_MSG_FRAGS MAX_SKB_FRAGS #define NR_MSG_FRAG_IDS (MAX_MSG_FRAGS + 1) enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, __SK_NONE, }; struct sk_msg_sg { u32 start; u32 curr; u32 end; u32 size; u32 copybreak; DECLARE_BITMAP(copy, MAX_MSG_FRAGS + 2); /* The extra two elements: * 1) used for chaining the front and sections when the list becomes * partitioned (e.g. end < start). The crypto APIs require the * chaining; * 2) to chain tailer SG entries after the message. */ struct scatterlist data[MAX_MSG_FRAGS + 2]; }; /* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { struct sk_msg_sg sg; void *data; void *data_end; u32 apply_bytes; u32 cork_bytes; u32 flags; struct sk_buff *skb; struct sock *sk_redir; struct sock *sk; struct list_head list; }; struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; struct bpf_prog *skb_verdict; struct bpf_link *msg_parser_link; struct bpf_link *stream_parser_link; struct bpf_link *stream_verdict_link; struct bpf_link *skb_verdict_link; }; enum sk_psock_state_bits { SK_PSOCK_TX_ENABLED, SK_PSOCK_RX_STRP_ENABLED, }; struct sk_psock_link { struct list_head list; struct bpf_map *map; void *link_raw; }; struct sk_psock_work_state { u32 len; u32 off; }; struct sk_psock { struct sock *sk; struct sock *sk_redir; u32 apply_bytes; u32 cork_bytes; u32 eval; bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; spinlock_t ingress_lock; unsigned long state; struct list_head link; spinlock_t link_lock; refcount_t refcnt; void (*saved_unhash)(struct sock *sk); void (*saved_destroy)(struct sock *sk); void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); /* psock_update_sk_prot may be called with restore=false many times * so the handler must be safe for this case. It will be called * exactly once with restore=true when the psock is being destroyed * and psock refcnt is zero, but before an RCU grace period. */ int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; struct delayed_work work; struct sock *sk_pair; struct rcu_work rwork; }; int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce); int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, u32 off, u32 len); void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len); int sk_msg_free(struct sock *sk, struct sk_msg *msg); int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg); void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes); void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes); int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { WARN_ON(i == msg->sg.end && bytes); } static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) { if (psock->apply_bytes) { if (psock->apply_bytes < bytes) psock->apply_bytes = 0; else psock->apply_bytes -= bytes; } } static inline u32 sk_msg_iter_dist(u32 start, u32 end) { return end >= start ? end - start : end + (NR_MSG_FRAG_IDS - start); } #define sk_msg_iter_var_prev(var) \ do { \ if (var == 0) \ var = NR_MSG_FRAG_IDS - 1; \ else \ var--; \ } while (0) #define sk_msg_iter_var_next(var) \ do { \ var++; \ if (var == NR_MSG_FRAG_IDS) \ var = 0; \ } while (0) #define sk_msg_iter_prev(msg, which) \ sk_msg_iter_var_prev(msg->sg.which) #define sk_msg_iter_next(msg, which) \ sk_msg_iter_var_next(msg->sg.which) static inline void sk_msg_init(struct sk_msg *msg) { BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS); memset(msg, 0, sizeof(*msg)); sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); } static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, int which, u32 size) { dst->sg.data[which] = src->sg.data[which]; dst->sg.data[which].length = size; dst->sg.size += size; src->sg.size -= size; src->sg.data[which].length -= size; src->sg.data[which].offset += size; } static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) { memcpy(dst, src, sizeof(*src)); sk_msg_init(src); } static inline bool sk_msg_full(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end) == MAX_MSG_FRAGS; } static inline u32 sk_msg_elem_used(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end); } static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) { return &msg->sg.data[which]; } static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which) { return msg->sg.data[which]; } static inline struct page *sk_msg_page(struct sk_msg *msg, int which) { return sg_page(sk_msg_elem(msg, which)); } static inline bool sk_msg_to_ingress(const struct sk_msg *msg) { return msg->flags & BPF_F_INGRESS; } static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) { struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); if (test_bit(msg->sg.start, msg->sg.copy)) { msg->data = NULL; msg->data_end = NULL; } else { msg->data = sg_virt(sge); msg->data_end = msg->data + sge->length; } } static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, u32 len, u32 offset) { struct scatterlist *sge; get_page(page); sge = sk_msg_elem(msg, msg->sg.end); sg_set_page(sge, page, len, offset); sg_unmark_end(sge); __set_bit(msg->sg.end, msg->sg.copy); msg->sg.size += len; sk_msg_iter_next(msg, end); } static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { if (copy_state) __set_bit(i, msg->sg.copy); else __clear_bit(i, msg->sg.copy); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; } while (1); } static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, true); } static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, false); } static inline struct sk_psock *sk_psock(const struct sock *sk) { return __rcu_dereference_sk_user_data_with_flags(sk, SK_USER_DATA_PSOCK); } static inline void sk_psock_set_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { set_bit(bit, &psock->state); } static inline void sk_psock_clear_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { clear_bit(bit, &psock->state); } static inline bool sk_psock_test_state(const struct sk_psock *psock, enum sk_psock_state_bits bit) { return test_bit(bit, &psock->state); } static inline void sock_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); kfree_skb(skb); } static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) list_add_tail(&msg->list, &psock->ingress_msg); else { sk_msg_free(psock->sk, msg); kfree(msg); } spin_unlock_bh(&psock->ingress_lock); } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); if (msg) list_del(&msg->list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock, struct sk_msg *msg) { struct sk_msg *ret; spin_lock_bh(&psock->ingress_lock); if (list_is_last(&msg->list, &psock->ingress_msg)) ret = NULL; else ret = list_next_entry(msg, list); spin_unlock_bh(&psock->ingress_lock); return ret; } static inline bool sk_psock_queue_empty(const struct sk_psock *psock) { return psock ? list_empty(&psock->ingress_msg) : true; } static inline void kfree_sk_msg(struct sk_msg *msg) { if (msg->skb) consume_skb(msg->skb); kfree(msg); } static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; sk->sk_err = err; sk_error_report(sk); } struct sk_psock *sk_psock_init(struct sock *sk, int node); void sk_psock_stop(struct sk_psock *psock); #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); #else static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { return -EOPNOTSUPP; } static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { } static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { } #endif void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). The typecast is * intentional to enforce typesafety. */ #define sk_psock_init_link() \ ((struct sk_psock_link *)kzalloc(sizeof(struct sk_psock_link), \ GFP_ATOMIC | __GFP_NOWARN)) static inline void sk_psock_free_link(struct sk_psock_link *link) { kfree(link); } struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); static inline void sk_psock_cork_free(struct sk_psock *psock) { if (psock->cork) { sk_msg_free(psock->sk, psock->cork); kfree(psock->cork); psock->cork = NULL; } } static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, true); } static inline struct sk_psock *sk_psock_get(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock && !refcount_inc_not_zero(&psock->refcnt)) psock = NULL; rcu_read_unlock(); return psock; } void sk_psock_drop(struct sock *sk, struct sk_psock *psock); static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) { if (refcount_dec_and_test(&psock->refcnt)) sk_psock_drop(sk, psock); } static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { read_lock_bh(&sk->sk_callback_lock); if (psock->saved_data_ready) psock->saved_data_ready(sk); else sk->sk_data_ready(sk); read_unlock_bh(&sk->sk_callback_lock); } static inline void psock_set_prog(struct bpf_prog **pprog, struct bpf_prog *prog) { prog = xchg(pprog, prog); if (prog) bpf_prog_put(prog); } static inline int psock_replace_prog(struct bpf_prog **pprog, struct bpf_prog *prog, struct bpf_prog *old) { if (cmpxchg(pprog, old, prog) != old) return -ENOENT; if (old) bpf_prog_put(old); return 0; } static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); static inline bool sk_psock_strp_enabled(struct sk_psock *psock) { if (!psock) return false; return !!psock->saved_data_ready; } #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) /* We only have two bits so far. */ #define BPF_F_PTR_MASK ~(BPF_F_INGRESS | BPF_F_STRPARSER) static inline bool skb_bpf_strparser(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_STRPARSER; } static inline void skb_bpf_set_strparser(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_STRPARSER; } static inline bool skb_bpf_ingress(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_INGRESS; } static inline void skb_bpf_set_ingress(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_INGRESS; } static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, bool ingress) { skb->_sk_redir = (unsigned long)sk_redir; if (ingress) skb->_sk_redir |= BPF_F_INGRESS; } static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return (struct sock *)(sk_redir & BPF_F_PTR_MASK); } static inline void skb_bpf_redirect_clear(struct sk_buff *skb) { skb->_sk_redir = 0; } #endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */
26 353 354 5 353 353 353 338 341 313 59 341 339 341 87 89 89 89 89 308 1 312 307 306 5235 5211 307 312 308 310 312 26 26 26 26 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. * Authors: David Chinner and Glauber Costa * * Generic LRU infrastructure */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/list_lru.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/memcontrol.h> #include "slab.h" #include "internal.h" #ifdef CONFIG_MEMCG static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); static inline bool list_lru_memcg_aware(struct list_lru *lru) { return lru->memcg_aware; } static void list_lru_register(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_add(&lru->list, &memcg_list_lrus); mutex_unlock(&list_lrus_mutex); } static void list_lru_unregister(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_del(&lru->list); mutex_unlock(&list_lrus_mutex); } static int lru_shrinker_id(struct list_lru *lru) { return lru->shrinker_id; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { if (list_lru_memcg_aware(lru) && idx >= 0) { struct list_lru_memcg *mlru = xa_load(&lru->xa, idx); return mlru ? &mlru->node[nid] : NULL; } return &lru->node[nid].lru; } static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l; long nr_items; rcu_read_lock(); again: l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); if (likely(l)) { if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); nr_items = READ_ONCE(l->nr_items); if (likely(nr_items != LONG_MIN)) { WARN_ON(nr_items < 0); rcu_read_unlock(); return l; } if (irq) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); } /* * Caller may simply bail out if raced with reparenting or * may iterate through the list_lru and expect empty slots. */ if (skip_empty) { rcu_read_unlock(); return NULL; } VM_WARN_ON(!css_is_dying(&memcg->css)); memcg = parent_mem_cgroup(memcg); goto again; } static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) { if (irq_off) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); } #else static void list_lru_register(struct list_lru *lru) { } static void list_lru_unregister(struct list_lru *lru) { } static int lru_shrinker_id(struct list_lru *lru) { return -1; } static inline bool list_lru_memcg_aware(struct list_lru *lru) { return false; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l = &lru->node[nid].lru; if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); return l; } static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) { if (irq_off) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); } #endif /* CONFIG_MEMCG */ /* The caller must ensure the memcg lifetime. */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); if (!l) return false; if (list_empty(item)) { list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); unlock_list_lru(l, false); atomic_long_inc(&nlru->nr_items); return true; } unlock_list_lru(l, false); return false; } bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { bool ret; int nid = page_to_nid(virt_to_page(item)); if (list_lru_memcg_aware(lru)) { rcu_read_lock(); ret = list_lru_add(lru, item, nid, mem_cgroup_from_slab_obj(item)); rcu_read_unlock(); } else { ret = list_lru_add(lru, item, nid, NULL); } return ret; } EXPORT_SYMBOL_GPL(list_lru_add_obj); /* The caller must ensure the memcg lifetime. */ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); if (!l) return false; if (!list_empty(item)) { list_del_init(item); l->nr_items--; unlock_list_lru(l, false); atomic_long_dec(&nlru->nr_items); return true; } unlock_list_lru(l, false); return false; } bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) { bool ret; int nid = page_to_nid(virt_to_page(item)); if (list_lru_memcg_aware(lru)) { rcu_read_lock(); ret = list_lru_del(lru, item, nid, mem_cgroup_from_slab_obj(item)); rcu_read_unlock(); } else { ret = list_lru_del(lru, item, nid, NULL); } return ret; } EXPORT_SYMBOL_GPL(list_lru_del_obj); void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head) { list_move(item, head); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate_move); unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { struct list_lru_one *l; long count; rcu_read_lock(); l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); count = l ? READ_ONCE(l->nr_items) : 0; rcu_read_unlock(); if (unlikely(count < 0)) count = 0; return count; } EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned long list_lru_count_node(struct list_lru *lru, int nid) { struct list_lru_node *nlru; nlru = &lru->node[nid]; return atomic_long_read(&nlru->nr_items); } EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk, bool irq_off) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l = NULL; struct list_head *item, *n; unsigned long isolated = 0; restart: l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true); if (!l) return isolated; list_for_each_safe(item, n, &l->list) { enum lru_status ret; /* * decrement nr_to_walk first so that we don't livelock if we * get stuck on large numbers of LRU_RETRY items */ if (!*nr_to_walk) break; --*nr_to_walk; ret = isolate(item, l, cb_arg); switch (ret) { /* * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru * lock. List traversal will have to restart from scratch. */ case LRU_RETRY: goto restart; case LRU_REMOVED_RETRY: fallthrough; case LRU_REMOVED: isolated++; atomic_long_dec(&nlru->nr_items); if (ret == LRU_REMOVED_RETRY) goto restart; break; case LRU_ROTATE: list_move_tail(item, &l->list); break; case LRU_SKIP: break; case LRU_STOP: goto out; default: BUG(); } } unlock_list_lru(l, irq_off); out: return isolated; } unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, false); } EXPORT_SYMBOL_GPL(list_lru_walk_one); unsigned long list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, true); } unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { long isolated = 0; isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, nr_to_walk); #ifdef CONFIG_MEMCG if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { struct list_lru_memcg *mlru; struct mem_cgroup *memcg; unsigned long index; xa_for_each(&lru->xa, index, mlru) { rcu_read_lock(); memcg = mem_cgroup_from_id(index); if (!mem_cgroup_tryget(memcg)) { rcu_read_unlock(); continue; } rcu_read_unlock(); isolated += __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, false); mem_cgroup_put(memcg); if (*nr_to_walk <= 0) break; } } #endif return isolated; } EXPORT_SYMBOL_GPL(list_lru_walk_node); static void init_one_lru(struct list_lru *lru, struct list_lru_one *l) { INIT_LIST_HEAD(&l->list); spin_lock_init(&l->lock); l->nr_items = 0; #ifdef CONFIG_LOCKDEP if (lru->key) lockdep_set_class(&l->lock, lru->key); #endif } #ifdef CONFIG_MEMCG static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp) { int nid; struct list_lru_memcg *mlru; mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp); if (!mlru) return NULL; for_each_node(nid) init_one_lru(lru, &mlru->node[nid]); return mlru; } static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { if (memcg_aware) xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ); lru->memcg_aware = memcg_aware; } static void memcg_destroy_list_lru(struct list_lru *lru) { XA_STATE(xas, &lru->xa, 0); struct list_lru_memcg *mlru; if (!list_lru_memcg_aware(lru)) return; xas_lock_irq(&xas); xas_for_each(&xas, mlru, ULONG_MAX) { kfree(mlru); xas_store(&xas, NULL); } xas_unlock_irq(&xas); } static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, struct list_lru_one *src, struct mem_cgroup *dst_memcg) { int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *dst; spin_lock_irq(&src->lock); dst = list_lru_from_memcg_idx(lru, nid, dst_idx); spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING); list_splice_init(&src->list, &dst->list); if (src->nr_items) { dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); } /* Mark the list_lru_one dead */ src->nr_items = LONG_MIN; spin_unlock(&dst->lock); spin_unlock_irq(&src->lock); } void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) { struct list_lru *lru; int i; mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &memcg_list_lrus, list) { struct list_lru_memcg *mlru; XA_STATE(xas, &lru->xa, memcg->kmemcg_id); /* * Lock the Xarray to ensure no on going list_lru_memcg * allocation and further allocation will see css_is_dying(). */ xas_lock_irq(&xas); mlru = xas_store(&xas, NULL); xas_unlock_irq(&xas); if (!mlru) continue; /* * With Xarray value set to NULL, holding the lru lock below * prevents list_lru_{add,del,isolate} from touching the lru, * safe to reparent. */ for_each_node(i) memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent); /* * Here all list_lrus corresponding to the cgroup are guaranteed * to remain empty, we can safely free this lru, any further * memcg_list_lru_alloc() call will simply bail out. */ kvfree_rcu(mlru, rcu); } mutex_unlock(&list_lrus_mutex); } static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, struct list_lru *lru) { int idx = memcg->kmemcg_id; return idx < 0 || xa_load(&lru->xa, idx); } int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp) { unsigned long flags; struct list_lru_memcg *mlru; struct mem_cgroup *pos, *parent; XA_STATE(xas, &lru->xa, 0); if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) return 0; gfp &= GFP_RECLAIM_MASK; /* * Because the list_lru can be reparented to the parent cgroup's * list_lru, we should make sure that this cgroup and all its * ancestors have allocated list_lru_memcg. */ do { /* * Keep finding the farest parent that wasn't populated * until found memcg itself. */ pos = memcg; parent = parent_mem_cgroup(pos); while (!memcg_list_lru_allocated(parent, lru)) { pos = parent; parent = parent_mem_cgroup(pos); } mlru = memcg_init_list_lru_one(lru, gfp); if (!mlru) return -ENOMEM; xas_set(&xas, pos->kmemcg_id); do { xas_lock_irqsave(&xas, flags); if (!xas_load(&xas) && !css_is_dying(&pos->css)) { xas_store(&xas, mlru); if (!xas_error(&xas)) mlru = NULL; } xas_unlock_irqrestore(&xas, flags); } while (xas_nomem(&xas, gfp)); if (mlru) kfree(mlru); } while (pos != memcg && !css_is_dying(&pos->css)); return xas_error(&xas); } #else static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { } static void memcg_destroy_list_lru(struct list_lru *lru) { } #endif /* CONFIG_MEMCG */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker) { int i; #ifdef CONFIG_MEMCG if (shrinker) lru->shrinker_id = shrinker->id; else lru->shrinker_id = -1; if (mem_cgroup_kmem_disabled()) memcg_aware = false; #endif lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); if (!lru->node) return -ENOMEM; for_each_node(i) init_one_lru(lru, &lru->node[i].lru); memcg_init_list_lru(lru, memcg_aware); list_lru_register(lru); return 0; } EXPORT_SYMBOL_GPL(__list_lru_init); void list_lru_destroy(struct list_lru *lru) { /* Already destroyed or not yet initialized? */ if (!lru->node) return; list_lru_unregister(lru); memcg_destroy_list_lru(lru); kfree(lru->node); lru->node = NULL; #ifdef CONFIG_MEMCG lru->shrinker_id = -1; #endif } EXPORT_SYMBOL_GPL(list_lru_destroy);
13 13 41 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 // SPDX-License-Identifier: GPL-2.0 /* * Wifi Band Exclusion Interface for WLAN * Copyright (C) 2023 Advanced Micro Devices * */ #include <linux/acpi_amd_wbrf.h> #include <linux/units.h> #include <net/cfg80211.h> #include "ieee80211_i.h" void ieee80211_check_wbrf_support(struct ieee80211_local *local) { struct wiphy *wiphy = local->hw.wiphy; struct device *dev; if (!wiphy) return; dev = wiphy->dev.parent; if (!dev) return; local->wbrf_supported = acpi_amd_wbrf_supported_producer(dev); } static void get_chan_freq_boundary(u32 center_freq, u32 bandwidth, u64 *start, u64 *end) { bandwidth *= KHZ_PER_MHZ; center_freq *= KHZ_PER_MHZ; *start = center_freq - bandwidth / 2; *end = center_freq + bandwidth / 2; /* Frequency in Hz is expected */ *start = *start * HZ_PER_KHZ; *end = *end * HZ_PER_KHZ; } static void get_ranges_from_chandef(struct cfg80211_chan_def *chandef, struct wbrf_ranges_in_out *ranges_in) { u64 start_freq1, end_freq1; u64 start_freq2, end_freq2; int bandwidth; bandwidth = nl80211_chan_width_to_mhz(chandef->width); get_chan_freq_boundary(chandef->center_freq1, bandwidth, &start_freq1, &end_freq1); ranges_in->band_list[0].start = start_freq1; ranges_in->band_list[0].end = end_freq1; ranges_in->num_of_ranges = 1; if (chandef->width == NL80211_CHAN_WIDTH_80P80) { get_chan_freq_boundary(chandef->center_freq2, bandwidth, &start_freq2, &end_freq2); ranges_in->band_list[1].start = start_freq2; ranges_in->band_list[1].end = end_freq2; ranges_in->num_of_ranges++; } } void ieee80211_add_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef) { struct wbrf_ranges_in_out ranges_in = {0}; struct device *dev; if (!local->wbrf_supported) return; dev = local->hw.wiphy->dev.parent; get_ranges_from_chandef(chandef, &ranges_in); acpi_amd_wbrf_add_remove(dev, WBRF_RECORD_ADD, &ranges_in); } void ieee80211_remove_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef) { struct wbrf_ranges_in_out ranges_in = {0}; struct device *dev; if (!local->wbrf_supported) return; dev = local->hw.wiphy->dev.parent; get_ranges_from_chandef(chandef, &ranges_in); acpi_amd_wbrf_add_remove(dev, WBRF_RECORD_REMOVE, &ranges_in); }
33 33 1 1 33 33 33 34 33 1 33 1 1 33 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/plist.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/freezer.h> #include "futex.h" /* * READ this before attempting to hack on futexes! * * Basic futex operation and ordering guarantees * ============================================= * * The waiter reads the futex value in user space and calls * futex_wait(). This function computes the hash bucket and acquires * the hash bucket lock. After that it reads the futex user space value * again and verifies that the data has not changed. If it has not changed * it enqueues itself into the hash bucket, releases the hash bucket lock * and schedules. * * The waker side modifies the user space value of the futex and calls * futex_wake(). This function computes the hash bucket and acquires the * hash bucket lock. Then it looks for waiters on that futex in the hash * bucket and wakes them. * * In futex wake up scenarios where no tasks are blocked on a futex, taking * the hb spinlock can be avoided and simply return. In order for this * optimization to work, ordering guarantees must exist so that the waiter * being added to the list is acknowledged when the list is concurrently being * checked by the waker, avoiding scenarios like the following: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * uval = *futex; * *futex = newval; * sys_futex(WAKE, futex); * futex_wake(futex); * if (queue_empty()) * return; * if (uval == val) * lock(hash_bucket(futex)); * queue(); * unlock(hash_bucket(futex)); * schedule(); * * This would cause the waiter on CPU 0 to wait forever because it * missed the transition of the user space value from val to newval * and the waker did not find the waiter in the hash bucket queue. * * The correct serialization ensures that a waiter either observes * the changed user space value before blocking or is woken by a * concurrent waker: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * * waiters++; (a) * smp_mb(); (A) <-- paired with -. * | * lock(hash_bucket(futex)); | * | * uval = *futex; | * | *futex = newval; * | sys_futex(WAKE, futex); * | futex_wake(futex); * | * `--------> smp_mb(); (B) * if (uval == val) * queue(); * unlock(hash_bucket(futex)); * schedule(); if (waiters) * lock(hash_bucket(futex)); * else wake_waiters(futex); * waiters--; (b) unlock(hash_bucket(futex)); * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write * to futex and the waiters read (see futex_hb_waiters_pending()). * * This yields the following case (where X:=waiters, Y:=futex): * * X = Y = 0 * * w[X]=1 w[Y]=1 * MB MB * r[Y]=y r[X]=x * * Which guarantees that x==0 && y==0 is impossible; which translates back into * the guarantee that we cannot both miss the futex variable change and the * enqueue. * * Note that a new waiter is accounted for in (a) even when it is possible that * the wait call can return error, in which case we backtrack from it in (b). * Refer to the comment in futex_q_lock(). * * Similarly, in order to account for waiters being requeued on another * address we always increment the waiters for the destination bucket before * acquiring the lock. It then decrements them again after releasing it - * the code that actually moves the futex(es) between hash buckets (requeue_futex) * will do the additional required waiter count housekeeping. This is done for * double_lock_hb() and double_unlock_hb(), respectively. */ bool __futex_wake_mark(struct futex_q *q) { if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) return false; __futex_unqueue(q); /* * The waiting task can free the futex_q as soon as q->lock_ptr = NULL * is written, without taking any locks. This is possible in the event * of a spurious wakeup, for example. A memory barrier is required here * to prevent the following store to lock_ptr from getting ahead of the * plist_del in __futex_unqueue(). */ smp_store_release(&q->lock_ptr, NULL); return true; } /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. Callers * must ensure to later call wake_up_q() for the actual * wakeups to occur. */ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) { struct task_struct *p = q->task; get_task_struct(p); if (!__futex_wake_mark(q)) { put_task_struct(p); return; } /* * Queue the task for later wakeup for after we've released * the hb->lock. */ wake_q_add_safe(wake_q, p); } /* * Wake up waiters matching bitset queued on this futex (uaddr). */ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; DEFINE_WAKE_Q(wake_q); int ret; if (!bitset) return -EINVAL; ret = get_futex_key(uaddr, flags, &key, FUTEX_READ); if (unlikely(ret != 0)) return ret; if ((flags & FLAGS_STRICT) && !nr_wake) return 0; hb = futex_hash(&key); /* Make sure we really have tasks to wakeup */ if (!futex_hb_waiters_pending(hb)) return ret; spin_lock(&hb->lock); plist_for_each_entry_safe(this, next, &hb->chain, list) { if (futex_match (&this->key, &key)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } /* Check if one of the bits is set in both bitsets */ if (!(this->bitset & bitset)) continue; this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } spin_unlock(&hb->lock); wake_up_q(&wake_q); return ret; } static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) { unsigned int op = (encoded_op & 0x70000000) >> 28; unsigned int cmp = (encoded_op & 0x0f000000) >> 24; int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { if (oparg < 0 || oparg > 31) { char comm[sizeof(current->comm)]; /* * kill this print and return -EINVAL when userspace * is sane again */ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", get_task_comm(comm, current), oparg); oparg &= 31; } oparg = 1 << oparg; } pagefault_disable(); ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); pagefault_enable(); if (ret) return ret; switch (cmp) { case FUTEX_OP_CMP_EQ: return oldval == cmparg; case FUTEX_OP_CMP_NE: return oldval != cmparg; case FUTEX_OP_CMP_LT: return oldval < cmparg; case FUTEX_OP_CMP_GE: return oldval >= cmparg; case FUTEX_OP_CMP_LE: return oldval <= cmparg; case FUTEX_OP_CMP_GT: return oldval > cmparg; default: return -ENOSYS; } } /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; int ret, op_ret; DEFINE_WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) return ret; hb1 = futex_hash(&key1); hb2 = futex_hash(&key2); retry_private: double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { double_unlock_hb(hb1, hb2); if (!IS_ENABLED(CONFIG_MMU) || unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { /* * we don't get EFAULT from MMU faults if we don't have * an MMU, but we might get them from range checking */ ret = op_ret; return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) return ret; } cond_resched(); if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (futex_match (&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } if (op_ret > 0) { op_ret = 0; plist_for_each_entry_safe(this, next, &hb2->chain, list) { if (futex_match (&this->key, &key2)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++op_ret >= nr_wake2) break; } } ret += op_ret; } out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); return ret; } static long futex_wait_restart(struct restart_block *restart); /** * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal * @hb: the futex hash bucket, must be locked by the caller * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout */ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout) { /* * The task state is guaranteed to be set before another task can * wake it. set_current_state() is implemented using smp_store_mb() and * futex_queue() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. */ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); futex_queue(q, hb); /* Arm the timer */ if (timeout) hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); /* * If we have been removed from the hash list, then another task * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { /* * If the timer has already expired, current will already be * flagged for rescheduling. Only call schedule if there * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) schedule(); } __set_current_state(TASK_RUNNING); } /** * futex_unqueue_multiple - Remove various futexes from their hash bucket * @v: The list of futexes to unqueue * @count: Number of futexes in the list * * Helper to unqueue a list of futexes. This can't fail. * * Return: * - >=0 - Index of the last futex that was awoken; * - -1 - No futex was awoken */ int futex_unqueue_multiple(struct futex_vector *v, int count) { int ret = -1, i; for (i = 0; i < count; i++) { if (!futex_unqueue(&v[i].q)) ret = i; } return ret; } /** * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes * @vs: The futex list to wait on * @count: The size of the list * @woken: Index of the last woken futex, if any. Used to notify the * caller that it can return this index to userspace (return parameter) * * Prepare multiple futexes in a single step and enqueue them. This may fail if * the futex list is invalid or if any futex was already awoken. On success the * task is ready to interruptible sleep. * * Return: * - 1 - One of the futexes was woken by another thread * - 0 - Success * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL */ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) { struct futex_hash_bucket *hb; bool retry = false; int ret, i; u32 uval; /* * Enqueuing multiple futexes is tricky, because we need to enqueue * each futex on the list before dealing with the next one to avoid * deadlocking on the hash bucket. But, before enqueuing, we need to * make sure that current->state is TASK_INTERRUPTIBLE, so we don't * lose any wake events, which cannot be done before the get_futex_key * of the next key, because it calls get_user_pages, which can sleep. * Thus, we fetch the list of futexes keys in two steps, by first * pinning all the memory keys in the futex key, and only then we read * each key and queue the corresponding futex. * * Private futexes doesn't need to recalculate hash in retry, so skip * get_futex_key() when retrying. */ retry: for (i = 0; i < count; i++) { if (!(vs[i].w.flags & FLAGS_SHARED) && retry) continue; ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), vs[i].w.flags, &vs[i].q.key, FUTEX_READ); if (unlikely(ret)) return ret; } set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); for (i = 0; i < count; i++) { u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; struct futex_q *q = &vs[i].q; u32 val = vs[i].w.val; hb = futex_q_lock(q); ret = futex_get_value_locked(&uval, uaddr); if (!ret && uval == val) { /* * The bucket lock can't be held while dealing with the * next futex. Queue each futex at this moment so hb can * be unlocked. */ futex_queue(q, hb); continue; } futex_q_unlock(hb); __set_current_state(TASK_RUNNING); /* * Even if something went wrong, if we find out that a futex * was woken, we don't return error and return this index to * userspace */ *woken = futex_unqueue_multiple(vs, i); if (*woken >= 0) return 1; if (ret) { /* * If we need to handle a page fault, we need to do so * without any lock and any enqueued futex (otherwise * we could lose some wakeup). So we do it here, after * undoing all the work done so far. In success, we * retry all the work. */ if (get_user(uval, uaddr)) return -EFAULT; retry = true; goto retry; } if (uval != val) return -EWOULDBLOCK; } return 0; } /** * futex_sleep_multiple - Check sleeping conditions and sleep * @vs: List of futexes to wait for * @count: Length of vs * @to: Timeout * * Sleep if and only if the timeout hasn't expired and no futex on the list has * been woken up. */ static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to) { if (to && !to->task) return; for (; count; count--, vs++) { if (!READ_ONCE(vs->q.lock_ptr)) return; } schedule(); } /** * futex_wait_multiple - Prepare to wait on and enqueue several futexes * @vs: The list of futexes to wait on * @count: The number of objects * @to: Timeout before giving up and returning to userspace * * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function * sleeps on a group of futexes and returns on the first futex that is * wake, or after the timeout has elapsed. * * Return: * - >=0 - Hint to the futex that was awoken * - <0 - On error */ int futex_wait_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to) { int ret, hint = 0; if (to) hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); while (1) { ret = futex_wait_multiple_setup(vs, count, &hint); if (ret) { if (ret > 0) { /* A futex was woken during setup */ ret = hint; } return ret; } futex_sleep_multiple(vs, count, to); __set_current_state(TASK_RUNNING); ret = futex_unqueue_multiple(vs, count); if (ret >= 0) return ret; if (to && !to->task) return -ETIMEDOUT; else if (signal_pending(current)) return -ERESTARTSYS; /* * The final case is a spurious wakeup, for * which just retry. */ } } /** * futex_wait_setup() - Prepare to wait on a futex * @uaddr: the futex userspace address * @val: the expected value * @flags: futex flags (FLAGS_SHARED, etc.) * @q: the associated futex_q * @hb: storage for hash_bucket pointer to be returned to caller * * Setup the futex_q and locate the hash_bucket. Get the futex value and * compare it with the expected value. Handle atomic faults internally. * Return with the hb lock held on success, and unlocked on failure. * * Return: * - 0 - uaddr contains val and hb has been locked; * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) { u32 uval; int ret; /* * Access the page AFTER the hash-bucket is locked. * Order is important: * * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for * any cond. If we locked the hash-bucket after testing *uaddr, that * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * * On the other hand, we insert q and release the hash-bucket only * after testing *uaddr. This guarantees that futex_wait() will NOT * absorb a wakeup if *uaddr does not match the desired values * while the syscall executes. */ retry: ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ); if (unlikely(ret != 0)) return ret; retry_private: *hb = futex_q_lock(q); ret = futex_get_value_locked(&uval, uaddr); if (ret) { futex_q_unlock(*hb); ret = get_user(uval, uaddr); if (ret) return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } if (uval != val) { futex_q_unlock(*hb); ret = -EWOULDBLOCK; } return ret; } int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset) { struct futex_q q = futex_q_init; struct futex_hash_bucket *hb; int ret; if (!bitset) return -EINVAL; q.bitset = bitset; retry: /* * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) return ret; /* futex_queue and wait for wakeup, timeout, or a signal. */ futex_wait_queue(hb, &q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ if (!futex_unqueue(&q)) return 0; if (to && !to->task) return -ETIMEDOUT; /* * We expect signal_pending(current), but we might be the * victim of a spurious wakeup as well. */ if (!signal_pending(current)) goto retry; return -ERESTARTSYS; } int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) { struct hrtimer_sleeper timeout, *to; struct restart_block *restart; int ret; to = futex_setup_timer(abs_time, &timeout, flags, current->timer_slack_ns); ret = __futex_wait(uaddr, flags, val, to, bitset); /* No timeout, nothing to clean up. */ if (!to) return ret; hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); if (ret == -ERESTARTSYS) { restart = &current->restart_block; restart->futex.uaddr = uaddr; restart->futex.val = val; restart->futex.time = *abs_time; restart->futex.bitset = bitset; restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; return set_restart_fn(restart, futex_wait_restart); } return ret; } static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; ktime_t t, *tp = NULL; if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { t = restart->futex.time; tp = &t; } restart->fn = do_no_restart_syscall; return (long)futex_wait(uaddr, restart->futex.flags, restart->futex.val, tp, restart->futex.bitset); }
28 7 1 9 1 14 2 12 19 500 467 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 // SPDX-License-Identifier: GPL-2.0-only #include <linux/netlink.h> #include <linux/nospec.h> #include <linux/rtnetlink.h> #include <linux/types.h> #include <net/ip.h> #include <net/net_namespace.h> #include <net/tcp.h> static int ip_metrics_convert(struct nlattr *fc_mx, int fc_mx_len, u32 *metrics, struct netlink_ext_ack *extack) { bool ecn_ca = false; struct nlattr *nla; int remaining; nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) { int type = nla_type(nla); u32 val; if (!type) continue; if (type > RTAX_MAX) { NL_SET_ERR_MSG(extack, "Invalid metric type"); return -EINVAL; } type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) { NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm"); return -EINVAL; } } else { if (nla_len(nla) != sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid attribute in metrics"); return -EINVAL; } val = nla_get_u32(nla); } if (type == RTAX_ADVMSS && val > 65535 - 40) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) val = 65535 - 15; if (type == RTAX_HOPLIMIT && val > 255) val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) { NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute"); return -EINVAL; } metrics[type - 1] = val; } if (ecn_ca) metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; return 0; } struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len, struct netlink_ext_ack *extack) { struct dst_metrics *fib_metrics; int err; if (!fc_mx) return (struct dst_metrics *)&dst_default_metrics; fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL); if (unlikely(!fib_metrics)) return ERR_PTR(-ENOMEM); err = ip_metrics_convert(fc_mx, fc_mx_len, fib_metrics->metrics, extack); if (!err) { refcount_set(&fib_metrics->refcnt, 1); } else { kfree(fib_metrics); fib_metrics = ERR_PTR(err); } return fib_metrics; } EXPORT_SYMBOL_GPL(ip_fib_metrics_init);
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 // SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Definitions for the IPPROTO_SMC (socket related) * * Copyright IBM Corp. 2016, 2018 * Copyright (c) 2024, Alibaba Inc. * * Author: D. Wythe <alibuda@linux.alibaba.com> */ #include <net/protocol.h> #include <net/sock.h> #include "smc_inet.h" #include "smc.h" static int smc_inet_init_sock(struct sock *sk); static struct proto smc_inet_prot = { .name = "INET_SMC", .owner = THIS_MODULE, .init = smc_inet_init_sock, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, }; static const struct proto_ops smc_inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = smc_release, .bind = smc_bind, .connect = smc_connect, .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, .setsockopt = smc_setsockopt, .getsockopt = smc_getsockopt, .sendmsg = smc_sendmsg, .recvmsg = smc_recvmsg, .mmap = sock_no_mmap, .splice_read = smc_splice_read, }; static struct inet_protosw smc_inet_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SMC, .prot = &smc_inet_prot, .ops = &smc_inet_stream_ops, .flags = INET_PROTOSW_ICSK, }; #if IS_ENABLED(CONFIG_IPV6) struct smc6_sock { struct smc_sock smc; struct ipv6_pinfo inet6; }; static struct proto smc_inet6_prot = { .name = "INET6_SMC", .owner = THIS_MODULE, .init = smc_inet_init_sock, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc6_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, .ipv6_pinfo_offset = offsetof(struct smc6_sock, inet6), }; static const struct proto_ops smc_inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = smc_release, .bind = smc_bind, .connect = smc_connect, .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, .setsockopt = smc_setsockopt, .getsockopt = smc_getsockopt, .sendmsg = smc_sendmsg, .recvmsg = smc_recvmsg, .mmap = sock_no_mmap, .splice_read = smc_splice_read, }; static struct inet_protosw smc_inet6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SMC, .prot = &smc_inet6_prot, .ops = &smc_inet6_stream_ops, .flags = INET_PROTOSW_ICSK, }; #endif /* CONFIG_IPV6 */ static unsigned int smc_sync_mss(struct sock *sk, u32 pmtu) { /* No need pass it through to clcsock, mss can always be set by * sock_create_kern or smc_setsockopt. */ return 0; } static int smc_inet_init_sock(struct sock *sk) { struct net *net = sock_net(sk); /* init common smc sock */ smc_sk_init(net, sk, IPPROTO_SMC); inet_csk(sk)->icsk_sync_mss = smc_sync_mss; /* create clcsock */ return smc_create_clcsk(net, sk, sk->sk_family); } int __init smc_inet_init(void) { int rc; rc = proto_register(&smc_inet_prot, 1); if (rc) { pr_err("%s: proto_register smc_inet_prot fails with %d\n", __func__, rc); return rc; } /* no return value */ inet_register_protosw(&smc_inet_protosw); #if IS_ENABLED(CONFIG_IPV6) rc = proto_register(&smc_inet6_prot, 1); if (rc) { pr_err("%s: proto_register smc_inet6_prot fails with %d\n", __func__, rc); goto out_inet6_prot; } rc = inet6_register_protosw(&smc_inet6_protosw); if (rc) { pr_err("%s: inet6_register_protosw smc_inet6_protosw fails with %d\n", __func__, rc); goto out_inet6_protosw; } return rc; out_inet6_protosw: proto_unregister(&smc_inet6_prot); out_inet6_prot: inet_unregister_protosw(&smc_inet_protosw); proto_unregister(&smc_inet_prot); #endif /* CONFIG_IPV6 */ return rc; } void smc_inet_exit(void) { #if IS_ENABLED(CONFIG_IPV6) inet6_unregister_protosw(&smc_inet6_protosw); proto_unregister(&smc_inet6_prot); #endif /* CONFIG_IPV6 */ inet_unregister_protosw(&smc_inet_protosw); proto_unregister(&smc_inet_prot); }
298 300 291 294 13 13 13 13 9907 9935 9903 9912 431 432 432 48 28 388 27 28 162 162 234 234 48 48 48 664 63 622 622 622 4723 4724 1179 1179 1525 1529 52 1471 584 585 1211 1214 1213 1458 1456 1455 1323 428 996 53 1324 1324 1323 6264 6264 6256 6259 6259 5595 1014 2033 2038 2039 2040 2021 3060 3060 3309 914 911 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor LSM hooks. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #include <linux/lsm_hooks.h> #include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/ptrace.h> #include <linux/ctype.h> #include <linux/sysctl.h> #include <linux/audit.h> #include <linux/user_namespace.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/zstd.h> #include <net/sock.h> #include <uapi/linux/mount.h> #include <uapi/linux/lsm.h> #include "include/apparmor.h" #include "include/apparmorfs.h" #include "include/audit.h" #include "include/capability.h" #include "include/cred.h" #include "include/file.h" #include "include/ipc.h" #include "include/net.h" #include "include/path.h" #include "include/label.h" #include "include/policy.h" #include "include/policy_ns.h" #include "include/procattr.h" #include "include/mount.h" #include "include/secid.h" /* Flag indicating whether initialization completed */ int apparmor_initialized; union aa_buffer { struct list_head list; DECLARE_FLEX_ARRAY(char, buffer); }; struct aa_local_cache { unsigned int hold; unsigned int count; struct list_head head; }; #define RESERVE_COUNT 2 static int reserve_count = RESERVE_COUNT; static int buffer_count; static LIST_HEAD(aa_global_buffers); static DEFINE_SPINLOCK(aa_buffers_lock); static DEFINE_PER_CPU(struct aa_local_cache, aa_local_buffers); /* * LSM hook functions */ /* * put the associated labels */ static void apparmor_cred_free(struct cred *cred) { aa_put_label(cred_label(cred)); set_cred_label(cred, NULL); } /* * allocate the apparmor part of blank credentials */ static int apparmor_cred_alloc_blank(struct cred *cred, gfp_t gfp) { set_cred_label(cred, NULL); return 0; } /* * prepare new cred label for modification by prepare_cred block */ static int apparmor_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { set_cred_label(new, aa_get_newest_label(cred_label(old))); return 0; } /* * transfer the apparmor data to a blank set of creds */ static void apparmor_cred_transfer(struct cred *new, const struct cred *old) { set_cred_label(new, aa_get_newest_label(cred_label(old))); } static void apparmor_task_free(struct task_struct *task) { aa_free_task_ctx(task_ctx(task)); } static int apparmor_task_alloc(struct task_struct *task, unsigned long clone_flags) { struct aa_task_ctx *new = task_ctx(task); aa_dup_task_ctx(new, task_ctx(current)); return 0; } static int apparmor_ptrace_access_check(struct task_struct *child, unsigned int mode) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; cred = get_task_cred(child); tracee = cred_label(cred); /* ref count on cred */ tracer = __begin_current_label_crit_section(); error = aa_may_ptrace(current_cred(), tracer, cred, tracee, (mode & PTRACE_MODE_READ) ? AA_PTRACE_READ : AA_PTRACE_TRACE); __end_current_label_crit_section(tracer); put_cred(cred); return error; } static int apparmor_ptrace_traceme(struct task_struct *parent) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; tracee = __begin_current_label_crit_section(); cred = get_task_cred(parent); tracer = cred_label(cred); /* ref count on cred */ error = aa_may_ptrace(cred, tracer, current_cred(), tracee, AA_PTRACE_TRACE); put_cred(cred); __end_current_label_crit_section(tracee); return error; } /* Derived from security/commoncap.c:cap_capget */ static int apparmor_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { struct aa_label *label; const struct cred *cred; rcu_read_lock(); cred = __task_cred(target); label = aa_get_newest_cred_label(cred); /* * cap_capget is stacked ahead of this and will * initialize effective and permitted. */ if (!unconfined(label)) { struct aa_profile *profile; struct label_it i; label_for_each_confined(i, label, profile) { struct aa_ruleset *rules; if (COMPLAIN_MODE(profile)) continue; rules = list_first_entry(&profile->rules, typeof(*rules), list); *effective = cap_intersect(*effective, rules->caps.allow); *permitted = cap_intersect(*permitted, rules->caps.allow); } } rcu_read_unlock(); aa_put_label(label); return 0; } static int apparmor_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { struct aa_label *label; int error = 0; label = aa_get_newest_cred_label(cred); if (!unconfined(label)) error = aa_capable(cred, label, cap, opts); aa_put_label(label); return error; } /** * common_perm - basic common permission check wrapper fn for paths * @op: operation being checked * @path: path to check permission of (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm(const char *op, const struct path *path, u32 mask, struct path_cond *cond) { struct aa_label *label; int error = 0; label = __begin_current_label_crit_section(); if (!unconfined(label)) error = aa_path_perm(op, current_cred(), label, path, 0, mask, cond); __end_current_label_crit_section(label); return error; } /** * common_perm_cond - common permission wrapper around inode cond * @op: operation being checked * @path: location to check (NOT NULL) * @mask: requested permissions mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_cond(const char *op, const struct path *path, u32 mask) { vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_idmap(path->mnt), d_backing_inode(path->dentry)); struct path_cond cond = { vfsuid_into_kuid(vfsuid), d_backing_inode(path->dentry)->i_mode }; if (!path_mediated_fs(path->dentry)) return 0; return common_perm(op, path, mask, &cond); } /** * common_perm_dir_dentry - common permission wrapper when path is dir, dentry * @op: operation being checked * @dir: directory of the dentry (NOT NULL) * @dentry: dentry to check (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm_dir_dentry(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, struct path_cond *cond) { struct path path = { .mnt = dir->mnt, .dentry = dentry }; return common_perm(op, &path, mask, cond); } /** * common_perm_rm - common permission wrapper for operations doing rm * @op: operation being checked * @dir: directory that the dentry is in (NOT NULL) * @dentry: dentry being rm'd (NOT NULL) * @mask: requested permission mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_rm(const char *op, const struct path *dir, struct dentry *dentry, u32 mask) { struct inode *inode = d_backing_inode(dentry); struct path_cond cond = { }; vfsuid_t vfsuid; if (!inode || !path_mediated_fs(dentry)) return 0; vfsuid = i_uid_into_vfsuid(mnt_idmap(dir->mnt), inode); cond.uid = vfsuid_into_kuid(vfsuid); cond.mode = inode->i_mode; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } /** * common_perm_create - common permission wrapper for operations doing create * @op: operation being checked * @dir: directory that dentry will be created in (NOT NULL) * @dentry: dentry to create (NOT NULL) * @mask: request permission mask * @mode: created file mode * * Returns: %0 else error code if error or permission denied */ static int common_perm_create(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, umode_t mode) { struct path_cond cond = { current_fsuid(), mode }; if (!path_mediated_fs(dir->dentry)) return 0; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } static int apparmor_path_unlink(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_UNLINK, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) { return common_perm_create(OP_MKDIR, dir, dentry, AA_MAY_CREATE, S_IFDIR); } static int apparmor_path_rmdir(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_RMDIR, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { return common_perm_create(OP_MKNOD, dir, dentry, AA_MAY_CREATE, mode); } static int apparmor_path_truncate(const struct path *path) { return common_perm_cond(OP_TRUNC, path, MAY_WRITE | AA_MAY_SETATTR); } static int apparmor_file_truncate(struct file *file) { return apparmor_path_truncate(&file->f_path); } static int apparmor_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { return common_perm_create(OP_SYMLINK, dir, dentry, AA_MAY_CREATE, S_IFLNK); } static int apparmor_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) error = aa_path_link(current_cred(), label, old_dentry, new_dir, new_dentry); end_current_label_crit_section(label); return error; } static int apparmor_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, const unsigned int flags) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; if ((flags & RENAME_EXCHANGE) && !path_mediated_fs(new_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) { struct mnt_idmap *idmap = mnt_idmap(old_dir->mnt); vfsuid_t vfsuid; struct path old_path = { .mnt = old_dir->mnt, .dentry = old_dentry }; struct path new_path = { .mnt = new_dir->mnt, .dentry = new_dentry }; struct path_cond cond = { .mode = d_backing_inode(old_dentry)->i_mode }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond.uid = vfsuid_into_kuid(vfsuid); if (flags & RENAME_EXCHANGE) { struct path_cond cond_exchange = { .mode = d_backing_inode(new_dentry)->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond_exchange.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &new_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond_exchange); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &old_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond_exchange); } if (!error) error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &old_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &new_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond); } end_current_label_crit_section(label); return error; } static int apparmor_path_chmod(const struct path *path, umode_t mode) { return common_perm_cond(OP_CHMOD, path, AA_MAY_CHMOD); } static int apparmor_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { return common_perm_cond(OP_CHOWN, path, AA_MAY_CHOWN); } static int apparmor_inode_getattr(const struct path *path) { return common_perm_cond(OP_GETATTR, path, AA_MAY_GETATTR); } static int apparmor_file_open(struct file *file) { struct aa_file_ctx *fctx = file_ctx(file); struct aa_label *label; int error = 0; bool needput; if (!path_mediated_fs(file->f_path.dentry)) return 0; /* If in exec, permission is handled by bprm hooks. * Cache permissions granted by the previous exec check, with * implicit read and executable mmap which are required to * actually execute the image. * * Illogically, FMODE_EXEC is in f_flags, not f_mode. */ if (file->f_flags & __FMODE_EXEC) { fctx->allow = MAY_EXEC | MAY_READ | AA_EXEC_MMAP; return 0; } label = aa_get_newest_cred_label_condref(file->f_cred, &needput); if (!unconfined(label)) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); vfsuid_t vfsuid; struct path_cond cond = { .mode = inode->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, inode); cond.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_OPEN, file->f_cred, label, &file->f_path, 0, aa_map_file_to_perms(file), &cond); /* todo cache full allowed permissions set and state */ fctx->allow = aa_map_file_to_perms(file); } aa_put_label_condref(label, needput); return error; } static int apparmor_file_alloc_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); struct aa_label *label = begin_current_label_crit_section(); spin_lock_init(&ctx->lock); rcu_assign_pointer(ctx->label, aa_get_label(label)); end_current_label_crit_section(label); return 0; } static void apparmor_file_free_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); if (ctx) aa_put_label(rcu_access_pointer(ctx->label)); } static int common_file_perm(const char *op, struct file *file, u32 mask, bool in_atomic) { struct aa_label *label; int error = 0; /* don't reaudit files closed during inheritance */ if (file->f_path.dentry == aa_null.dentry) return -EACCES; label = __begin_current_label_crit_section(); error = aa_file_perm(op, current_cred(), label, file, mask, in_atomic); __end_current_label_crit_section(label); return error; } static int apparmor_file_receive(struct file *file) { return common_file_perm(OP_FRECEIVE, file, aa_map_file_to_perms(file), false); } static int apparmor_file_permission(struct file *file, int mask) { return common_file_perm(OP_FPERM, file, mask, false); } static int apparmor_file_lock(struct file *file, unsigned int cmd) { u32 mask = AA_MAY_LOCK; if (cmd == F_WRLCK) mask |= MAY_WRITE; return common_file_perm(OP_FLOCK, file, mask, false); } static int common_mmap(const char *op, struct file *file, unsigned long prot, unsigned long flags, bool in_atomic) { int mask = 0; if (!file || !file_ctx(file)) return 0; if (prot & PROT_READ) mask |= MAY_READ; /* * Private mappings don't require write perms since they don't * write back to the files */ if ((prot & PROT_WRITE) && !(flags & MAP_PRIVATE)) mask |= MAY_WRITE; if (prot & PROT_EXEC) mask |= AA_EXEC_MMAP; return common_file_perm(op, file, mask, in_atomic); } static int apparmor_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { return common_mmap(OP_FMMAP, file, prot, flags, GFP_ATOMIC); } static int apparmor_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { return common_mmap(OP_FMPROT, vma->vm_file, prot, !(vma->vm_flags & VM_SHARED) ? MAP_PRIVATE : 0, false); } #ifdef CONFIG_IO_URING static const char *audit_uring_mask(u32 mask) { if (mask & AA_MAY_CREATE_SQPOLL) return "sqpoll"; if (mask & AA_MAY_OVERRIDE_CRED) return "override_creds"; return ""; } static void audit_uring_cb(struct audit_buffer *ab, void *va) { struct apparmor_audit_data *ad = aad_of_va(va); if (ad->request & AA_URING_PERM_MASK) { audit_log_format(ab, " requested=\"%s\"", audit_uring_mask(ad->request)); if (ad->denied & AA_URING_PERM_MASK) { audit_log_format(ab, " denied=\"%s\"", audit_uring_mask(ad->denied)); } } if (ad->uring.target) { audit_log_format(ab, " tcontext="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->uring.target, FLAGS_NONE, GFP_ATOMIC); } } static int profile_uring(struct aa_profile *profile, u32 request, struct aa_label *new, int cap, struct apparmor_audit_data *ad) { unsigned int state; struct aa_ruleset *rules; int error = 0; AA_BUG(!profile); rules = list_first_entry(&profile->rules, typeof(*rules), list); state = RULE_MEDIATES(rules, AA_CLASS_IO_URING); if (state) { struct aa_perms perms = { }; if (new) { aa_label_match(profile, rules, new, state, false, request, &perms); } else { perms = *aa_lookup_perms(rules->policy, state); } aa_apply_modes_to_perms(profile, &perms); error = aa_check_perms(profile, &perms, request, ad, audit_uring_cb); } return error; } /** * apparmor_uring_override_creds - check the requested cred override * @new: the target creds * * Check to see if the current task is allowed to override it's credentials * to service an io_uring operation. */ static int apparmor_uring_override_creds(const struct cred *new) { struct aa_profile *profile; struct aa_label *label; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_OVERRIDE); ad.uring.target = cred_label(new); label = __begin_current_label_crit_section(); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_OVERRIDE_CRED, cred_label(new), CAP_SYS_ADMIN, &ad)); __end_current_label_crit_section(label); return error; } /** * apparmor_uring_sqpoll - check if a io_uring polling thread can be created * * Check to see if the current task is allowed to create a new io_uring * kernel polling thread. */ static int apparmor_uring_sqpoll(void) { struct aa_profile *profile; struct aa_label *label; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_SQPOLL); label = __begin_current_label_crit_section(); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_CREATE_SQPOLL, NULL, CAP_SYS_ADMIN, &ad)); __end_current_label_crit_section(label); return error; } #endif /* CONFIG_IO_URING */ static int apparmor_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { struct aa_label *label; int error = 0; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; flags &= ~AA_MS_IGNORE_MASK; label = __begin_current_label_crit_section(); if (!unconfined(label)) { if (flags & MS_REMOUNT) error = aa_remount(current_cred(), label, path, flags, data); else if (flags & MS_BIND) error = aa_bind_mount(current_cred(), label, path, dev_name, flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) error = aa_mount_change_type(current_cred(), label, path, flags); else if (flags & MS_MOVE) error = aa_move_mount_old(current_cred(), label, path, dev_name); else error = aa_new_mount(current_cred(), label, dev_name, path, type, flags, data); } __end_current_label_crit_section(label); return error; } static int apparmor_move_mount(const struct path *from_path, const struct path *to_path) { struct aa_label *label; int error = 0; label = __begin_current_label_crit_section(); if (!unconfined(label)) error = aa_move_mount(current_cred(), label, from_path, to_path); __end_current_label_crit_section(label); return error; } static int apparmor_sb_umount(struct vfsmount *mnt, int flags) { struct aa_label *label; int error = 0; label = __begin_current_label_crit_section(); if (!unconfined(label)) error = aa_umount(current_cred(), label, mnt, flags); __end_current_label_crit_section(label); return error; } static int apparmor_sb_pivotroot(const struct path *old_path, const struct path *new_path) { struct aa_label *label; int error = 0; label = aa_get_current_label(); if (!unconfined(label)) error = aa_pivotroot(current_cred(), label, old_path, new_path); aa_put_label(label); return error; } static int apparmor_getselfattr(unsigned int attr, struct lsm_ctx __user *lx, u32 *size, u32 flags) { int error = -ENOENT; struct aa_task_ctx *ctx = task_ctx(current); struct aa_label *label = NULL; char *value = NULL; switch (attr) { case LSM_ATTR_CURRENT: label = aa_get_newest_label(cred_label(current_cred())); break; case LSM_ATTR_PREV: if (ctx->previous) label = aa_get_newest_label(ctx->previous); break; case LSM_ATTR_EXEC: if (ctx->onexec) label = aa_get_newest_label(ctx->onexec); break; default: error = -EOPNOTSUPP; break; } if (label) { error = aa_getprocattr(label, &value, false); if (error > 0) error = lsm_fill_user_ctx(lx, size, value, error, LSM_ID_APPARMOR, 0); kfree(value); } aa_put_label(label); if (error < 0) return error; return 1; } static int apparmor_getprocattr(struct task_struct *task, const char *name, char **value) { int error = -ENOENT; /* released below */ const struct cred *cred = get_task_cred(task); struct aa_task_ctx *ctx = task_ctx(current); struct aa_label *label = NULL; if (strcmp(name, "current") == 0) label = aa_get_newest_label(cred_label(cred)); else if (strcmp(name, "prev") == 0 && ctx->previous) label = aa_get_newest_label(ctx->previous); else if (strcmp(name, "exec") == 0 && ctx->onexec) label = aa_get_newest_label(ctx->onexec); else error = -EINVAL; if (label) error = aa_getprocattr(label, value, true); aa_put_label(label); put_cred(cred); return error; } static int do_setattr(u64 attr, void *value, size_t size) { char *command, *largs = NULL, *args = value; size_t arg_size; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_NONE, OP_SETPROCATTR); if (size == 0) return -EINVAL; /* AppArmor requires that the buffer must be null terminated atm */ if (args[size - 1] != '\0') { /* null terminate */ largs = args = kmalloc(size + 1, GFP_KERNEL); if (!args) return -ENOMEM; memcpy(args, value, size); args[size] = '\0'; } error = -EINVAL; args = strim(args); command = strsep(&args, " "); if (!args) goto out; args = skip_spaces(args); if (!*args) goto out; arg_size = size - (args - (largs ? largs : (char *) value)); if (attr == LSM_ATTR_CURRENT) { if (strcmp(command, "changehat") == 0) { error = aa_setprocattr_changehat(args, arg_size, AA_CHANGE_NOFLAGS); } else if (strcmp(command, "permhat") == 0) { error = aa_setprocattr_changehat(args, arg_size, AA_CHANGE_TEST); } else if (strcmp(command, "changeprofile") == 0) { error = aa_change_profile(args, AA_CHANGE_NOFLAGS); } else if (strcmp(command, "permprofile") == 0) { error = aa_change_profile(args, AA_CHANGE_TEST); } else if (strcmp(command, "stack") == 0) { error = aa_change_profile(args, AA_CHANGE_STACK); } else goto fail; } else if (attr == LSM_ATTR_EXEC) { if (strcmp(command, "exec") == 0) error = aa_change_profile(args, AA_CHANGE_ONEXEC); else if (strcmp(command, "stack") == 0) error = aa_change_profile(args, (AA_CHANGE_ONEXEC | AA_CHANGE_STACK)); else goto fail; } else /* only support the "current" and "exec" process attributes */ goto fail; if (!error) error = size; out: kfree(largs); return error; fail: ad.subj_label = begin_current_label_crit_section(); if (attr == LSM_ATTR_CURRENT) ad.info = "current"; else if (attr == LSM_ATTR_EXEC) ad.info = "exec"; else ad.info = "invalid"; ad.error = error = -EINVAL; aa_audit_msg(AUDIT_APPARMOR_DENIED, &ad, NULL); end_current_label_crit_section(ad.subj_label); goto out; } static int apparmor_setselfattr(unsigned int attr, struct lsm_ctx *ctx, u32 size, u32 flags) { int rc; if (attr != LSM_ATTR_CURRENT && attr != LSM_ATTR_EXEC) return -EOPNOTSUPP; rc = do_setattr(attr, ctx->ctx, ctx->ctx_len); if (rc > 0) return 0; return rc; } static int apparmor_setprocattr(const char *name, void *value, size_t size) { int attr = lsm_name_to_attr(name); if (attr) return do_setattr(attr, value, size); return -EINVAL; } /** * apparmor_bprm_committing_creds - do task cleanup on committing new creds * @bprm: binprm for the exec (NOT NULL) */ static void apparmor_bprm_committing_creds(const struct linux_binprm *bprm) { struct aa_label *label = aa_current_raw_label(); struct aa_label *new_label = cred_label(bprm->cred); /* bail out if unconfined or not changing profile */ if ((new_label->proxy == label->proxy) || (unconfined(new_label))) return; aa_inherit_files(bprm->cred, current->files); current->pdeath_signal = 0; /* reset soft limits and set hard limits for the new label */ __aa_transition_rlimits(label, new_label); } /** * apparmor_bprm_committed_creds() - do cleanup after new creds committed * @bprm: binprm for the exec (NOT NULL) */ static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm) { /* clear out temporary/transitional state from the context */ aa_clear_task_ctx_trans(task_ctx(current)); return; } static void apparmor_current_getlsmprop_subj(struct lsm_prop *prop) { struct aa_label *label = __begin_current_label_crit_section(); prop->apparmor.label = label; __end_current_label_crit_section(label); } static void apparmor_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop) { struct aa_label *label = aa_get_task_label(p); prop->apparmor.label = label; aa_put_label(label); } static int apparmor_task_setrlimit(struct task_struct *task, unsigned int resource, struct rlimit *new_rlim) { struct aa_label *label = __begin_current_label_crit_section(); int error = 0; if (!unconfined(label)) error = aa_task_setrlimit(current_cred(), label, task, resource, new_rlim); __end_current_label_crit_section(label); return error; } static int apparmor_task_kill(struct task_struct *target, struct kernel_siginfo *info, int sig, const struct cred *cred) { const struct cred *tc; struct aa_label *cl, *tl; int error; tc = get_task_cred(target); tl = aa_get_newest_cred_label(tc); if (cred) { /* * Dealing with USB IO specific behavior */ cl = aa_get_newest_cred_label(cred); error = aa_may_signal(cred, cl, tc, tl, sig); aa_put_label(cl); } else { cl = __begin_current_label_crit_section(); error = aa_may_signal(current_cred(), cl, tc, tl, sig); __end_current_label_crit_section(cl); } aa_put_label(tl); put_cred(tc); return error; } static int apparmor_userns_create(const struct cred *cred) { struct aa_label *label; struct aa_profile *profile; int error = 0; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_TASK, AA_CLASS_NS, OP_USERNS_CREATE); ad.subj_cred = current_cred(); label = begin_current_label_crit_section(); if (!unconfined(label)) { error = fn_for_each(label, profile, aa_profile_ns_perm(profile, &ad, AA_USERNS_CREATE)); } end_current_label_crit_section(label); return error; } static void apparmor_sk_free_security(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); aa_put_label(ctx->label); aa_put_label(ctx->peer); } /** * apparmor_sk_clone_security - clone the sk_security field * @sk: sock to have security cloned * @newsk: sock getting clone */ static void apparmor_sk_clone_security(const struct sock *sk, struct sock *newsk) { struct aa_sk_ctx *ctx = aa_sock(sk); struct aa_sk_ctx *new = aa_sock(newsk); if (new->label) aa_put_label(new->label); new->label = aa_get_label(ctx->label); if (new->peer) aa_put_label(new->peer); new->peer = aa_get_label(ctx->peer); } static int apparmor_socket_create(int family, int type, int protocol, int kern) { struct aa_label *label; int error = 0; AA_BUG(in_interrupt()); label = begin_current_label_crit_section(); if (!(kern || unconfined(label))) error = af_select(family, create_perm(label, family, type, protocol), aa_af_perm(current_cred(), label, OP_CREATE, AA_MAY_CREATE, family, type, protocol)); end_current_label_crit_section(label); return error; } /** * apparmor_socket_post_create - setup the per-socket security struct * @sock: socket that is being setup * @family: family of socket being created * @type: type of the socket * @protocol: protocol of the socket * @kern: socket is a special kernel socket * * Note: * - kernel sockets labeled kernel_t used to use unconfined * - socket may not have sk here if created with sock_create_lite or * sock_alloc. These should be accept cases which will be handled in * sock_graft. */ static int apparmor_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { struct aa_label *label; if (kern) { label = aa_get_label(kernel_t); } else label = aa_get_current_label(); if (sock->sk) { struct aa_sk_ctx *ctx = aa_sock(sock->sk); aa_put_label(ctx->label); ctx->label = aa_get_label(label); } aa_put_label(label); return 0; } static int apparmor_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!address); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, bind_perm(sock, address, addrlen), aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk)); } static int apparmor_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!address); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, connect_perm(sock, address, addrlen), aa_sk_perm(OP_CONNECT, AA_MAY_CONNECT, sock->sk)); } static int apparmor_socket_listen(struct socket *sock, int backlog) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, listen_perm(sock, backlog), aa_sk_perm(OP_LISTEN, AA_MAY_LISTEN, sock->sk)); } /* * Note: while @newsock is created and has some information, the accept * has not been done. */ static int apparmor_socket_accept(struct socket *sock, struct socket *newsock) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!newsock); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, accept_perm(sock, newsock), aa_sk_perm(OP_ACCEPT, AA_MAY_ACCEPT, sock->sk)); } static int aa_sock_msg_perm(const char *op, u32 request, struct socket *sock, struct msghdr *msg, int size) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!msg); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, msg_perm(op, request, sock, msg, size), aa_sk_perm(op, request, sock->sk)); } static int apparmor_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return aa_sock_msg_perm(OP_SENDMSG, AA_MAY_SEND, sock, msg, size); } static int apparmor_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return aa_sock_msg_perm(OP_RECVMSG, AA_MAY_RECEIVE, sock, msg, size); } /* revaliation, get/set attr, shutdown */ static int aa_sock_perm(const char *op, u32 request, struct socket *sock) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, sock_perm(op, request, sock), aa_sk_perm(op, request, sock->sk)); } static int apparmor_socket_getsockname(struct socket *sock) { return aa_sock_perm(OP_GETSOCKNAME, AA_MAY_GETATTR, sock); } static int apparmor_socket_getpeername(struct socket *sock) { return aa_sock_perm(OP_GETPEERNAME, AA_MAY_GETATTR, sock); } /* revaliation, get/set attr, opt */ static int aa_sock_opt_perm(const char *op, u32 request, struct socket *sock, int level, int optname) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, opt_perm(op, request, sock, level, optname), aa_sk_perm(op, request, sock->sk)); } static int apparmor_socket_getsockopt(struct socket *sock, int level, int optname) { return aa_sock_opt_perm(OP_GETSOCKOPT, AA_MAY_GETOPT, sock, level, optname); } static int apparmor_socket_setsockopt(struct socket *sock, int level, int optname) { return aa_sock_opt_perm(OP_SETSOCKOPT, AA_MAY_SETOPT, sock, level, optname); } static int apparmor_socket_shutdown(struct socket *sock, int how) { return aa_sock_perm(OP_SHUTDOWN, AA_MAY_SHUTDOWN, sock); } #ifdef CONFIG_NETWORK_SECMARK /** * apparmor_socket_sock_rcv_skb - check perms before associating skb to sk * @sk: sk to associate @skb with * @skb: skb to check for perms * * Note: can not sleep may be called with locks held * * dont want protocol specific in __skb_recv_datagram() * to deny an incoming connection socket_sock_rcv_skb() */ static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct aa_sk_ctx *ctx = aa_sock(sk); if (!skb->secmark) return 0; /* * If reach here before socket_post_create hook is called, in which * case label is null, drop the packet. */ if (!ctx->label) return -EACCES; return apparmor_secmark_check(ctx->label, OP_RECVMSG, AA_MAY_RECEIVE, skb->secmark, sk); } #endif static struct aa_label *sk_peer_label(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); if (ctx->peer) return ctx->peer; return ERR_PTR(-ENOPROTOOPT); } /** * apparmor_socket_getpeersec_stream - get security context of peer * @sock: socket that we are trying to get the peer context of * @optval: output - buffer to copy peer name to * @optlen: output - size of copied name in @optval * @len: size of @optval buffer * Returns: 0 on success, -errno of failure * * Note: for tcp only valid if using ipsec or cipso on lan */ static int apparmor_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { char *name = NULL; int slen, error = 0; struct aa_label *label; struct aa_label *peer; label = begin_current_label_crit_section(); peer = sk_peer_label(sock->sk); if (IS_ERR(peer)) { error = PTR_ERR(peer); goto done; } slen = aa_label_asxprint(&name, labels_ns(label), peer, FLAG_SHOW_MODE | FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED, GFP_KERNEL); /* don't include terminating \0 in slen, it breaks some apps */ if (slen < 0) { error = -ENOMEM; goto done; } if (slen > len) { error = -ERANGE; goto done_len; } if (copy_to_sockptr(optval, name, slen)) error = -EFAULT; done_len: if (copy_to_sockptr(optlen, &slen, sizeof(slen))) error = -EFAULT; done: end_current_label_crit_section(label); kfree(name); return error; } /** * apparmor_socket_getpeersec_dgram - get security label of packet * @sock: the peer socket * @skb: packet data * @secid: pointer to where to put the secid of the packet * * Sets the netlabel socket state on sk from parent */ static int apparmor_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { /* TODO: requires secid support */ return -ENOPROTOOPT; } /** * apparmor_sock_graft - Initialize newly created socket * @sk: child sock * @parent: parent socket * * Note: could set off of SOCK_CTX(parent) but need to track inode and we can * just set sk security information off of current creating process label * Labeling of sk for accept case - probably should be sock based * instead of task, because of the case where an implicitly labeled * socket is shared by different tasks. */ static void apparmor_sock_graft(struct sock *sk, struct socket *parent) { struct aa_sk_ctx *ctx = aa_sock(sk); if (!ctx->label) ctx->label = aa_get_current_label(); } #ifdef CONFIG_NETWORK_SECMARK static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct aa_sk_ctx *ctx = aa_sock(sk); if (!skb->secmark) return 0; return apparmor_secmark_check(ctx->label, OP_CONNECT, AA_MAY_CONNECT, skb->secmark, sk); } #endif /* * The cred blob is a pointer to, not an instance of, an aa_label. */ struct lsm_blob_sizes apparmor_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct aa_label *), .lbs_file = sizeof(struct aa_file_ctx), .lbs_task = sizeof(struct aa_task_ctx), .lbs_sock = sizeof(struct aa_sk_ctx), }; static const struct lsm_id apparmor_lsmid = { .name = "apparmor", .id = LSM_ID_APPARMOR, }; static struct security_hook_list apparmor_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), LSM_HOOK_INIT(capget, apparmor_capget), LSM_HOOK_INIT(capable, apparmor_capable), LSM_HOOK_INIT(move_mount, apparmor_move_mount), LSM_HOOK_INIT(sb_mount, apparmor_sb_mount), LSM_HOOK_INIT(sb_umount, apparmor_sb_umount), LSM_HOOK_INIT(sb_pivotroot, apparmor_sb_pivotroot), LSM_HOOK_INIT(path_link, apparmor_path_link), LSM_HOOK_INIT(path_unlink, apparmor_path_unlink), LSM_HOOK_INIT(path_symlink, apparmor_path_symlink), LSM_HOOK_INIT(path_mkdir, apparmor_path_mkdir), LSM_HOOK_INIT(path_rmdir, apparmor_path_rmdir), LSM_HOOK_INIT(path_mknod, apparmor_path_mknod), LSM_HOOK_INIT(path_rename, apparmor_path_rename), LSM_HOOK_INIT(path_chmod, apparmor_path_chmod), LSM_HOOK_INIT(path_chown, apparmor_path_chown), LSM_HOOK_INIT(path_truncate, apparmor_path_truncate), LSM_HOOK_INIT(inode_getattr, apparmor_inode_getattr), LSM_HOOK_INIT(file_open, apparmor_file_open), LSM_HOOK_INIT(file_receive, apparmor_file_receive), LSM_HOOK_INIT(file_permission, apparmor_file_permission), LSM_HOOK_INIT(file_alloc_security, apparmor_file_alloc_security), LSM_HOOK_INIT(file_free_security, apparmor_file_free_security), LSM_HOOK_INIT(mmap_file, apparmor_mmap_file), LSM_HOOK_INIT(file_mprotect, apparmor_file_mprotect), LSM_HOOK_INIT(file_lock, apparmor_file_lock), LSM_HOOK_INIT(file_truncate, apparmor_file_truncate), LSM_HOOK_INIT(getselfattr, apparmor_getselfattr), LSM_HOOK_INIT(setselfattr, apparmor_setselfattr), LSM_HOOK_INIT(getprocattr, apparmor_getprocattr), LSM_HOOK_INIT(setprocattr, apparmor_setprocattr), LSM_HOOK_INIT(sk_free_security, apparmor_sk_free_security), LSM_HOOK_INIT(sk_clone_security, apparmor_sk_clone_security), LSM_HOOK_INIT(socket_create, apparmor_socket_create), LSM_HOOK_INIT(socket_post_create, apparmor_socket_post_create), LSM_HOOK_INIT(socket_bind, apparmor_socket_bind), LSM_HOOK_INIT(socket_connect, apparmor_socket_connect), LSM_HOOK_INIT(socket_listen, apparmor_socket_listen), LSM_HOOK_INIT(socket_accept, apparmor_socket_accept), LSM_HOOK_INIT(socket_sendmsg, apparmor_socket_sendmsg), LSM_HOOK_INIT(socket_recvmsg, apparmor_socket_recvmsg), LSM_HOOK_INIT(socket_getsockname, apparmor_socket_getsockname), LSM_HOOK_INIT(socket_getpeername, apparmor_socket_getpeername), LSM_HOOK_INIT(socket_getsockopt, apparmor_socket_getsockopt), LSM_HOOK_INIT(socket_setsockopt, apparmor_socket_setsockopt), LSM_HOOK_INIT(socket_shutdown, apparmor_socket_shutdown), #ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(socket_sock_rcv_skb, apparmor_socket_sock_rcv_skb), #endif LSM_HOOK_INIT(socket_getpeersec_stream, apparmor_socket_getpeersec_stream), LSM_HOOK_INIT(socket_getpeersec_dgram, apparmor_socket_getpeersec_dgram), LSM_HOOK_INIT(sock_graft, apparmor_sock_graft), #ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(inet_conn_request, apparmor_inet_conn_request), #endif LSM_HOOK_INIT(cred_alloc_blank, apparmor_cred_alloc_blank), LSM_HOOK_INIT(cred_free, apparmor_cred_free), LSM_HOOK_INIT(cred_prepare, apparmor_cred_prepare), LSM_HOOK_INIT(cred_transfer, apparmor_cred_transfer), LSM_HOOK_INIT(bprm_creds_for_exec, apparmor_bprm_creds_for_exec), LSM_HOOK_INIT(bprm_committing_creds, apparmor_bprm_committing_creds), LSM_HOOK_INIT(bprm_committed_creds, apparmor_bprm_committed_creds), LSM_HOOK_INIT(task_free, apparmor_task_free), LSM_HOOK_INIT(task_alloc, apparmor_task_alloc), LSM_HOOK_INIT(current_getlsmprop_subj, apparmor_current_getlsmprop_subj), LSM_HOOK_INIT(task_getlsmprop_obj, apparmor_task_getlsmprop_obj), LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit), LSM_HOOK_INIT(task_kill, apparmor_task_kill), LSM_HOOK_INIT(userns_create, apparmor_userns_create), #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_init, aa_audit_rule_init), LSM_HOOK_INIT(audit_rule_known, aa_audit_rule_known), LSM_HOOK_INIT(audit_rule_match, aa_audit_rule_match), LSM_HOOK_INIT(audit_rule_free, aa_audit_rule_free), #endif LSM_HOOK_INIT(secid_to_secctx, apparmor_secid_to_secctx), LSM_HOOK_INIT(lsmprop_to_secctx, apparmor_lsmprop_to_secctx), LSM_HOOK_INIT(secctx_to_secid, apparmor_secctx_to_secid), LSM_HOOK_INIT(release_secctx, apparmor_release_secctx), #ifdef CONFIG_IO_URING LSM_HOOK_INIT(uring_override_creds, apparmor_uring_override_creds), LSM_HOOK_INIT(uring_sqpoll, apparmor_uring_sqpoll), #endif }; /* * AppArmor sysfs module parameters */ static int param_set_aabool(const char *val, const struct kernel_param *kp); static int param_get_aabool(char *buffer, const struct kernel_param *kp); #define param_check_aabool param_check_bool static const struct kernel_param_ops param_ops_aabool = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aabool, .get = param_get_aabool }; static int param_set_aauint(const char *val, const struct kernel_param *kp); static int param_get_aauint(char *buffer, const struct kernel_param *kp); #define param_check_aauint param_check_uint static const struct kernel_param_ops param_ops_aauint = { .set = param_set_aauint, .get = param_get_aauint }; static int param_set_aacompressionlevel(const char *val, const struct kernel_param *kp); static int param_get_aacompressionlevel(char *buffer, const struct kernel_param *kp); #define param_check_aacompressionlevel param_check_int static const struct kernel_param_ops param_ops_aacompressionlevel = { .set = param_set_aacompressionlevel, .get = param_get_aacompressionlevel }; static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp); static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp); #define param_check_aalockpolicy param_check_bool static const struct kernel_param_ops param_ops_aalockpolicy = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aalockpolicy, .get = param_get_aalockpolicy }; static int param_set_audit(const char *val, const struct kernel_param *kp); static int param_get_audit(char *buffer, const struct kernel_param *kp); static int param_set_mode(const char *val, const struct kernel_param *kp); static int param_get_mode(char *buffer, const struct kernel_param *kp); /* Flag values, also controllable via /sys/module/apparmor/parameters * We define special types as we want to do additional mediation. */ /* AppArmor global enforcement switch - complain, enforce, kill */ enum profile_mode aa_g_profile_mode = APPARMOR_ENFORCE; module_param_call(mode, param_set_mode, param_get_mode, &aa_g_profile_mode, S_IRUSR | S_IWUSR); /* whether policy verification hashing is enabled */ bool aa_g_hash_policy = IS_ENABLED(CONFIG_SECURITY_APPARMOR_HASH_DEFAULT); #ifdef CONFIG_SECURITY_APPARMOR_HASH module_param_named(hash_policy, aa_g_hash_policy, aabool, S_IRUSR | S_IWUSR); #endif /* whether policy exactly as loaded is retained for debug and checkpointing */ bool aa_g_export_binary = IS_ENABLED(CONFIG_SECURITY_APPARMOR_EXPORT_BINARY); #ifdef CONFIG_SECURITY_APPARMOR_EXPORT_BINARY module_param_named(export_binary, aa_g_export_binary, aabool, 0600); #endif /* policy loaddata compression level */ int aa_g_rawdata_compression_level = AA_DEFAULT_CLEVEL; module_param_named(rawdata_compression_level, aa_g_rawdata_compression_level, aacompressionlevel, 0400); /* Debug mode */ bool aa_g_debug = IS_ENABLED(CONFIG_SECURITY_APPARMOR_DEBUG_MESSAGES); module_param_named(debug, aa_g_debug, aabool, S_IRUSR | S_IWUSR); /* Audit mode */ enum audit_mode aa_g_audit; module_param_call(audit, param_set_audit, param_get_audit, &aa_g_audit, S_IRUSR | S_IWUSR); /* Determines if audit header is included in audited messages. This * provides more context if the audit daemon is not running */ bool aa_g_audit_header = true; module_param_named(audit_header, aa_g_audit_header, aabool, S_IRUSR | S_IWUSR); /* lock out loading/removal of policy * TODO: add in at boot loading of policy, which is the only way to * load policy, if lock_policy is set */ bool aa_g_lock_policy; module_param_named(lock_policy, aa_g_lock_policy, aalockpolicy, S_IRUSR | S_IWUSR); /* Syscall logging mode */ bool aa_g_logsyscall; module_param_named(logsyscall, aa_g_logsyscall, aabool, S_IRUSR | S_IWUSR); /* Maximum pathname length before accesses will start getting rejected */ unsigned int aa_g_path_max = 2 * PATH_MAX; module_param_named(path_max, aa_g_path_max, aauint, S_IRUSR); /* Determines how paranoid loading of policy is and how much verification * on the loaded policy is done. * DEPRECATED: read only as strict checking of load is always done now * that none root users (user namespaces) can load policy. */ bool aa_g_paranoid_load = IS_ENABLED(CONFIG_SECURITY_APPARMOR_PARANOID_LOAD); module_param_named(paranoid_load, aa_g_paranoid_load, aabool, S_IRUGO); static int param_get_aaintbool(char *buffer, const struct kernel_param *kp); static int param_set_aaintbool(const char *val, const struct kernel_param *kp); #define param_check_aaintbool param_check_int static const struct kernel_param_ops param_ops_aaintbool = { .set = param_set_aaintbool, .get = param_get_aaintbool }; /* Boot time disable flag */ static int apparmor_enabled __ro_after_init = 1; module_param_named(enabled, apparmor_enabled, aaintbool, 0444); static int __init apparmor_enabled_setup(char *str) { unsigned long enabled; int error = kstrtoul(str, 0, &enabled); if (!error) apparmor_enabled = enabled ? 1 : 0; return 1; } __setup("apparmor=", apparmor_enabled_setup); /* set global flag turning off the ability to load policy */ static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aabool(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aabool(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aauint(const char *val, const struct kernel_param *kp) { int error; if (!apparmor_enabled) return -EINVAL; /* file is ro but enforce 2nd line check */ if (apparmor_initialized) return -EPERM; error = param_set_uint(val, kp); aa_g_path_max = max_t(uint32_t, aa_g_path_max, sizeof(union aa_buffer)); pr_info("AppArmor: buffer size set to %d bytes\n", aa_g_path_max); return error; } static int param_get_aauint(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_uint(buffer, kp); } /* Can only be set before AppArmor is initialized (i.e. on boot cmdline). */ static int param_set_aaintbool(const char *val, const struct kernel_param *kp) { struct kernel_param kp_local; bool value; int error; if (apparmor_initialized) return -EPERM; /* Create local copy, with arg pointing to bool type. */ value = !!*((int *)kp->arg); memcpy(&kp_local, kp, sizeof(kp_local)); kp_local.arg = &value; error = param_set_bool(val, &kp_local); if (!error) *((int *)kp->arg) = *((bool *)kp_local.arg); return error; } /* * To avoid changing /sys/module/apparmor/parameters/enabled from Y/N to * 1/0, this converts the "int that is actually bool" back to bool for * display in the /sys filesystem, while keeping it "int" for the LSM * infrastructure. */ static int param_get_aaintbool(char *buffer, const struct kernel_param *kp) { struct kernel_param kp_local; bool value; /* Create local copy, with arg pointing to bool type. */ value = !!*((int *)kp->arg); memcpy(&kp_local, kp, sizeof(kp_local)); kp_local.arg = &value; return param_get_bool(buffer, &kp_local); } static int param_set_aacompressionlevel(const char *val, const struct kernel_param *kp) { int error; if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized) return -EPERM; error = param_set_int(val, kp); aa_g_rawdata_compression_level = clamp(aa_g_rawdata_compression_level, AA_MIN_CLEVEL, AA_MAX_CLEVEL); pr_info("AppArmor: policy rawdata compression level set to %d\n", aa_g_rawdata_compression_level); return error; } static int param_get_aacompressionlevel(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_int(buffer, kp); } static int param_get_audit(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return sprintf(buffer, "%s", audit_mode_names[aa_g_audit]); } static int param_set_audit(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(audit_mode_names, AUDIT_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_audit = i; return 0; } static int param_get_mode(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return sprintf(buffer, "%s", aa_profile_mode_names[aa_g_profile_mode]); } static int param_set_mode(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(aa_profile_mode_names, APPARMOR_MODE_NAMES_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_profile_mode = i; return 0; } char *aa_get_buffer(bool in_atomic) { union aa_buffer *aa_buf; struct aa_local_cache *cache; bool try_again = true; gfp_t flags = (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); /* use per cpu cached buffers first */ cache = get_cpu_ptr(&aa_local_buffers); if (!list_empty(&cache->head)) { aa_buf = list_first_entry(&cache->head, union aa_buffer, list); list_del(&aa_buf->list); cache->hold--; cache->count--; put_cpu_ptr(&aa_local_buffers); return &aa_buf->buffer[0]; } put_cpu_ptr(&aa_local_buffers); if (!spin_trylock(&aa_buffers_lock)) { cache = get_cpu_ptr(&aa_local_buffers); cache->hold += 1; put_cpu_ptr(&aa_local_buffers); spin_lock(&aa_buffers_lock); } else { cache = get_cpu_ptr(&aa_local_buffers); put_cpu_ptr(&aa_local_buffers); } retry: if (buffer_count > reserve_count || (in_atomic && !list_empty(&aa_global_buffers))) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); buffer_count--; spin_unlock(&aa_buffers_lock); return aa_buf->buffer; } if (in_atomic) { /* * out of reserve buffers and in atomic context so increase * how many buffers to keep in reserve */ reserve_count++; flags = GFP_ATOMIC; } spin_unlock(&aa_buffers_lock); if (!in_atomic) might_sleep(); aa_buf = kmalloc(aa_g_path_max, flags); if (!aa_buf) { if (try_again) { try_again = false; spin_lock(&aa_buffers_lock); goto retry; } pr_warn_once("AppArmor: Failed to allocate a memory buffer.\n"); return NULL; } return aa_buf->buffer; } void aa_put_buffer(char *buf) { union aa_buffer *aa_buf; struct aa_local_cache *cache; if (!buf) return; aa_buf = container_of(buf, union aa_buffer, buffer[0]); cache = get_cpu_ptr(&aa_local_buffers); if (!cache->hold) { put_cpu_ptr(&aa_local_buffers); if (spin_trylock(&aa_buffers_lock)) { /* put back on global list */ list_add(&aa_buf->list, &aa_global_buffers); buffer_count++; spin_unlock(&aa_buffers_lock); cache = get_cpu_ptr(&aa_local_buffers); put_cpu_ptr(&aa_local_buffers); return; } /* contention on global list, fallback to percpu */ cache = get_cpu_ptr(&aa_local_buffers); cache->hold += 1; } /* cache in percpu list */ list_add(&aa_buf->list, &cache->head); cache->count++; put_cpu_ptr(&aa_local_buffers); } /* * AppArmor init functions */ /** * set_init_ctx - set a task context and profile on the first task. * * TODO: allow setting an alternate profile than unconfined */ static int __init set_init_ctx(void) { struct cred *cred = (__force struct cred *)current->real_cred; set_cred_label(cred, aa_get_label(ns_unconfined(root_ns))); return 0; } static void destroy_buffers(void) { union aa_buffer *aa_buf; spin_lock(&aa_buffers_lock); while (!list_empty(&aa_global_buffers)) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); spin_unlock(&aa_buffers_lock); kfree(aa_buf); spin_lock(&aa_buffers_lock); } spin_unlock(&aa_buffers_lock); } static int __init alloc_buffers(void) { union aa_buffer *aa_buf; int i, num; /* * per cpu set of cached allocated buffers used to help reduce * lock contention */ for_each_possible_cpu(i) { per_cpu(aa_local_buffers, i).hold = 0; per_cpu(aa_local_buffers, i).count = 0; INIT_LIST_HEAD(&per_cpu(aa_local_buffers, i).head); } /* * A function may require two buffers at once. Usually the buffers are * used for a short period of time and are shared. On UP kernel buffers * two should be enough, with more CPUs it is possible that more * buffers will be used simultaneously. The preallocated pool may grow. * This preallocation has also the side-effect that AppArmor will be * disabled early at boot if aa_g_path_max is extremly high. */ if (num_online_cpus() > 1) num = 4 + RESERVE_COUNT; else num = 2 + RESERVE_COUNT; for (i = 0; i < num; i++) { aa_buf = kmalloc(aa_g_path_max, GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); if (!aa_buf) { destroy_buffers(); return -ENOMEM; } aa_put_buffer(aa_buf->buffer); } return 0; } #ifdef CONFIG_SYSCTL static int apparmor_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!aa_current_policy_admin_capable(NULL)) return -EPERM; if (!apparmor_enabled) return -EINVAL; return proc_dointvec(table, write, buffer, lenp, ppos); } static struct ctl_table apparmor_sysctl_table[] = { #ifdef CONFIG_USER_NS { .procname = "unprivileged_userns_apparmor_policy", .data = &unprivileged_userns_apparmor_policy, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, #endif /* CONFIG_USER_NS */ { .procname = "apparmor_display_secid_mode", .data = &apparmor_display_secid_mode, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, { .procname = "apparmor_restrict_unprivileged_unconfined", .data = &aa_unprivileged_unconfined_restricted, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, }; static int __init apparmor_init_sysctl(void) { return register_sysctl("kernel", apparmor_sysctl_table) ? 0 : -ENOMEM; } #else static inline int apparmor_init_sysctl(void) { return 0; } #endif /* CONFIG_SYSCTL */ #if defined(CONFIG_NETFILTER) && defined(CONFIG_NETWORK_SECMARK) static unsigned int apparmor_ip_postroute(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct aa_sk_ctx *ctx; struct sock *sk; if (!skb->secmark) return NF_ACCEPT; sk = skb_to_full_sk(skb); if (sk == NULL) return NF_ACCEPT; ctx = aa_sock(sk); if (!apparmor_secmark_check(ctx->label, OP_SENDMSG, AA_MAY_SEND, skb->secmark, sk)) return NF_ACCEPT; return NF_DROP_ERR(-ECONNREFUSED); } static const struct nf_hook_ops apparmor_nf_ops[] = { { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_SELINUX_FIRST, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_SELINUX_FIRST, }, #endif }; static int __net_init apparmor_nf_register(struct net *net) { return nf_register_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static void __net_exit apparmor_nf_unregister(struct net *net) { nf_unregister_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static struct pernet_operations apparmor_net_ops = { .init = apparmor_nf_register, .exit = apparmor_nf_unregister, }; static int __init apparmor_nf_ip_init(void) { int err; if (!apparmor_enabled) return 0; err = register_pernet_subsys(&apparmor_net_ops); if (err) panic("Apparmor: register_pernet_subsys: error %d\n", err); return 0; } __initcall(apparmor_nf_ip_init); #endif static char nulldfa_src[] = { #include "nulldfa.in" }; static struct aa_dfa *nulldfa; static char stacksplitdfa_src[] = { #include "stacksplitdfa.in" }; struct aa_dfa *stacksplitdfa; struct aa_policydb *nullpdb; static int __init aa_setup_dfa_engine(void) { int error = -ENOMEM; nullpdb = aa_alloc_pdb(GFP_KERNEL); if (!nullpdb) return -ENOMEM; nulldfa = aa_dfa_unpack(nulldfa_src, sizeof(nulldfa_src), TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(nulldfa)) { error = PTR_ERR(nulldfa); goto fail; } nullpdb->dfa = aa_get_dfa(nulldfa); nullpdb->perms = kcalloc(2, sizeof(struct aa_perms), GFP_KERNEL); if (!nullpdb->perms) goto fail; nullpdb->size = 2; stacksplitdfa = aa_dfa_unpack(stacksplitdfa_src, sizeof(stacksplitdfa_src), TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(stacksplitdfa)) { error = PTR_ERR(stacksplitdfa); goto fail; } return 0; fail: aa_put_pdb(nullpdb); aa_put_dfa(nulldfa); nullpdb = NULL; nulldfa = NULL; stacksplitdfa = NULL; return error; } static void __init aa_teardown_dfa_engine(void) { aa_put_dfa(stacksplitdfa); aa_put_dfa(nulldfa); aa_put_pdb(nullpdb); nullpdb = NULL; stacksplitdfa = NULL; nulldfa = NULL; } static int __init apparmor_init(void) { int error; error = aa_setup_dfa_engine(); if (error) { AA_ERROR("Unable to setup dfa engine\n"); goto alloc_out; } error = aa_alloc_root_ns(); if (error) { AA_ERROR("Unable to allocate default profile namespace\n"); goto alloc_out; } error = apparmor_init_sysctl(); if (error) { AA_ERROR("Unable to register sysctls\n"); goto alloc_out; } error = alloc_buffers(); if (error) { AA_ERROR("Unable to allocate work buffers\n"); goto alloc_out; } error = set_init_ctx(); if (error) { AA_ERROR("Failed to set context on init task\n"); aa_free_root_ns(); goto buffers_out; } security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); /* Report that AppArmor successfully initialized */ apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) aa_info_message("AppArmor initialized: complain mode enabled"); else if (aa_g_profile_mode == APPARMOR_KILL) aa_info_message("AppArmor initialized: kill mode enabled"); else aa_info_message("AppArmor initialized"); return error; buffers_out: destroy_buffers(); alloc_out: aa_destroy_aafs(); aa_teardown_dfa_engine(); apparmor_enabled = false; return error; } DEFINE_LSM(apparmor) = { .name = "apparmor", .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, .enabled = &apparmor_enabled, .blobs = &apparmor_blob_sizes, .init = apparmor_init, };
142 142 142 1042 142 1042 1042 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 // SPDX-License-Identifier: GPL-2.0 #include <linux/debugfs.h> #include "netdevsim.h" #define NSIM_DEV_HWSTATS_TRAFFIC_MS 100 static struct list_head * nsim_dev_hwstats_get_list_head(struct nsim_dev_hwstats *hwstats, enum netdev_offload_xstats_type type) { switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return &hwstats->l3_list; } WARN_ON_ONCE(1); return NULL; } static void nsim_dev_hwstats_traffic_bump(struct nsim_dev_hwstats *hwstats, enum netdev_offload_xstats_type type) { struct nsim_dev_hwstats_netdev *hwsdev; struct list_head *hwsdev_list; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type); if (WARN_ON(!hwsdev_list)) return; list_for_each_entry(hwsdev, hwsdev_list, list) { if (hwsdev->enabled) { hwsdev->stats.rx_packets += 1; hwsdev->stats.tx_packets += 2; hwsdev->stats.rx_bytes += 100; hwsdev->stats.tx_bytes += 300; } } } static void nsim_dev_hwstats_traffic_work(struct work_struct *work) { struct nsim_dev_hwstats *hwstats; hwstats = container_of(work, struct nsim_dev_hwstats, traffic_dw.work); mutex_lock(&hwstats->hwsdev_list_lock); nsim_dev_hwstats_traffic_bump(hwstats, NETDEV_OFFLOAD_XSTATS_TYPE_L3); mutex_unlock(&hwstats->hwsdev_list_lock); schedule_delayed_work(&hwstats->traffic_dw, msecs_to_jiffies(NSIM_DEV_HWSTATS_TRAFFIC_MS)); } static struct nsim_dev_hwstats_netdev * nsim_dev_hwslist_find_hwsdev(struct list_head *hwsdev_list, int ifindex) { struct nsim_dev_hwstats_netdev *hwsdev; list_for_each_entry(hwsdev, hwsdev_list, list) { if (hwsdev->netdev->ifindex == ifindex) return hwsdev; } return NULL; } static int nsim_dev_hwsdev_enable(struct nsim_dev_hwstats_netdev *hwsdev, struct netlink_ext_ack *extack) { if (hwsdev->fail_enable) { hwsdev->fail_enable = false; NL_SET_ERR_MSG_MOD(extack, "Stats enablement set to fail"); return -ECANCELED; } hwsdev->enabled = true; return 0; } static void nsim_dev_hwsdev_disable(struct nsim_dev_hwstats_netdev *hwsdev) { hwsdev->enabled = false; memset(&hwsdev->stats, 0, sizeof(hwsdev->stats)); } static int nsim_dev_hwsdev_report_delta(struct nsim_dev_hwstats_netdev *hwsdev, struct netdev_notifier_offload_xstats_info *info) { netdev_offload_xstats_report_delta(info->report_delta, &hwsdev->stats); memset(&hwsdev->stats, 0, sizeof(hwsdev->stats)); return 0; } static void nsim_dev_hwsdev_report_used(struct nsim_dev_hwstats_netdev *hwsdev, struct netdev_notifier_offload_xstats_info *info) { if (hwsdev->enabled) netdev_offload_xstats_report_used(info->report_used); } static int nsim_dev_hwstats_event_off_xstats(struct nsim_dev_hwstats *hwstats, struct net_device *dev, unsigned long event, void *ptr) { struct netdev_notifier_offload_xstats_info *info; struct nsim_dev_hwstats_netdev *hwsdev; struct list_head *hwsdev_list; int err = 0; info = ptr; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, info->type); if (!hwsdev_list) return 0; mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, dev->ifindex); if (!hwsdev) goto out; switch (event) { case NETDEV_OFFLOAD_XSTATS_ENABLE: err = nsim_dev_hwsdev_enable(hwsdev, info->info.extack); break; case NETDEV_OFFLOAD_XSTATS_DISABLE: nsim_dev_hwsdev_disable(hwsdev); break; case NETDEV_OFFLOAD_XSTATS_REPORT_USED: nsim_dev_hwsdev_report_used(hwsdev, info); break; case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: err = nsim_dev_hwsdev_report_delta(hwsdev, info); break; } out: mutex_unlock(&hwstats->hwsdev_list_lock); return err; } static void nsim_dev_hwsdev_fini(struct nsim_dev_hwstats_netdev *hwsdev) { dev_put(hwsdev->netdev); kfree(hwsdev); } static void __nsim_dev_hwstats_event_unregister(struct nsim_dev_hwstats *hwstats, struct net_device *dev, enum netdev_offload_xstats_type type) { struct nsim_dev_hwstats_netdev *hwsdev; struct list_head *hwsdev_list; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type); if (WARN_ON(!hwsdev_list)) return; hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, dev->ifindex); if (!hwsdev) return; list_del(&hwsdev->list); nsim_dev_hwsdev_fini(hwsdev); } static void nsim_dev_hwstats_event_unregister(struct nsim_dev_hwstats *hwstats, struct net_device *dev) { mutex_lock(&hwstats->hwsdev_list_lock); __nsim_dev_hwstats_event_unregister(hwstats, dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); mutex_unlock(&hwstats->hwsdev_list_lock); } static int nsim_dev_hwstats_event(struct nsim_dev_hwstats *hwstats, struct net_device *dev, unsigned long event, void *ptr) { switch (event) { case NETDEV_OFFLOAD_XSTATS_ENABLE: case NETDEV_OFFLOAD_XSTATS_DISABLE: case NETDEV_OFFLOAD_XSTATS_REPORT_USED: case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: return nsim_dev_hwstats_event_off_xstats(hwstats, dev, event, ptr); case NETDEV_UNREGISTER: nsim_dev_hwstats_event_unregister(hwstats, dev); break; } return 0; } static int nsim_dev_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nsim_dev_hwstats *hwstats; int err = 0; hwstats = container_of(nb, struct nsim_dev_hwstats, netdevice_nb); err = nsim_dev_hwstats_event(hwstats, dev, event, ptr); if (err) return notifier_from_errno(err); return NOTIFY_OK; } static int nsim_dev_hwstats_enable_ifindex(struct nsim_dev_hwstats *hwstats, int ifindex, enum netdev_offload_xstats_type type, struct list_head *hwsdev_list) { struct nsim_dev_hwstats_netdev *hwsdev; struct nsim_dev *nsim_dev; struct net_device *netdev; bool notify = false; struct net *net; int err = 0; nsim_dev = container_of(hwstats, struct nsim_dev, hwstats); net = nsim_dev_net(nsim_dev); rtnl_lock(); mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex); if (hwsdev) goto out_unlock_list; netdev = dev_get_by_index(net, ifindex); if (!netdev) { err = -ENODEV; goto out_unlock_list; } hwsdev = kzalloc(sizeof(*hwsdev), GFP_KERNEL); if (!hwsdev) { err = -ENOMEM; goto out_put_netdev; } hwsdev->netdev = netdev; list_add_tail(&hwsdev->list, hwsdev_list); mutex_unlock(&hwstats->hwsdev_list_lock); if (netdev_offload_xstats_enabled(netdev, type)) { nsim_dev_hwsdev_enable(hwsdev, NULL); notify = true; } if (notify) rtnl_offload_xstats_notify(netdev); rtnl_unlock(); return err; out_put_netdev: dev_put(netdev); out_unlock_list: mutex_unlock(&hwstats->hwsdev_list_lock); rtnl_unlock(); return err; } static int nsim_dev_hwstats_disable_ifindex(struct nsim_dev_hwstats *hwstats, int ifindex, enum netdev_offload_xstats_type type, struct list_head *hwsdev_list) { struct nsim_dev_hwstats_netdev *hwsdev; int err = 0; rtnl_lock(); mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex); if (hwsdev) list_del(&hwsdev->list); mutex_unlock(&hwstats->hwsdev_list_lock); if (!hwsdev) { err = -ENOENT; goto unlock_out; } if (netdev_offload_xstats_enabled(hwsdev->netdev, type)) { netdev_offload_xstats_push_delta(hwsdev->netdev, type, &hwsdev->stats); rtnl_offload_xstats_notify(hwsdev->netdev); } nsim_dev_hwsdev_fini(hwsdev); unlock_out: rtnl_unlock(); return err; } static int nsim_dev_hwstats_fail_ifindex(struct nsim_dev_hwstats *hwstats, int ifindex, enum netdev_offload_xstats_type type, struct list_head *hwsdev_list) { struct nsim_dev_hwstats_netdev *hwsdev; int err = 0; mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex); if (!hwsdev) { err = -ENOENT; goto err_hwsdev_list_unlock; } hwsdev->fail_enable = true; err_hwsdev_list_unlock: mutex_unlock(&hwstats->hwsdev_list_lock); return err; } enum nsim_dev_hwstats_do { NSIM_DEV_HWSTATS_DO_DISABLE, NSIM_DEV_HWSTATS_DO_ENABLE, NSIM_DEV_HWSTATS_DO_FAIL, }; struct nsim_dev_hwstats_fops { const struct file_operations fops; enum nsim_dev_hwstats_do action; enum netdev_offload_xstats_type type; }; static ssize_t nsim_dev_hwstats_do_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct nsim_dev_hwstats *hwstats = file->private_data; struct nsim_dev_hwstats_fops *hwsfops; struct list_head *hwsdev_list; int ifindex; int err; hwsfops = container_of(debugfs_real_fops(file), struct nsim_dev_hwstats_fops, fops); err = kstrtoint_from_user(data, count, 0, &ifindex); if (err) return err; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, hwsfops->type); if (WARN_ON(!hwsdev_list)) return -EINVAL; switch (hwsfops->action) { case NSIM_DEV_HWSTATS_DO_DISABLE: err = nsim_dev_hwstats_disable_ifindex(hwstats, ifindex, hwsfops->type, hwsdev_list); break; case NSIM_DEV_HWSTATS_DO_ENABLE: err = nsim_dev_hwstats_enable_ifindex(hwstats, ifindex, hwsfops->type, hwsdev_list); break; case NSIM_DEV_HWSTATS_DO_FAIL: err = nsim_dev_hwstats_fail_ifindex(hwstats, ifindex, hwsfops->type, hwsdev_list); break; } if (err) return err; return count; } #define NSIM_DEV_HWSTATS_FOPS(ACTION, TYPE) \ { \ .fops = { \ .open = simple_open, \ .write = nsim_dev_hwstats_do_write, \ .llseek = generic_file_llseek, \ .owner = THIS_MODULE, \ }, \ .action = ACTION, \ .type = TYPE, \ } static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_disable_fops = NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_DISABLE, NETDEV_OFFLOAD_XSTATS_TYPE_L3); static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_enable_fops = NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_ENABLE, NETDEV_OFFLOAD_XSTATS_TYPE_L3); static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_fail_fops = NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_FAIL, NETDEV_OFFLOAD_XSTATS_TYPE_L3); #undef NSIM_DEV_HWSTATS_FOPS int nsim_dev_hwstats_init(struct nsim_dev *nsim_dev) { struct nsim_dev_hwstats *hwstats = &nsim_dev->hwstats; struct net *net = nsim_dev_net(nsim_dev); int err; mutex_init(&hwstats->hwsdev_list_lock); INIT_LIST_HEAD(&hwstats->l3_list); hwstats->netdevice_nb.notifier_call = nsim_dev_netdevice_event; err = register_netdevice_notifier_net(net, &hwstats->netdevice_nb); if (err) goto err_mutex_destroy; hwstats->ddir = debugfs_create_dir("hwstats", nsim_dev->ddir); if (IS_ERR(hwstats->ddir)) { err = PTR_ERR(hwstats->ddir); goto err_unregister_notifier; } hwstats->l3_ddir = debugfs_create_dir("l3", hwstats->ddir); if (IS_ERR(hwstats->l3_ddir)) { err = PTR_ERR(hwstats->l3_ddir); goto err_remove_hwstats_recursive; } debugfs_create_file("enable_ifindex", 0200, hwstats->l3_ddir, hwstats, &nsim_dev_hwstats_l3_enable_fops.fops); debugfs_create_file("disable_ifindex", 0200, hwstats->l3_ddir, hwstats, &nsim_dev_hwstats_l3_disable_fops.fops); debugfs_create_file("fail_next_enable", 0200, hwstats->l3_ddir, hwstats, &nsim_dev_hwstats_l3_fail_fops.fops); INIT_DELAYED_WORK(&hwstats->traffic_dw, &nsim_dev_hwstats_traffic_work); schedule_delayed_work(&hwstats->traffic_dw, msecs_to_jiffies(NSIM_DEV_HWSTATS_TRAFFIC_MS)); return 0; err_remove_hwstats_recursive: debugfs_remove_recursive(hwstats->ddir); err_unregister_notifier: unregister_netdevice_notifier_net(net, &hwstats->netdevice_nb); err_mutex_destroy: mutex_destroy(&hwstats->hwsdev_list_lock); return err; } static void nsim_dev_hwsdev_list_wipe(struct nsim_dev_hwstats *hwstats, enum netdev_offload_xstats_type type) { struct nsim_dev_hwstats_netdev *hwsdev, *tmp; struct list_head *hwsdev_list; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type); if (WARN_ON(!hwsdev_list)) return; mutex_lock(&hwstats->hwsdev_list_lock); list_for_each_entry_safe(hwsdev, tmp, hwsdev_list, list) { list_del(&hwsdev->list); nsim_dev_hwsdev_fini(hwsdev); } mutex_unlock(&hwstats->hwsdev_list_lock); } void nsim_dev_hwstats_exit(struct nsim_dev *nsim_dev) { struct nsim_dev_hwstats *hwstats = &nsim_dev->hwstats; struct net *net = nsim_dev_net(nsim_dev); cancel_delayed_work_sync(&hwstats->traffic_dw); debugfs_remove_recursive(hwstats->ddir); unregister_netdevice_notifier_net(net, &hwstats->netdevice_nb); nsim_dev_hwsdev_list_wipe(hwstats, NETDEV_OFFLOAD_XSTATS_TYPE_L3); mutex_destroy(&hwstats->hwsdev_list_lock); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SOCK_REUSEPORT_H #define _SOCK_REUSEPORT_H #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/spinlock.h> #include <net/sock.h> extern spinlock_t reuseport_lock; struct sock_reuseport { struct rcu_head rcu; u16 max_socks; /* length of socks */ u16 num_socks; /* elements in socks */ u16 num_closed_socks; /* closed elements in socks */ u16 incoming_cpu; /* The last synq overflow event timestamp of this * reuse->socks[] group. */ unsigned int synq_overflow_ts; /* ID stays the same even after the size of socks[] grows. */ unsigned int reuseport_id; unsigned int bind_inany:1; unsigned int has_conns:1; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[] __counted_by(max_socks); }; extern int reuseport_alloc(struct sock *sk, bool bind_inany); extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); void reuseport_stop_listen_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len); struct sock *reuseport_migrate_sock(struct sock *sk, struct sock *migrating_sk, struct sk_buff *skb); extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); extern int reuseport_detach_prog(struct sock *sk); static inline bool reuseport_has_conns(struct sock *sk) { struct sock_reuseport *reuse; bool ret = false; rcu_read_lock(); reuse = rcu_dereference(sk->sk_reuseport_cb); if (reuse && reuse->has_conns) ret = true; rcu_read_unlock(); return ret; } void reuseport_has_conns_set(struct sock *sk); void reuseport_update_incoming_cpu(struct sock *sk, int val); #endif /* _SOCK_REUSEPORT_H */
4 4 8 8 52 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 // SPDX-License-Identifier: GPL-2.0 #include <linux/proc_fs.h> #include <linux/ethtool.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/bonding.h> #include "bonding_priv.h" static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; loff_t off = 0; rcu_read_lock(); if (*pos == 0) return SEQ_START_TOKEN; bond_for_each_slave_rcu(bond, slave, iter) if (++off == *pos) return slave; return NULL; } static void *bond_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; bool found = false; ++*pos; if (v == SEQ_START_TOKEN) return bond_first_slave_rcu(bond); bond_for_each_slave_rcu(bond, slave, iter) { if (found) return slave; if (slave == v) found = true; } return NULL; } static void bond_info_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void bond_info_show_master(struct seq_file *seq) { struct bonding *bond = pde_data(file_inode(seq->file)); const struct bond_opt_value *optval; struct slave *curr, *primary; int i; curr = rcu_dereference(bond->curr_active_slave); seq_printf(seq, "Bonding Mode: %s", bond_mode_name(BOND_MODE(bond))); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && bond->params.fail_over_mac) { optval = bond_opt_get_val(BOND_OPT_FAIL_OVER_MAC, bond->params.fail_over_mac); seq_printf(seq, " (fail_over_mac %s)", optval->string); } seq_printf(seq, "\n"); if (bond_mode_uses_xmit_hash(bond)) { optval = bond_opt_get_val(BOND_OPT_XMIT_HASH, bond->params.xmit_policy); seq_printf(seq, "Transmit Hash Policy: %s (%d)\n", optval->string, bond->params.xmit_policy); } if (bond_uses_primary(bond)) { primary = rcu_dereference(bond->primary_slave); seq_printf(seq, "Primary Slave: %s", primary ? primary->dev->name : "None"); if (primary) { optval = bond_opt_get_val(BOND_OPT_PRIMARY_RESELECT, bond->params.primary_reselect); seq_printf(seq, " (primary_reselect %s)", optval->string); } seq_printf(seq, "\nCurrently Active Slave: %s\n", (curr) ? curr->dev->name : "None"); } seq_printf(seq, "MII Status: %s\n", netif_carrier_ok(bond->dev) ? "up" : "down"); seq_printf(seq, "MII Polling Interval (ms): %d\n", bond->params.miimon); seq_printf(seq, "Up Delay (ms): %d\n", bond->params.updelay * bond->params.miimon); seq_printf(seq, "Down Delay (ms): %d\n", bond->params.downdelay * bond->params.miimon); seq_printf(seq, "Peer Notification Delay (ms): %d\n", bond->params.peer_notif_delay * bond->params.miimon); /* ARP information */ if (bond->params.arp_interval > 0) { int printed = 0; seq_printf(seq, "ARP Polling Interval (ms): %d\n", bond->params.arp_interval); seq_printf(seq, "ARP Missed Max: %u\n", bond->params.missed_max); seq_printf(seq, "ARP IP target/s (n.n.n.n form):"); for (i = 0; (i < BOND_MAX_ARP_TARGETS); i++) { if (!bond->params.arp_targets[i]) break; if (printed) seq_printf(seq, ","); seq_printf(seq, " %pI4", &bond->params.arp_targets[i]); printed = 1; } seq_printf(seq, "\n"); #if IS_ENABLED(CONFIG_IPV6) printed = 0; seq_printf(seq, "NS IPv6 target/s (xx::xx form):"); for (i = 0; (i < BOND_MAX_NS_TARGETS); i++) { if (ipv6_addr_any(&bond->params.ns_targets[i])) break; if (printed) seq_printf(seq, ","); seq_printf(seq, " %pI6c", &bond->params.ns_targets[i]); printed = 1; } seq_printf(seq, "\n"); #endif } if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; seq_puts(seq, "\n802.3ad info\n"); seq_printf(seq, "LACP active: %s\n", (bond->params.lacp_active) ? "on" : "off"); seq_printf(seq, "LACP rate: %s\n", (bond->params.lacp_fast) ? "fast" : "slow"); seq_printf(seq, "Min links: %d\n", bond->params.min_links); optval = bond_opt_get_val(BOND_OPT_AD_SELECT, bond->params.ad_select); seq_printf(seq, "Aggregator selection policy (ad_select): %s\n", optval->string); if (capable(CAP_NET_ADMIN)) { seq_printf(seq, "System priority: %d\n", BOND_AD_INFO(bond).system.sys_priority); seq_printf(seq, "System MAC address: %pM\n", &BOND_AD_INFO(bond).system.sys_mac_addr); if (__bond_3ad_get_active_agg_info(bond, &ad_info)) { seq_printf(seq, "bond %s has no active aggregator\n", bond->dev->name); } else { seq_printf(seq, "Active Aggregator Info:\n"); seq_printf(seq, "\tAggregator ID: %d\n", ad_info.aggregator_id); seq_printf(seq, "\tNumber of ports: %d\n", ad_info.ports); seq_printf(seq, "\tActor Key: %d\n", ad_info.actor_key); seq_printf(seq, "\tPartner Key: %d\n", ad_info.partner_key); seq_printf(seq, "\tPartner Mac Address: %pM\n", ad_info.partner_system); } } } } static void bond_info_show_slave(struct seq_file *seq, const struct slave *slave) { struct bonding *bond = pde_data(file_inode(seq->file)); seq_printf(seq, "\nSlave Interface: %s\n", slave->dev->name); seq_printf(seq, "MII Status: %s\n", bond_slave_link_status(slave->link)); if (slave->speed == SPEED_UNKNOWN) seq_printf(seq, "Speed: %s\n", "Unknown"); else seq_printf(seq, "Speed: %d Mbps\n", slave->speed); if (slave->duplex == DUPLEX_UNKNOWN) seq_printf(seq, "Duplex: %s\n", "Unknown"); else seq_printf(seq, "Duplex: %s\n", slave->duplex ? "full" : "half"); seq_printf(seq, "Link Failure Count: %u\n", slave->link_failure_count); seq_printf(seq, "Permanent HW addr: %*phC\n", slave->dev->addr_len, slave->perm_hwaddr); seq_printf(seq, "Slave queue ID: %d\n", READ_ONCE(slave->queue_id)); if (BOND_MODE(bond) == BOND_MODE_8023AD) { const struct port *port = &SLAVE_AD_INFO(slave)->port; const struct aggregator *agg = port->aggregator; if (agg) { seq_printf(seq, "Aggregator ID: %d\n", agg->aggregator_identifier); seq_printf(seq, "Actor Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_actor_state)); seq_printf(seq, "Partner Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_partner_state)); seq_printf(seq, "Actor Churned Count: %d\n", port->churn_actor_count); seq_printf(seq, "Partner Churned Count: %d\n", port->churn_partner_count); if (capable(CAP_NET_ADMIN)) { seq_puts(seq, "details actor lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->actor_system_priority); seq_printf(seq, " system mac address: %pM\n", &port->actor_system); seq_printf(seq, " port key: %d\n", port->actor_oper_port_key); seq_printf(seq, " port priority: %d\n", port->actor_port_priority); seq_printf(seq, " port number: %d\n", port->actor_port_number); seq_printf(seq, " port state: %d\n", port->actor_oper_port_state); seq_puts(seq, "details partner lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->partner_oper.system_priority); seq_printf(seq, " system mac address: %pM\n", &port->partner_oper.system); seq_printf(seq, " oper key: %d\n", port->partner_oper.key); seq_printf(seq, " port priority: %d\n", port->partner_oper.port_priority); seq_printf(seq, " port number: %d\n", port->partner_oper.port_number); seq_printf(seq, " port state: %d\n", port->partner_oper.port_state); } } else { seq_puts(seq, "Aggregator ID: N/A\n"); } } } static int bond_info_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_printf(seq, "%s\n", bond_version); bond_info_show_master(seq); } else bond_info_show_slave(seq, v); return 0; } static const struct seq_operations bond_info_seq_ops = { .start = bond_info_seq_start, .next = bond_info_seq_next, .stop = bond_info_seq_stop, .show = bond_info_seq_show, }; void bond_create_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir) { bond->proc_entry = proc_create_seq_data(bond_dev->name, 0444, bn->proc_dir, &bond_info_seq_ops, bond); if (bond->proc_entry == NULL) netdev_warn(bond_dev, "Cannot create /proc/net/%s/%s\n", DRV_NAME, bond_dev->name); else memcpy(bond->proc_file_name, bond_dev->name, IFNAMSIZ); } } void bond_remove_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir && bond->proc_entry) { remove_proc_entry(bond->proc_file_name, bn->proc_dir); memset(bond->proc_file_name, 0, IFNAMSIZ); bond->proc_entry = NULL; } } /* Create the bonding directory under /proc/net, if doesn't exist yet. * Caller must hold rtnl_lock. */ void __net_init bond_create_proc_dir(struct bond_net *bn) { if (!bn->proc_dir) { bn->proc_dir = proc_mkdir(DRV_NAME, bn->net->proc_net); if (!bn->proc_dir) pr_warn("Warning: Cannot create /proc/net/%s\n", DRV_NAME); } } /* Destroy the bonding directory under /proc/net, if empty. */ void __net_exit bond_destroy_proc_dir(struct bond_net *bn) { if (bn->proc_dir) { remove_proc_entry(DRV_NAME, bn->net->proc_net); bn->proc_dir = NULL; } }
1208 1354 1170 756 1349 1355 1349 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_RCUREF_H #define _LINUX_RCUREF_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/preempt.h> #include <linux/rcupdate.h> #define RCUREF_ONEREF 0x00000000U #define RCUREF_MAXREF 0x7FFFFFFFU #define RCUREF_SATURATED 0xA0000000U #define RCUREF_RELEASED 0xC0000000U #define RCUREF_DEAD 0xE0000000U #define RCUREF_NOREF 0xFFFFFFFFU /** * rcuref_init - Initialize a rcuref reference count with the given reference count * @ref: Pointer to the reference count * @cnt: The initial reference count typically '1' */ static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) { atomic_set(&ref->refcnt, cnt - 1); } /** * rcuref_read - Read the number of held reference counts of a rcuref * @ref: Pointer to the reference count * * Return: The number of held references (0 ... N) */ static inline unsigned int rcuref_read(rcuref_t *ref) { unsigned int c = atomic_read(&ref->refcnt); /* Return 0 if within the DEAD zone. */ return c >= RCUREF_RELEASED ? 0 : c + 1; } extern __must_check bool rcuref_get_slowpath(rcuref_t *ref); /** * rcuref_get - Acquire one reference on a rcuref reference count * @ref: Pointer to the reference count * * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See documentation in lib/rcuref.c * * Return: * False if the attempt to acquire a reference failed. This happens * when the last reference has been put already * * True if a reference was successfully acquired */ static inline __must_check bool rcuref_get(rcuref_t *ref) { /* * Unconditionally increase the reference count. The saturation and * dead zones provide enough tolerance for this. */ if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt))) return true; /* Handle the cases inside the saturation and dead zones */ return rcuref_get_slowpath(ref); } extern __must_check bool rcuref_put_slowpath(rcuref_t *ref); /* * Internal helper. Do not invoke directly. */ static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) { RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), "suspicious rcuref_put_rcusafe() usage"); /* * Unconditionally decrease the reference count. The saturation and * dead zones provide enough tolerance for this. */ if (likely(!atomic_add_negative_release(-1, &ref->refcnt))) return false; /* * Handle the last reference drop and cases inside the saturation * and dead zones. */ return rcuref_put_slowpath(ref); } /** * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe * @ref: Pointer to the reference count * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Can be invoked from contexts, which guarantee that no grace period can * happen which would free the object concurrently if the decrement drops * the last reference and the slowpath races against a concurrent get() and * put() pair. rcu_read_lock()'ed and atomic contexts qualify. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely release the * object which is protected by the reference counter. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * release the protected object. */ static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref) { return __rcuref_put(ref); } /** * rcuref_put -- Release one reference for a rcuref reference count * @ref: Pointer to the reference count * * Can be invoked from any context. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Return: * * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ static inline __must_check bool rcuref_put(rcuref_t *ref) { bool released; preempt_disable(); released = __rcuref_put(ref); preempt_enable(); return released; } #endif
292 292 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ #include <linux/mmzone.h> #include <linux/range.h> #include <linux/ioport.h> #include <linux/percpu-refcount.h> struct resource; struct device; /** * struct vmem_altmap - pre-allocated storage for vmemmap_populate * @base_pfn: base of the entire dev_pagemap mapping * @reserve: pages mapped, but reserved for driver use (relative to @base) * @free: free pages set aside in the mapping for memmap storage * @align: pages reserved to meet allocation alignments * @alloc: track pages consumed, private to vmemmap_populate() */ struct vmem_altmap { unsigned long base_pfn; const unsigned long end_pfn; const unsigned long reserve; unsigned long free; unsigned long align; unsigned long alloc; bool inaccessible; }; /* * Specialize ZONE_DEVICE memory into multiple types each has a different * usage. * * MEMORY_DEVICE_PRIVATE: * Device memory that is not directly addressable by the CPU: CPU can neither * read nor write private memory. In this case, we do still have struct pages * backing the device memory. Doing so simplifies the implementation, but it is * important to remember that there are certain points at which the struct page * must be treated as an opaque object, rather than a "normal" struct page. * * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/mm/hmm.rst. * * MEMORY_DEVICE_COHERENT: * Device memory that is cache coherent from device and CPU point of view. This * is used on platforms that have an advanced system bus (like CAPI or CXL). A * driver can hotplug the device memory using ZONE_DEVICE and with that memory * type. Any page of a process can be migrated to such memory. However no one * should be allowed to pin such memory so that it can always be evicted. * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. In support of coordinating page * pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a * wakeup event whenever a page is unpinned and becomes idle. This * wakeup is used to coordinate physical address space management (ex: * fs truncate/hole punch) vs pinned pages (ex: device dma). * * MEMORY_DEVICE_GENERIC: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. This is for example used by DAX devices * that expose memory using a character device. * * MEMORY_DEVICE_PCI_P2PDMA: * Device memory residing in a PCI BAR intended for use with Peer-to-Peer * transactions. */ enum memory_type { /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, MEMORY_DEVICE_COHERENT, MEMORY_DEVICE_FS_DAX, MEMORY_DEVICE_GENERIC, MEMORY_DEVICE_PCI_P2PDMA, }; struct dev_pagemap_ops { /* * Called once the page refcount reaches 0. The reference count will be * reset to one by the core code after the method is called to prepare * for handing out the page again. */ void (*page_free)(struct page *page); /* * Used for private (un-addressable) device memory only. Must migrate * the page back to a CPU accessible page. */ vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); /* * Handle the memory failure happens on a range of pfns. Notify the * processes who are using these pfns, and try to recover the data on * them if necessary. The mf_flags is finally passed to the recover * function through the whole notify routine. * * When this is not implemented, or it returns -EOPNOTSUPP, the caller * will fall back to a common handler called mf_generic_kill_procs(). */ int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn, unsigned long nr_pages, int mf_flags); }; #define PGMAP_ALTMAP_VALID (1 << 0) /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations * @ref: reference count that pins the devm_memremap_pages() mapping * @done: completion for @ref * @type: memory type: see MEMORY_* above in memremap.h * @flags: PGMAP_* flags to specify defailed behavior * @vmemmap_shift: structural definition of how the vmemmap page metadata * is populated, specifically the metadata page order. * A zero value (default) uses base pages as the vmemmap metadata * representation. A bigger value will set up compound struct pages * of the requested order value. * @ops: method table * @owner: an opaque pointer identifying the entity that manages this * instance. Used by various helpers to make sure that no * foreign ZONE_DEVICE memory is accessed. * @nr_range: number of ranges to be mapped * @range: range to be mapped when nr_range == 1 * @ranges: array of ranges to be mapped when nr_range > 1 */ struct dev_pagemap { struct vmem_altmap altmap; struct percpu_ref ref; struct completion done; enum memory_type type; unsigned int flags; unsigned long vmemmap_shift; const struct dev_pagemap_ops *ops; void *owner; int nr_range; union { struct range range; DECLARE_FLEX_ARRAY(struct range, ranges); }; }; static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap) { return pgmap->ops && pgmap->ops->memory_failure; } static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) { if (pgmap->flags & PGMAP_ALTMAP_VALID) return &pgmap->altmap; return NULL; } static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) { return 1 << pgmap->vmemmap_shift; } static inline bool is_device_private_page(const struct page *page) { return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PRIVATE; } static inline bool folio_is_device_private(const struct folio *folio) { return is_device_private_page(&folio->page); } static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } static inline bool is_device_coherent_page(const struct page *page) { return is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_COHERENT; } static inline bool folio_is_device_coherent(const struct folio *folio) { return is_device_coherent_page(&folio->page); } #ifdef CONFIG_ZONE_DEVICE void zone_device_page_init(struct page *page); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { /* * Fail attempts to call devm_memremap_pages() without * ZONE_DEVICE support enabled, this requires callers to fall * back to plain devm_memremap() based on config */ WARN_ON_ONCE(1); return ERR_PTR(-ENXIO); } static inline void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) { } static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap) { return NULL; } static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) { return false; } /* when memremap_pages() is disabled all archs can remap a single page */ static inline unsigned long memremap_compat_align(void) { return PAGE_SIZE; } #endif /* CONFIG_ZONE_DEVICE */ static inline void put_dev_pagemap(struct dev_pagemap *pgmap) { if (pgmap) percpu_ref_put(&pgmap->ref); } #endif /* _LINUX_MEMREMAP_H_ */
5 1289 70 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PID_NS_H #define _LINUX_PID_NS_H #include <linux/sched.h> #include <linux/bug.h> #include <linux/mm.h> #include <linux/workqueue.h> #include <linux/threads.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/idr.h> /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 struct fs_pin; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) /* modes for vm.memfd_noexec sysctl */ #define MEMFD_NOEXEC_SCOPE_EXEC 0 /* MFD_EXEC implied if unset */ #define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 /* MFD_NOEXEC_SEAL implied if unset */ #define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 /* same as 1, except MFD_EXEC rejected */ #endif struct pid_namespace { struct idr idr; struct rcu_head rcu; unsigned int pid_allocated; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; struct pid_namespace *parent; #ifdef CONFIG_BSD_PROCESS_ACCT struct fs_pin *bacct; #endif struct user_namespace *user_ns; struct ucounts *ucounts; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) int memfd_noexec_scope; #endif } __randomize_layout; extern struct pid_namespace init_pid_ns; #define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) refcount_inc(&ns->ns.count); return ns; } #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { int scope = MEMFD_NOEXEC_SCOPE_EXEC; for (; ns; ns = ns->parent) scope = max(scope, READ_ONCE(ns->memfd_noexec_scope)); return scope; } #else static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { return 0; } #endif extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); #else /* !CONFIG_PID_NS */ #include <linux/err.h> static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { return ns; } static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { return 0; } static inline struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) ns = ERR_PTR(-EINVAL); return ns; } static inline void put_pid_ns(struct pid_namespace *ns) { } static inline void zap_pid_ns_processes(struct pid_namespace *ns) { BUG(); } static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { return 0; } #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); void pid_idr_init(void); static inline bool task_is_in_init_pid_ns(struct task_struct *tsk) { return task_active_pid_ns(tsk) == &init_pid_ns; } #endif /* _LINUX_PID_NS_H */
685 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM bpf_trace #if !defined(_TRACE_BPF_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BPF_TRACE_H #include <linux/tracepoint.h> TRACE_EVENT(bpf_trace_printk, TP_PROTO(const char *bpf_string), TP_ARGS(bpf_string), TP_STRUCT__entry( __string(bpf_string, bpf_string) ), TP_fast_assign( __assign_str(bpf_string); ), TP_printk("%s", __get_str(bpf_string)) ); #endif /* _TRACE_BPF_TRACE_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE bpf_trace #include <trace/define_trace.h>
131 41 41 41 5 20 20 19 7 1 5 6 12 20 22 22 22 6 4 4 1 2 2 7 6 6 7 7 38 38 38 38 151 11 5 6 11 11 18 5 5 15 15 15 5 5 5 5 15 57 2 3 3 1 5 27 2 45 2 27 30 14 29 2 30 32 14 40 24 22 18 6 4 14 15 87 87 6 131 131 131 131 131 131 131 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 // SPDX-License-Identifier: GPL-2.0 /* * device_cgroup.c - device cgroup subsystem * * Copyright 2007 IBM Corp */ #include <linux/bpf-cgroup.h> #include <linux/device_cgroup.h> #include <linux/cgroup.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/mutex.h> #ifdef CONFIG_CGROUP_DEVICE static DEFINE_MUTEX(devcgroup_mutex); enum devcg_behavior { DEVCG_DEFAULT_NONE, DEVCG_DEFAULT_ALLOW, DEVCG_DEFAULT_DENY, }; /* * exception list locking rules: * hold devcgroup_mutex for update/read. * hold rcu_read_lock() for read. */ struct dev_exception_item { u32 major, minor; short type; short access; struct list_head list; struct rcu_head rcu; }; struct dev_cgroup { struct cgroup_subsys_state css; struct list_head exceptions; enum devcg_behavior behavior; }; static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) { return s ? container_of(s, struct dev_cgroup, css) : NULL; } static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { return css_to_devcgroup(task_css(task, devices_cgrp_id)); } /* * called under devcgroup_mutex */ static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp, *new; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry(ex, orig, list) { new = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!new) goto free_and_exit; list_add_tail(&new->list, dest); } return 0; free_and_exit: list_for_each_entry_safe(ex, tmp, dest, list) { list_del(&ex->list); kfree(ex); } return -ENOMEM; } static void dev_exceptions_move(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(ex, tmp, orig, list) { list_move_tail(&ex->list, dest); } } /* * called under devcgroup_mutex */ static int dev_exception_add(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *excopy, *walk; lockdep_assert_held(&devcgroup_mutex); excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!excopy) return -ENOMEM; list_for_each_entry(walk, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access |= ex->access; kfree(excopy); excopy = NULL; } if (excopy != NULL) list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions); return 0; } /* * called under devcgroup_mutex */ static void dev_exception_rm(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *walk, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access &= ~ex->access; if (!walk->access) { list_del_rcu(&walk->list); kfree_rcu(walk, rcu); } } } static void __dev_exception_clean(struct dev_cgroup *dev_cgroup) { struct dev_exception_item *ex, *tmp; list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) { list_del_rcu(&ex->list); kfree_rcu(ex, rcu); } } /** * dev_exception_clean - frees all entries of the exception list * @dev_cgroup: dev_cgroup with the exception list to be cleaned * * called under devcgroup_mutex */ static void dev_exception_clean(struct dev_cgroup *dev_cgroup) { lockdep_assert_held(&devcgroup_mutex); __dev_exception_clean(dev_cgroup); } static inline bool is_devcg_online(const struct dev_cgroup *devcg) { return (devcg->behavior != DEVCG_DEFAULT_NONE); } /** * devcgroup_online - initializes devcgroup's behavior and exceptions based on * parent's * @css: css getting online * returns 0 in case of success, error code otherwise */ static int devcgroup_online(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css->parent); int ret = 0; mutex_lock(&devcgroup_mutex); if (parent_dev_cgroup == NULL) dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; else { ret = dev_exceptions_copy(&dev_cgroup->exceptions, &parent_dev_cgroup->exceptions); if (!ret) dev_cgroup->behavior = parent_dev_cgroup->behavior; } mutex_unlock(&devcgroup_mutex); return ret; } static void devcgroup_offline(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); mutex_lock(&devcgroup_mutex); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; mutex_unlock(&devcgroup_mutex); } /* * called from kernel/cgroup/cgroup.c with cgroup_lock() held. */ static struct cgroup_subsys_state * devcgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct dev_cgroup *dev_cgroup; dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL); if (!dev_cgroup) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&dev_cgroup->exceptions); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; return &dev_cgroup->css; } static void devcgroup_css_free(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); __dev_exception_clean(dev_cgroup); kfree(dev_cgroup); } #define DEVCG_ALLOW 1 #define DEVCG_DENY 2 #define DEVCG_LIST 3 #define MAJMINLEN 13 #define ACCLEN 4 static void set_access(char *acc, short access) { int idx = 0; memset(acc, 0, ACCLEN); if (access & DEVCG_ACC_READ) acc[idx++] = 'r'; if (access & DEVCG_ACC_WRITE) acc[idx++] = 'w'; if (access & DEVCG_ACC_MKNOD) acc[idx++] = 'm'; } static char type_to_char(short type) { if (type == DEVCG_DEV_ALL) return 'a'; if (type == DEVCG_DEV_CHAR) return 'c'; if (type == DEVCG_DEV_BLOCK) return 'b'; return 'X'; } static void set_majmin(char *str, unsigned m) { if (m == ~0) strcpy(str, "*"); else sprintf(str, "%u", m); } static int devcgroup_seq_show(struct seq_file *m, void *v) { struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m)); struct dev_exception_item *ex; char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; rcu_read_lock(); /* * To preserve the compatibility: * - Only show the "all devices" when the default policy is to allow * - List the exceptions in case the default policy is to deny * This way, the file remains as a "whitelist of devices" */ if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { set_access(acc, DEVCG_ACC_MASK); set_majmin(maj, ~0); set_majmin(min, ~0); seq_printf(m, "%c %s:%s %s\n", type_to_char(DEVCG_DEV_ALL), maj, min, acc); } else { list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) { set_access(acc, ex->access); set_majmin(maj, ex->major); set_majmin(min, ex->minor); seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type), maj, min, acc); } } rcu_read_unlock(); return 0; } /** * match_exception - iterates the exception list trying to find a complete match * @exceptions: list of exceptions * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR) * @major: device file major number, ~0 to match all * @minor: device file minor number, ~0 to match all * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD) * * It is considered a complete match if an exception is found that will * contain the entire range of provided parameters. * * Return: true in case it matches an exception completely */ static bool match_exception(struct list_head *exceptions, short type, u32 major, u32 minor, short access) { struct dev_exception_item *ex; list_for_each_entry_rcu(ex, exceptions, list) { if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK)) continue; if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR)) continue; if (ex->major != ~0 && ex->major != major) continue; if (ex->minor != ~0 && ex->minor != minor) continue; /* provided access cannot have more than the exception rule */ if (access & (~ex->access)) continue; return true; } return false; } /** * match_exception_partial - iterates the exception list trying to find a partial match * @exceptions: list of exceptions * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR) * @major: device file major number, ~0 to match all * @minor: device file minor number, ~0 to match all * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD) * * It is considered a partial match if an exception's range is found to * contain *any* of the devices specified by provided parameters. This is * used to make sure no extra access is being granted that is forbidden by * any of the exception list. * * Return: true in case the provided range mat matches an exception completely */ static bool match_exception_partial(struct list_head *exceptions, short type, u32 major, u32 minor, short access) { struct dev_exception_item *ex; list_for_each_entry_rcu(ex, exceptions, list, lockdep_is_held(&devcgroup_mutex)) { if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK)) continue; if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR)) continue; /* * We must be sure that both the exception and the provided * range aren't masking all devices */ if (ex->major != ~0 && major != ~0 && ex->major != major) continue; if (ex->minor != ~0 && minor != ~0 && ex->minor != minor) continue; /* * In order to make sure the provided range isn't matching * an exception, all its access bits shouldn't match the * exception's access bits */ if (!(access & ex->access)) continue; return true; } return false; } /** * verify_new_ex - verifies if a new exception is allowed by parent cgroup's permissions * @dev_cgroup: dev cgroup to be tested against * @refex: new exception * @behavior: behavior of the exception's dev_cgroup * * This is used to make sure a child cgroup won't have more privileges * than its parent */ static bool verify_new_ex(struct dev_cgroup *dev_cgroup, struct dev_exception_item *refex, enum devcg_behavior behavior) { bool match = false; RCU_LOCKDEP_WARN(!rcu_read_lock_held() && !lockdep_is_held(&devcgroup_mutex), "device_cgroup:verify_new_ex called without proper synchronization"); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) { if (behavior == DEVCG_DEFAULT_ALLOW) { /* * new exception in the child doesn't matter, only * adding extra restrictions */ return true; } else { /* * new exception in the child will add more devices * that can be accessed, so it can't match any of * parent's exceptions, even slightly */ match = match_exception_partial(&dev_cgroup->exceptions, refex->type, refex->major, refex->minor, refex->access); if (match) return false; return true; } } else { /* * Only behavior == DEVCG_DEFAULT_DENY allowed here, therefore * the new exception will add access to more devices and must * be contained completely in an parent's exception to be * allowed */ match = match_exception(&dev_cgroup->exceptions, refex->type, refex->major, refex->minor, refex->access); if (match) /* parent has an exception that matches the proposed */ return true; else return false; } return false; } /* * parent_has_perm: * when adding a new allow rule to a device exception list, the rule * must be allowed in the parent device */ static int parent_has_perm(struct dev_cgroup *childcg, struct dev_exception_item *ex) { struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return 1; return verify_new_ex(parent, ex, childcg->behavior); } /** * parent_allows_removal - verify if it's ok to remove an exception * @childcg: child cgroup from where the exception will be removed * @ex: exception being removed * * When removing an exception in cgroups with default ALLOW policy, it must * be checked if removing it will give the child cgroup more access than the * parent. * * Return: true if it's ok to remove exception, false otherwise */ static bool parent_allows_removal(struct dev_cgroup *childcg, struct dev_exception_item *ex) { struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return true; /* It's always allowed to remove access to devices */ if (childcg->behavior == DEVCG_DEFAULT_DENY) return true; /* * Make sure you're not removing part or a whole exception existing in * the parent cgroup */ return !match_exception_partial(&parent->exceptions, ex->type, ex->major, ex->minor, ex->access); } /** * may_allow_all - checks if it's possible to change the behavior to * allow based on parent's rules. * @parent: device cgroup's parent * returns: != 0 in case it's allowed, 0 otherwise */ static inline int may_allow_all(struct dev_cgroup *parent) { if (!parent) return 1; return parent->behavior == DEVCG_DEFAULT_ALLOW; } /** * revalidate_active_exceptions - walks through the active exception list and * revalidates the exceptions based on parent's * behavior and exceptions. The exceptions that * are no longer valid will be removed. * Called with devcgroup_mutex held. * @devcg: cgroup which exceptions will be checked * * This is one of the three key functions for hierarchy implementation. * This function is responsible for re-evaluating all the cgroup's active * exceptions due to a parent's exception change. * Refer to Documentation/admin-guide/cgroup-v1/devices.rst for more details. */ static void revalidate_active_exceptions(struct dev_cgroup *devcg) { struct dev_exception_item *ex; struct list_head *this, *tmp; list_for_each_safe(this, tmp, &devcg->exceptions) { ex = container_of(this, struct dev_exception_item, list); if (!parent_has_perm(devcg, ex)) dev_exception_rm(devcg, ex); } } /** * propagate_exception - propagates a new exception to the children * @devcg_root: device cgroup that added a new exception * @ex: new exception to be propagated * * returns: 0 in case of success, != 0 in case of error */ static int propagate_exception(struct dev_cgroup *devcg_root, struct dev_exception_item *ex) { struct cgroup_subsys_state *pos; int rc = 0; rcu_read_lock(); css_for_each_descendant_pre(pos, &devcg_root->css) { struct dev_cgroup *devcg = css_to_devcgroup(pos); /* * Because devcgroup_mutex is held, no devcg will become * online or offline during the tree walk (see on/offline * methods), and online ones are safe to access outside RCU * read lock without bumping refcnt. */ if (pos == &devcg_root->css || !is_devcg_online(devcg)) continue; rcu_read_unlock(); /* * in case both root's behavior and devcg is allow, a new * restriction means adding to the exception list */ if (devcg_root->behavior == DEVCG_DEFAULT_ALLOW && devcg->behavior == DEVCG_DEFAULT_ALLOW) { rc = dev_exception_add(devcg, ex); if (rc) return rc; } else { /* * in the other possible cases: * root's behavior: allow, devcg's: deny * root's behavior: deny, devcg's: deny * the exception will be removed */ dev_exception_rm(devcg, ex); } revalidate_active_exceptions(devcg); rcu_read_lock(); } rcu_read_unlock(); return rc; } /* * Modify the exception list using allow/deny rules. * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD * so we can give a container CAP_MKNOD to let it create devices but not * modify the exception list. * It seems likely we'll want to add a CAP_CONTAINER capability to allow * us to also grant CAP_SYS_ADMIN to containers without giving away the * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN * * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting * new access is only allowed if you're in the top-level cgroup, or your * parent cgroup has the access you're asking for. */ static int devcgroup_update_access(struct dev_cgroup *devcgroup, int filetype, char *buffer) { const char *b; char temp[12]; /* 11 + 1 characters needed for a u32 */ int count, rc = 0; struct dev_exception_item ex; struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent); struct dev_cgroup tmp_devcgrp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; memset(&ex, 0, sizeof(ex)); memset(&tmp_devcgrp, 0, sizeof(tmp_devcgrp)); b = buffer; switch (*b) { case 'a': switch (filetype) { case DEVCG_ALLOW: if (css_has_online_children(&devcgroup->css)) return -EINVAL; if (!may_allow_all(parent)) return -EPERM; if (!parent) { devcgroup->behavior = DEVCG_DEFAULT_ALLOW; dev_exception_clean(devcgroup); break; } INIT_LIST_HEAD(&tmp_devcgrp.exceptions); rc = dev_exceptions_copy(&tmp_devcgrp.exceptions, &devcgroup->exceptions); if (rc) return rc; dev_exception_clean(devcgroup); rc = dev_exceptions_copy(&devcgroup->exceptions, &parent->exceptions); if (rc) { dev_exceptions_move(&devcgroup->exceptions, &tmp_devcgrp.exceptions); return rc; } devcgroup->behavior = DEVCG_DEFAULT_ALLOW; dev_exception_clean(&tmp_devcgrp); break; case DEVCG_DENY: if (css_has_online_children(&devcgroup->css)) return -EINVAL; dev_exception_clean(devcgroup); devcgroup->behavior = DEVCG_DEFAULT_DENY; break; default: return -EINVAL; } return 0; case 'b': ex.type = DEVCG_DEV_BLOCK; break; case 'c': ex.type = DEVCG_DEV_CHAR; break; default: return -EINVAL; } b++; if (!isspace(*b)) return -EINVAL; b++; if (*b == '*') { ex.major = ~0; b++; } else if (isdigit(*b)) { memset(temp, 0, sizeof(temp)); for (count = 0; count < sizeof(temp) - 1; count++) { temp[count] = *b; b++; if (!isdigit(*b)) break; } rc = kstrtou32(temp, 10, &ex.major); if (rc) return -EINVAL; } else { return -EINVAL; } if (*b != ':') return -EINVAL; b++; /* read minor */ if (*b == '*') { ex.minor = ~0; b++; } else if (isdigit(*b)) { memset(temp, 0, sizeof(temp)); for (count = 0; count < sizeof(temp) - 1; count++) { temp[count] = *b; b++; if (!isdigit(*b)) break; } rc = kstrtou32(temp, 10, &ex.minor); if (rc) return -EINVAL; } else { return -EINVAL; } if (!isspace(*b)) return -EINVAL; for (b++, count = 0; count < 3; count++, b++) { switch (*b) { case 'r': ex.access |= DEVCG_ACC_READ; break; case 'w': ex.access |= DEVCG_ACC_WRITE; break; case 'm': ex.access |= DEVCG_ACC_MKNOD; break; case '\n': case '\0': count = 3; break; default: return -EINVAL; } } switch (filetype) { case DEVCG_ALLOW: /* * If the default policy is to allow by default, try to remove * an matching exception instead. And be silent about it: we * don't want to break compatibility */ if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { /* Check if the parent allows removing it first */ if (!parent_allows_removal(devcgroup, &ex)) return -EPERM; dev_exception_rm(devcgroup, &ex); break; } if (!parent_has_perm(devcgroup, &ex)) return -EPERM; rc = dev_exception_add(devcgroup, &ex); break; case DEVCG_DENY: /* * If the default policy is to deny by default, try to remove * an matching exception instead. And be silent about it: we * don't want to break compatibility */ if (devcgroup->behavior == DEVCG_DEFAULT_DENY) dev_exception_rm(devcgroup, &ex); else rc = dev_exception_add(devcgroup, &ex); if (rc) break; /* we only propagate new restrictions */ rc = propagate_exception(devcgroup, &ex); break; default: rc = -EINVAL; } return rc; } static ssize_t devcgroup_access_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int retval; mutex_lock(&devcgroup_mutex); retval = devcgroup_update_access(css_to_devcgroup(of_css(of)), of_cft(of)->private, strstrip(buf)); mutex_unlock(&devcgroup_mutex); return retval ?: nbytes; } static struct cftype dev_cgroup_files[] = { { .name = "allow", .write = devcgroup_access_write, .private = DEVCG_ALLOW, }, { .name = "deny", .write = devcgroup_access_write, .private = DEVCG_DENY, }, { .name = "list", .seq_show = devcgroup_seq_show, .private = DEVCG_LIST, }, { } /* terminate */ }; struct cgroup_subsys devices_cgrp_subsys = { .css_alloc = devcgroup_css_alloc, .css_free = devcgroup_css_free, .css_online = devcgroup_online, .css_offline = devcgroup_offline, .legacy_cftypes = dev_cgroup_files, }; /** * devcgroup_legacy_check_permission - checks if an inode operation is permitted * @type: device type * @major: device major number * @minor: device minor number * @access: combination of DEVCG_ACC_WRITE, DEVCG_ACC_READ and DEVCG_ACC_MKNOD * * returns 0 on success, -EPERM case the operation is not permitted */ static int devcgroup_legacy_check_permission(short type, u32 major, u32 minor, short access) { struct dev_cgroup *dev_cgroup; bool rc; rcu_read_lock(); dev_cgroup = task_devcgroup(current); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) /* Can't match any of the exceptions, even partially */ rc = !match_exception_partial(&dev_cgroup->exceptions, type, major, minor, access); else /* Need to match completely one exception to be allowed */ rc = match_exception(&dev_cgroup->exceptions, type, major, minor, access); rcu_read_unlock(); if (!rc) return -EPERM; return 0; } #endif /* CONFIG_CGROUP_DEVICE */ #if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); if (rc) return rc; #ifdef CONFIG_CGROUP_DEVICE return devcgroup_legacy_check_permission(type, major, minor, access); #else /* CONFIG_CGROUP_DEVICE */ return 0; #endif /* CONFIG_CGROUP_DEVICE */ } EXPORT_SYMBOL(devcgroup_check_permission); #endif /* defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) */
2 2 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 // SPDX-License-Identifier: GPL-2.0-or-later /* * UDPLITE An implementation of the UDP-Lite protocol (RFC 3828). * * Authors: Gerrit Renker <gerrit@erg.abdn.ac.uk> * * Changes: * Fixes: */ #define pr_fmt(fmt) "UDPLite: " fmt #include <linux/export.h> #include <linux/proc_fs.h> #include "udp_impl.h" struct udp_table udplite_table __read_mostly; EXPORT_SYMBOL(udplite_table); /* Designate sk as UDP-Lite socket */ static int udplite_sk_init(struct sock *sk) { udp_init_sock(sk); pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, " "please contact the netdev mailing list\n"); return 0; } static int udplite_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); } static int udplite_err(struct sk_buff *skb, u32 info) { return __udp4_lib_err(skb, info, &udplite_table); } static const struct net_protocol udplite_protocol = { .handler = udplite_rcv, .err_handler = udplite_err, .no_policy = 1, }; struct proto udplite_prot = { .name = "UDP-Lite", .owner = THIS_MODULE, .close = udp_lib_close, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udplite_sk_init, .destroy = udp_destroy_sock, .setsockopt = udp_setsockopt, .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, .memory_allocated = &udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), .h.udp_table = &udplite_table, }; EXPORT_SYMBOL(udplite_prot); static struct inet_protosw udplite4_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_UDPLITE, .prot = &udplite_prot, .ops = &inet_dgram_ops, .flags = INET_PROTOSW_PERMANENT, }; #ifdef CONFIG_PROC_FS static struct udp_seq_afinfo udplite4_seq_afinfo = { .family = AF_INET, .udp_table = &udplite_table, }; static int __net_init udplite4_proc_init_net(struct net *net) { if (!proc_create_net_data("udplite", 0444, net->proc_net, &udp_seq_ops, sizeof(struct udp_iter_state), &udplite4_seq_afinfo)) return -ENOMEM; return 0; } static void __net_exit udplite4_proc_exit_net(struct net *net) { remove_proc_entry("udplite", net->proc_net); } static struct pernet_operations udplite4_net_ops = { .init = udplite4_proc_init_net, .exit = udplite4_proc_exit_net, }; static __init int udplite4_proc_init(void) { return register_pernet_subsys(&udplite4_net_ops); } #else static inline int udplite4_proc_init(void) { return 0; } #endif void __init udplite4_register(void) { udp_table_init(&udplite_table, "UDP-Lite"); if (proto_register(&udplite_prot, 1)) goto out_register_err; if (inet_add_protocol(&udplite_protocol, IPPROTO_UDPLITE) < 0) goto out_unregister_proto; inet_register_protosw(&udplite4_protosw); if (udplite4_proc_init()) pr_err("%s: Cannot register /proc!\n", __func__); return; out_unregister_proto: proto_unregister(&udplite_prot); out_register_err: pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2005,2006,2007,2008 IBM Corporation * * Authors: * Reiner Sailer <sailer@watson.ibm.com> * Mimi Zohar <zohar@us.ibm.com> * * File: ima.h * internal Integrity Measurement Architecture (IMA) definitions */ #ifndef __LINUX_IMA_H #define __LINUX_IMA_H #include <linux/types.h> #include <linux/crypto.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/hash.h> #include <linux/tpm.h> #include <linux/audit.h> #include <crypto/hash_info.h> #include "../integrity.h" enum ima_show_type { IMA_SHOW_BINARY, IMA_SHOW_BINARY_NO_FIELD_LEN, IMA_SHOW_BINARY_OLD_STRING_FMT, IMA_SHOW_ASCII }; enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8, TPM_PCR10 = 10 }; /* digest size for IMA, fits SHA1 or MD5 */ #define IMA_DIGEST_SIZE SHA1_DIGEST_SIZE #define IMA_EVENT_NAME_LEN_MAX 255 #define IMA_HASH_BITS 10 #define IMA_MEASURE_HTABLE_SIZE (1 << IMA_HASH_BITS) #define IMA_TEMPLATE_FIELD_ID_MAX_LEN 16 #define IMA_TEMPLATE_NUM_FIELDS_MAX 15 #define IMA_TEMPLATE_IMA_NAME "ima" #define IMA_TEMPLATE_IMA_FMT "d|n" #define NR_BANKS(chip) ((chip != NULL) ? chip->nr_allocated_banks : 0) /* current content of the policy */ extern int ima_policy_flag; /* bitset of digests algorithms allowed in the setxattr hook */ extern atomic_t ima_setxattr_allowed_hash_algorithms; /* IMA hash algorithm description */ struct ima_algo_desc { struct crypto_shash *tfm; enum hash_algo algo; }; /* set during initialization */ extern int ima_hash_algo __ro_after_init; extern int ima_sha1_idx __ro_after_init; extern int ima_hash_algo_idx __ro_after_init; extern int ima_extra_slots __ro_after_init; extern struct ima_algo_desc *ima_algo_array __ro_after_init; extern int ima_appraise; extern struct tpm_chip *ima_tpm_chip; extern const char boot_aggregate_name[]; /* IMA event related data */ struct ima_event_data { struct ima_iint_cache *iint; struct file *file; const unsigned char *filename; struct evm_ima_xattr_data *xattr_value; int xattr_len; const struct modsig *modsig; const char *violation; const void *buf; int buf_len; }; /* IMA template field data definition */ struct ima_field_data { u8 *data; u32 len; }; /* IMA template field definition */ struct ima_template_field { const char field_id[IMA_TEMPLATE_FIELD_ID_MAX_LEN]; int (*field_init)(struct ima_event_data *event_data, struct ima_field_data *field_data); void (*field_show)(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data); }; /* IMA template descriptor definition */ struct ima_template_desc { struct list_head list; char *name; char *fmt; int num_fields; const struct ima_template_field **fields; }; struct ima_template_entry { int pcr; struct tpm_digest *digests; struct ima_template_desc *template_desc; /* template descriptor */ u32 template_data_len; struct ima_field_data template_data[]; /* template related data */ }; struct ima_queue_entry { struct hlist_node hnext; /* place in hash collision list */ struct list_head later; /* place in ima_measurements list */ struct ima_template_entry *entry; }; extern struct list_head ima_measurements; /* list of all measurements */ /* Some details preceding the binary serialized measurement list */ struct ima_kexec_hdr { u16 version; u16 _reserved0; u32 _reserved1; u64 buffer_size; u64 count; }; /* IMA iint action cache flags */ #define IMA_MEASURE 0x00000001 #define IMA_MEASURED 0x00000002 #define IMA_APPRAISE 0x00000004 #define IMA_APPRAISED 0x00000008 /*#define IMA_COLLECT 0x00000010 do not use this flag */ #define IMA_COLLECTED 0x00000020 #define IMA_AUDIT 0x00000040 #define IMA_AUDITED 0x00000080 #define IMA_HASH 0x00000100 #define IMA_HASHED 0x00000200 /* IMA iint policy rule cache flags */ #define IMA_NONACTION_FLAGS 0xff000000 #define IMA_DIGSIG_REQUIRED 0x01000000 #define IMA_PERMIT_DIRECTIO 0x02000000 #define IMA_NEW_FILE 0x04000000 #define IMA_FAIL_UNVERIFIABLE_SIGS 0x10000000 #define IMA_MODSIG_ALLOWED 0x20000000 #define IMA_CHECK_BLACKLIST 0x40000000 #define IMA_VERITY_REQUIRED 0x80000000 #define IMA_DO_MASK (IMA_MEASURE | IMA_APPRAISE | IMA_AUDIT | \ IMA_HASH | IMA_APPRAISE_SUBMASK) #define IMA_DONE_MASK (IMA_MEASURED | IMA_APPRAISED | IMA_AUDITED | \ IMA_HASHED | IMA_COLLECTED | \ IMA_APPRAISED_SUBMASK) /* IMA iint subaction appraise cache flags */ #define IMA_FILE_APPRAISE 0x00001000 #define IMA_FILE_APPRAISED 0x00002000 #define IMA_MMAP_APPRAISE 0x00004000 #define IMA_MMAP_APPRAISED 0x00008000 #define IMA_BPRM_APPRAISE 0x00010000 #define IMA_BPRM_APPRAISED 0x00020000 #define IMA_READ_APPRAISE 0x00040000 #define IMA_READ_APPRAISED 0x00080000 #define IMA_CREDS_APPRAISE 0x00100000 #define IMA_CREDS_APPRAISED 0x00200000 #define IMA_APPRAISE_SUBMASK (IMA_FILE_APPRAISE | IMA_MMAP_APPRAISE | \ IMA_BPRM_APPRAISE | IMA_READ_APPRAISE | \ IMA_CREDS_APPRAISE) #define IMA_APPRAISED_SUBMASK (IMA_FILE_APPRAISED | IMA_MMAP_APPRAISED | \ IMA_BPRM_APPRAISED | IMA_READ_APPRAISED | \ IMA_CREDS_APPRAISED) /* IMA iint cache atomic_flags */ #define IMA_CHANGE_XATTR 0 #define IMA_UPDATE_XATTR 1 #define IMA_CHANGE_ATTR 2 #define IMA_DIGSIG 3 #define IMA_MUST_MEASURE 4 /* IMA integrity metadata associated with an inode */ struct ima_iint_cache { struct mutex mutex; /* protects: version, flags, digest */ struct integrity_inode_attributes real_inode; unsigned long flags; unsigned long measured_pcrs; unsigned long atomic_flags; enum integrity_status ima_file_status:4; enum integrity_status ima_mmap_status:4; enum integrity_status ima_bprm_status:4; enum integrity_status ima_read_status:4; enum integrity_status ima_creds_status:4; struct ima_digest_data *ima_hash; }; extern struct lsm_blob_sizes ima_blob_sizes; static inline struct ima_iint_cache * ima_inode_get_iint(const struct inode *inode) { struct ima_iint_cache **iint_sec; if (unlikely(!inode->i_security)) return NULL; iint_sec = inode->i_security + ima_blob_sizes.lbs_inode; return *iint_sec; } static inline void ima_inode_set_iint(const struct inode *inode, struct ima_iint_cache *iint) { struct ima_iint_cache **iint_sec; if (unlikely(!inode->i_security)) return; iint_sec = inode->i_security + ima_blob_sizes.lbs_inode; *iint_sec = iint; } struct ima_iint_cache *ima_iint_find(struct inode *inode); struct ima_iint_cache *ima_inode_get(struct inode *inode); void ima_inode_free_rcu(void *inode_security); void __init ima_iintcache_init(void); extern const int read_idmap[]; #ifdef CONFIG_HAVE_IMA_KEXEC void ima_load_kexec_buffer(void); #else static inline void ima_load_kexec_buffer(void) {} #endif /* CONFIG_HAVE_IMA_KEXEC */ #ifdef CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS void ima_post_key_create_or_update(struct key *keyring, struct key *key, const void *payload, size_t plen, unsigned long flags, bool create); #endif /* * The default binary_runtime_measurements list format is defined as the * platform native format. The canonical format is defined as little-endian. */ extern bool ima_canonical_fmt; /* Internal IMA function definitions */ int ima_init(void); int ima_fs_init(void); int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, const unsigned char *filename); int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash); int ima_calc_buffer_hash(const void *buf, loff_t len, struct ima_digest_data *hash); int ima_calc_field_array_hash(struct ima_field_data *field_data, struct ima_template_entry *entry); int ima_calc_boot_aggregate(struct ima_digest_data *hash); void ima_add_violation(struct file *file, const unsigned char *filename, struct ima_iint_cache *iint, const char *op, const char *cause); int ima_init_crypto(void); void ima_putc(struct seq_file *m, void *data, int datalen); void ima_print_digest(struct seq_file *m, u8 *digest, u32 size); int template_desc_init_fields(const char *template_fmt, const struct ima_template_field ***fields, int *num_fields); struct ima_template_desc *ima_template_desc_current(void); struct ima_template_desc *ima_template_desc_buf(void); struct ima_template_desc *lookup_template_desc(const char *name); bool ima_template_has_modsig(const struct ima_template_desc *ima_template); int ima_restore_measurement_entry(struct ima_template_entry *entry); int ima_restore_measurement_list(loff_t bufsize, void *buf); int ima_measurements_show(struct seq_file *m, void *v); unsigned long ima_get_binary_runtime_size(void); int ima_init_template(void); void ima_init_template_list(void); int __init ima_init_digests(void); int ima_lsm_policy_change(struct notifier_block *nb, unsigned long event, void *lsm_data); /* * used to protect h_table and sha_table */ extern spinlock_t ima_queue_lock; struct ima_h_table { atomic_long_t len; /* number of stored measurements in the list */ atomic_long_t violations; struct hlist_head queue[IMA_MEASURE_HTABLE_SIZE]; }; extern struct ima_h_table ima_htable; static inline unsigned int ima_hash_key(u8 *digest) { /* there is no point in taking a hash of part of a digest */ return (digest[0] | digest[1] << 8) % IMA_MEASURE_HTABLE_SIZE; } #define __ima_hooks(hook) \ hook(NONE, none) \ hook(FILE_CHECK, file) \ hook(MMAP_CHECK, mmap) \ hook(MMAP_CHECK_REQPROT, mmap_reqprot) \ hook(BPRM_CHECK, bprm) \ hook(CREDS_CHECK, creds) \ hook(POST_SETATTR, post_setattr) \ hook(MODULE_CHECK, module) \ hook(FIRMWARE_CHECK, firmware) \ hook(KEXEC_KERNEL_CHECK, kexec_kernel) \ hook(KEXEC_INITRAMFS_CHECK, kexec_initramfs) \ hook(POLICY_CHECK, policy) \ hook(KEXEC_CMDLINE, kexec_cmdline) \ hook(KEY_CHECK, key) \ hook(CRITICAL_DATA, critical_data) \ hook(SETXATTR_CHECK, setxattr_check) \ hook(MAX_CHECK, none) #define __ima_hook_enumify(ENUM, str) ENUM, #define __ima_stringify(arg) (#arg) #define __ima_hook_measuring_stringify(ENUM, str) \ (__ima_stringify(measuring_ ##str)), enum ima_hooks { __ima_hooks(__ima_hook_enumify) }; static const char * const ima_hooks_measure_str[] = { __ima_hooks(__ima_hook_measuring_stringify) }; static inline const char *func_measure_str(enum ima_hooks func) { if (func >= MAX_CHECK) return ima_hooks_measure_str[NONE]; return ima_hooks_measure_str[func]; } extern const char *const func_tokens[]; struct modsig; #ifdef CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS /* * To track keys that need to be measured. */ struct ima_key_entry { struct list_head list; void *payload; size_t payload_len; char *keyring_name; }; void ima_init_key_queue(void); bool ima_should_queue_key(void); bool ima_queue_key(struct key *keyring, const void *payload, size_t payload_len); void ima_process_queued_keys(void); #else static inline void ima_init_key_queue(void) {} static inline bool ima_should_queue_key(void) { return false; } static inline bool ima_queue_key(struct key *keyring, const void *payload, size_t payload_len) { return false; } static inline void ima_process_queued_keys(void) {} #endif /* CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS */ /* LIM API function definitions */ int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos); int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func); int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file, void *buf, loff_t size, enum hash_algo algo, struct modsig *modsig); void ima_store_measurement(struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc); int process_buffer_measurement(struct mnt_idmap *idmap, struct inode *inode, const void *buf, int size, const char *eventname, enum ima_hooks func, int pcr, const char *func_data, bool buf_hash, u8 *digest, size_t digest_len); void ima_audit_measurement(struct ima_iint_cache *iint, const unsigned char *filename); int ima_alloc_init_template(struct ima_event_data *event_data, struct ima_template_entry **entry, struct ima_template_desc *template_desc); int ima_store_template(struct ima_template_entry *entry, int violation, struct inode *inode, const unsigned char *filename, int pcr); void ima_free_template_entry(struct ima_template_entry *entry); const char *ima_d_path(const struct path *path, char **pathbuf, char *filename); /* IMA policy related functions */ int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, enum ima_hooks func, int mask, int flags, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos); void ima_init_policy(void); void ima_update_policy(void); void ima_update_policy_flags(void); ssize_t ima_parse_add_rule(char *); void ima_delete_rules(void); int ima_check_policy(void); void *ima_policy_start(struct seq_file *m, loff_t *pos); void *ima_policy_next(struct seq_file *m, void *v, loff_t *pos); void ima_policy_stop(struct seq_file *m, void *v); int ima_policy_show(struct seq_file *m, void *v); /* Appraise integrity measurements */ #define IMA_APPRAISE_ENFORCE 0x01 #define IMA_APPRAISE_FIX 0x02 #define IMA_APPRAISE_LOG 0x04 #define IMA_APPRAISE_MODULES 0x08 #define IMA_APPRAISE_FIRMWARE 0x10 #define IMA_APPRAISE_POLICY 0x20 #define IMA_APPRAISE_KEXEC 0x40 #ifdef CONFIG_IMA_APPRAISE int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr); int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig); int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func); void ima_update_xattr(struct ima_iint_cache *iint, struct file *file); enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func); enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len); int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len); void __init init_ima_appraise_lsm(const struct lsm_id *lsmid); #else static inline int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr) { return 0; } static inline int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig) { return INTEGRITY_UNKNOWN; } static inline int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { return 0; } static inline void ima_update_xattr(struct ima_iint_cache *iint, struct file *file) { } static inline enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func) { return INTEGRITY_UNKNOWN; } static inline enum hash_algo ima_get_hash_algo(struct evm_ima_xattr_data *xattr_value, int xattr_len) { return ima_hash_algo; } static inline int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { return 0; } static inline void __init init_ima_appraise_lsm(const struct lsm_id *lsmid) { } #endif /* CONFIG_IMA_APPRAISE */ #ifdef CONFIG_IMA_APPRAISE_MODSIG int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig); void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size); int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, const u8 **digest, u32 *digest_size); int ima_get_raw_modsig(const struct modsig *modsig, const void **data, u32 *data_len); void ima_free_modsig(struct modsig *modsig); #else static inline int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig) { return -EOPNOTSUPP; } static inline void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size) { } static inline int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, const u8 **digest, u32 *digest_size) { return -EOPNOTSUPP; } static inline int ima_get_raw_modsig(const struct modsig *modsig, const void **data, u32 *data_len) { return -EOPNOTSUPP; } static inline void ima_free_modsig(struct modsig *modsig) { } #endif /* CONFIG_IMA_APPRAISE_MODSIG */ /* LSM based policy rules require audit */ #ifdef CONFIG_IMA_LSM_RULES #define ima_filter_rule_init security_audit_rule_init #define ima_filter_rule_free security_audit_rule_free #define ima_filter_rule_match security_audit_rule_match #else static inline int ima_filter_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, gfp_t gfp) { return -EINVAL; } static inline void ima_filter_rule_free(void *lsmrule) { } static inline int ima_filter_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *lsmrule) { return -EINVAL; } #endif /* CONFIG_IMA_LSM_RULES */ #ifdef CONFIG_IMA_READ_POLICY #define POLICY_FILE_FLAGS (S_IWUSR | S_IRUSR) #else #define POLICY_FILE_FLAGS S_IWUSR #endif /* CONFIG_IMA_READ_POLICY */ #endif /* __LINUX_IMA_H */
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/dsa_stubs.h - Stubs for the Distributed Switch Architecture framework */ #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/net_tstamp.h> #include <net/dsa.h> #if IS_ENABLED(CONFIG_NET_DSA) extern const struct dsa_stubs *dsa_stubs; struct dsa_stubs { int (*conduit_hwtstamp_validate)(struct net_device *dev, const struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); }; static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev, const struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack) { if (!netdev_uses_dsa(dev)) return 0; /* rtnl_lock() is a sufficient guarantee, because as long as * netdev_uses_dsa() returns true, the dsa_core module is still * registered, and so, dsa_unregister_stubs() couldn't have run. * For netdev_uses_dsa() to start returning false, it would imply that * dsa_conduit_teardown() has executed, which requires rtnl_lock(). */ ASSERT_RTNL(); return dsa_stubs->conduit_hwtstamp_validate(dev, config, extack); } #else static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev, const struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack) { return 0; } #endif
29 29 29 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2018 - 2021, 2023 - 2024 Intel Corporation */ #include <net/cfg80211.h> #include "core.h" #include "nl80211.h" #include "rdev-ops.h" static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, struct nlattr *ftmreq, struct cfg80211_pmsr_request_peer *out, struct genl_info *info) { const struct cfg80211_pmsr_capabilities *capa = rdev->wiphy.pmsr_capa; struct nlattr *tb[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1]; u32 preamble = NL80211_PREAMBLE_DMG; /* only optional in DMG */ /* validate existing data */ if (!(rdev->wiphy.pmsr_capa->ftm.bandwidths & BIT(out->chandef.width))) { NL_SET_ERR_MSG(info->extack, "FTM: unsupported bandwidth"); return -EINVAL; } /* no validation needed - was already done via nested policy */ nla_parse_nested_deprecated(tb, NL80211_PMSR_FTM_REQ_ATTR_MAX, ftmreq, NULL, NULL); if (tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]) preamble = nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]); /* set up values - struct is 0-initialized */ out->ftm.requested = true; switch (out->chandef.chan->band) { case NL80211_BAND_60GHZ: /* optional */ break; default: if (!tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]) { NL_SET_ERR_MSG(info->extack, "FTM: must specify preamble"); return -EINVAL; } } if (!(capa->ftm.preambles & BIT(preamble))) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE], "FTM: invalid preamble"); return -EINVAL; } out->ftm.preamble = preamble; out->ftm.burst_period = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]) out->ftm.burst_period = nla_get_u16(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]); out->ftm.asap = !!tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP]; if (out->ftm.asap && !capa->ftm.asap) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP], "FTM: ASAP mode not supported"); return -EINVAL; } if (!out->ftm.asap && !capa->ftm.non_asap) { NL_SET_ERR_MSG(info->extack, "FTM: non-ASAP mode not supported"); return -EINVAL; } out->ftm.num_bursts_exp = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]) out->ftm.num_bursts_exp = nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]); if (capa->ftm.max_bursts_exponent >= 0 && out->ftm.num_bursts_exp > capa->ftm.max_bursts_exponent) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP], "FTM: max NUM_BURSTS_EXP must be set lower than the device limit"); return -EINVAL; } out->ftm.burst_duration = 15; if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]) out->ftm.burst_duration = nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]); out->ftm.ftms_per_burst = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]) out->ftm.ftms_per_burst = nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]); if (capa->ftm.max_ftms_per_burst && (out->ftm.ftms_per_burst > capa->ftm.max_ftms_per_burst || out->ftm.ftms_per_burst == 0)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST], "FTM: FTMs per burst must be set lower than the device limit but non-zero"); return -EINVAL; } out->ftm.ftmr_retries = 3; if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]) out->ftm.ftmr_retries = nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]); out->ftm.request_lci = !!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI]; if (out->ftm.request_lci && !capa->ftm.request_lci) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI], "FTM: LCI request not supported"); } out->ftm.request_civicloc = !!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC]; if (out->ftm.request_civicloc && !capa->ftm.request_civicloc) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC], "FTM: civic location request not supported"); } out->ftm.trigger_based = !!tb[NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED]; if (out->ftm.trigger_based && !capa->ftm.trigger_based) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED], "FTM: trigger based ranging is not supported"); return -EINVAL; } out->ftm.non_trigger_based = !!tb[NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED]; if (out->ftm.non_trigger_based && !capa->ftm.non_trigger_based) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED], "FTM: trigger based ranging is not supported"); return -EINVAL; } if (out->ftm.trigger_based && out->ftm.non_trigger_based) { NL_SET_ERR_MSG(info->extack, "FTM: can't set both trigger based and non trigger based"); return -EINVAL; } if (out->ftm.ftms_per_burst > 31 && !out->ftm.non_trigger_based && !out->ftm.trigger_based) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST], "FTM: FTMs per burst must be set lower than 31"); return -ERANGE; } if ((out->ftm.trigger_based || out->ftm.non_trigger_based) && out->ftm.preamble != NL80211_PREAMBLE_HE) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE], "FTM: non EDCA based ranging must use HE preamble"); return -EINVAL; } out->ftm.lmr_feedback = !!tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK]; if (!out->ftm.trigger_based && !out->ftm.non_trigger_based && out->ftm.lmr_feedback) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK], "FTM: LMR feedback set for EDCA based ranging"); return -EINVAL; } if (tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]) { if (!out->ftm.non_trigger_based && !out->ftm.trigger_based) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR], "FTM: BSS color set for EDCA based ranging"); return -EINVAL; } out->ftm.bss_color = nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]); } return 0; } static int pmsr_parse_peer(struct cfg80211_registered_device *rdev, struct nlattr *peer, struct cfg80211_pmsr_request_peer *out, struct genl_info *info) { struct nlattr *tb[NL80211_PMSR_PEER_ATTR_MAX + 1]; struct nlattr *req[NL80211_PMSR_REQ_ATTR_MAX + 1]; struct nlattr *treq; int err, rem; /* no validation needed - was already done via nested policy */ nla_parse_nested_deprecated(tb, NL80211_PMSR_PEER_ATTR_MAX, peer, NULL, NULL); if (!tb[NL80211_PMSR_PEER_ATTR_ADDR] || !tb[NL80211_PMSR_PEER_ATTR_CHAN] || !tb[NL80211_PMSR_PEER_ATTR_REQ]) { NL_SET_ERR_MSG_ATTR(info->extack, peer, "insufficient peer data"); return -EINVAL; } memcpy(out->addr, nla_data(tb[NL80211_PMSR_PEER_ATTR_ADDR]), ETH_ALEN); /* reuse info->attrs */ memset(info->attrs, 0, sizeof(*info->attrs) * (NL80211_ATTR_MAX + 1)); err = nla_parse_nested_deprecated(info->attrs, NL80211_ATTR_MAX, tb[NL80211_PMSR_PEER_ATTR_CHAN], NULL, info->extack); if (err) return err; err = nl80211_parse_chandef(rdev, info, &out->chandef); if (err) return err; /* no validation needed - was already done via nested policy */ nla_parse_nested_deprecated(req, NL80211_PMSR_REQ_ATTR_MAX, tb[NL80211_PMSR_PEER_ATTR_REQ], NULL, NULL); if (!req[NL80211_PMSR_REQ_ATTR_DATA]) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_PEER_ATTR_REQ], "missing request type/data"); return -EINVAL; } if (req[NL80211_PMSR_REQ_ATTR_GET_AP_TSF]) out->report_ap_tsf = true; if (out->report_ap_tsf && !rdev->wiphy.pmsr_capa->report_ap_tsf) { NL_SET_ERR_MSG_ATTR(info->extack, req[NL80211_PMSR_REQ_ATTR_GET_AP_TSF], "reporting AP TSF is not supported"); return -EINVAL; } nla_for_each_nested(treq, req[NL80211_PMSR_REQ_ATTR_DATA], rem) { switch (nla_type(treq)) { case NL80211_PMSR_TYPE_FTM: err = pmsr_parse_ftm(rdev, treq, out, info); break; default: NL_SET_ERR_MSG_ATTR(info->extack, treq, "unsupported measurement type"); err = -EINVAL; } } if (err) return err; return 0; } int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info) { struct nlattr *reqattr = info->attrs[NL80211_ATTR_PEER_MEASUREMENTS]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_pmsr_request *req; struct nlattr *peers, *peer; int count, rem, err, idx; if (!rdev->wiphy.pmsr_capa) return -EOPNOTSUPP; if (!reqattr) return -EINVAL; peers = nla_find(nla_data(reqattr), nla_len(reqattr), NL80211_PMSR_ATTR_PEERS); if (!peers) return -EINVAL; count = 0; nla_for_each_nested(peer, peers, rem) { count++; if (count > rdev->wiphy.pmsr_capa->max_peers) { NL_SET_ERR_MSG_ATTR(info->extack, peer, "Too many peers used"); return -EINVAL; } } req = kzalloc(struct_size(req, peers, count), GFP_KERNEL); if (!req) return -ENOMEM; req->n_peers = count; if (info->attrs[NL80211_ATTR_TIMEOUT]) req->timeout = nla_get_u32(info->attrs[NL80211_ATTR_TIMEOUT]); if (info->attrs[NL80211_ATTR_MAC]) { if (!rdev->wiphy.pmsr_capa->randomize_mac_addr) { NL_SET_ERR_MSG_ATTR(info->extack, info->attrs[NL80211_ATTR_MAC], "device cannot randomize MAC address"); err = -EINVAL; goto out_err; } err = nl80211_parse_random_mac(info->attrs, req->mac_addr, req->mac_addr_mask); if (err) goto out_err; } else { memcpy(req->mac_addr, wdev_address(wdev), ETH_ALEN); eth_broadcast_addr(req->mac_addr_mask); } idx = 0; nla_for_each_nested(peer, peers, rem) { /* NB: this reuses info->attrs, but we no longer need it */ err = pmsr_parse_peer(rdev, peer, &req->peers[idx], info); if (err) goto out_err; idx++; } req->cookie = cfg80211_assign_cookie(rdev); req->nl_portid = info->snd_portid; err = rdev_start_pmsr(rdev, wdev, req); if (err) goto out_err; list_add_tail(&req->list, &wdev->pmsr_list); nl_set_extack_cookie_u64(info->extack, req->cookie); return 0; out_err: kfree(req); return err; } void cfg80211_pmsr_complete(struct wireless_dev *wdev, struct cfg80211_pmsr_request *req, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_pmsr_request *tmp, *prev, *to_free = NULL; struct sk_buff *msg; void *hdr; trace_cfg80211_pmsr_complete(wdev->wiphy, wdev, req->cookie); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) goto free_request; hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PEER_MEASUREMENT_COMPLETE); if (!hdr) goto free_msg; if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD)) goto free_msg; if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->cookie, NL80211_ATTR_PAD)) goto free_msg; genlmsg_end(msg, hdr); genlmsg_unicast(wiphy_net(wdev->wiphy), msg, req->nl_portid); goto free_request; free_msg: nlmsg_free(msg); free_request: spin_lock_bh(&wdev->pmsr_lock); /* * cfg80211_pmsr_process_abort() may have already moved this request * to the free list, and will free it later. In this case, don't free * it here. */ list_for_each_entry_safe(tmp, prev, &wdev->pmsr_list, list) { if (tmp == req) { list_del(&req->list); to_free = req; break; } } spin_unlock_bh(&wdev->pmsr_lock); kfree(to_free); } EXPORT_SYMBOL_GPL(cfg80211_pmsr_complete); static int nl80211_pmsr_send_ftm_res(struct sk_buff *msg, struct cfg80211_pmsr_result *res) { if (res->status == NL80211_PMSR_STATUS_FAILURE) { if (nla_put_u32(msg, NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON, res->ftm.failure_reason)) goto error; if (res->ftm.failure_reason == NL80211_PMSR_FTM_FAILURE_PEER_BUSY && res->ftm.busy_retry_time && nla_put_u32(msg, NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME, res->ftm.busy_retry_time)) goto error; return 0; } #define PUT(tp, attr, val) \ do { \ if (nla_put_##tp(msg, \ NL80211_PMSR_FTM_RESP_ATTR_##attr, \ res->ftm.val)) \ goto error; \ } while (0) #define PUTOPT(tp, attr, val) \ do { \ if (res->ftm.val##_valid) \ PUT(tp, attr, val); \ } while (0) #define PUT_U64(attr, val) \ do { \ if (nla_put_u64_64bit(msg, \ NL80211_PMSR_FTM_RESP_ATTR_##attr,\ res->ftm.val, \ NL80211_PMSR_FTM_RESP_ATTR_PAD)) \ goto error; \ } while (0) #define PUTOPT_U64(attr, val) \ do { \ if (res->ftm.val##_valid) \ PUT_U64(attr, val); \ } while (0) if (res->ftm.burst_index >= 0) PUT(u32, BURST_INDEX, burst_index); PUTOPT(u32, NUM_FTMR_ATTEMPTS, num_ftmr_attempts); PUTOPT(u32, NUM_FTMR_SUCCESSES, num_ftmr_successes); PUT(u8, NUM_BURSTS_EXP, num_bursts_exp); PUT(u8, BURST_DURATION, burst_duration); PUT(u8, FTMS_PER_BURST, ftms_per_burst); PUTOPT(s32, RSSI_AVG, rssi_avg); PUTOPT(s32, RSSI_SPREAD, rssi_spread); if (res->ftm.tx_rate_valid && !nl80211_put_sta_rate(msg, &res->ftm.tx_rate, NL80211_PMSR_FTM_RESP_ATTR_TX_RATE)) goto error; if (res->ftm.rx_rate_valid && !nl80211_put_sta_rate(msg, &res->ftm.rx_rate, NL80211_PMSR_FTM_RESP_ATTR_RX_RATE)) goto error; PUTOPT_U64(RTT_AVG, rtt_avg); PUTOPT_U64(RTT_VARIANCE, rtt_variance); PUTOPT_U64(RTT_SPREAD, rtt_spread); PUTOPT_U64(DIST_AVG, dist_avg); PUTOPT_U64(DIST_VARIANCE, dist_variance); PUTOPT_U64(DIST_SPREAD, dist_spread); if (res->ftm.lci && res->ftm.lci_len && nla_put(msg, NL80211_PMSR_FTM_RESP_ATTR_LCI, res->ftm.lci_len, res->ftm.lci)) goto error; if (res->ftm.civicloc && res->ftm.civicloc_len && nla_put(msg, NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC, res->ftm.civicloc_len, res->ftm.civicloc)) goto error; #undef PUT #undef PUTOPT #undef PUT_U64 #undef PUTOPT_U64 return 0; error: return -ENOSPC; } static int nl80211_pmsr_send_result(struct sk_buff *msg, struct cfg80211_pmsr_result *res) { struct nlattr *pmsr, *peers, *peer, *resp, *data, *typedata; pmsr = nla_nest_start_noflag(msg, NL80211_ATTR_PEER_MEASUREMENTS); if (!pmsr) goto error; peers = nla_nest_start_noflag(msg, NL80211_PMSR_ATTR_PEERS); if (!peers) goto error; peer = nla_nest_start_noflag(msg, 1); if (!peer) goto error; if (nla_put(msg, NL80211_PMSR_PEER_ATTR_ADDR, ETH_ALEN, res->addr)) goto error; resp = nla_nest_start_noflag(msg, NL80211_PMSR_PEER_ATTR_RESP); if (!resp) goto error; if (nla_put_u32(msg, NL80211_PMSR_RESP_ATTR_STATUS, res->status) || nla_put_u64_64bit(msg, NL80211_PMSR_RESP_ATTR_HOST_TIME, res->host_time, NL80211_PMSR_RESP_ATTR_PAD)) goto error; if (res->ap_tsf_valid && nla_put_u64_64bit(msg, NL80211_PMSR_RESP_ATTR_AP_TSF, res->ap_tsf, NL80211_PMSR_RESP_ATTR_PAD)) goto error; if (res->final && nla_put_flag(msg, NL80211_PMSR_RESP_ATTR_FINAL)) goto error; data = nla_nest_start_noflag(msg, NL80211_PMSR_RESP_ATTR_DATA); if (!data) goto error; typedata = nla_nest_start_noflag(msg, res->type); if (!typedata) goto error; switch (res->type) { case NL80211_PMSR_TYPE_FTM: if (nl80211_pmsr_send_ftm_res(msg, res)) goto error; break; default: WARN_ON(1); } nla_nest_end(msg, typedata); nla_nest_end(msg, data); nla_nest_end(msg, resp); nla_nest_end(msg, peer); nla_nest_end(msg, peers); nla_nest_end(msg, pmsr); return 0; error: return -ENOSPC; } void cfg80211_pmsr_report(struct wireless_dev *wdev, struct cfg80211_pmsr_request *req, struct cfg80211_pmsr_result *result, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct sk_buff *msg; void *hdr; int err; trace_cfg80211_pmsr_report(wdev->wiphy, wdev, req->cookie, result->addr); /* * Currently, only variable items are LCI and civic location, * both of which are reasonably short so we don't need to * worry about them here for the allocation. */ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) return; hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PEER_MEASUREMENT_RESULT); if (!hdr) goto free; if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD)) goto free; if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->cookie, NL80211_ATTR_PAD)) goto free; err = nl80211_pmsr_send_result(msg, result); if (err) { pr_err_ratelimited("peer measurement result: message didn't fit!"); goto free; } genlmsg_end(msg, hdr); genlmsg_unicast(wiphy_net(wdev->wiphy), msg, req->nl_portid); return; free: nlmsg_free(msg); } EXPORT_SYMBOL_GPL(cfg80211_pmsr_report); static void cfg80211_pmsr_process_abort(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_pmsr_request *req, *tmp; LIST_HEAD(free_list); lockdep_assert_wiphy(wdev->wiphy); spin_lock_bh(&wdev->pmsr_lock); list_for_each_entry_safe(req, tmp, &wdev->pmsr_list, list) { if (req->nl_portid) continue; list_move_tail(&req->list, &free_list); } spin_unlock_bh(&wdev->pmsr_lock); list_for_each_entry_safe(req, tmp, &free_list, list) { rdev_abort_pmsr(rdev, wdev, req); kfree(req); } } void cfg80211_pmsr_free_wk(struct work_struct *work) { struct wireless_dev *wdev = container_of(work, struct wireless_dev, pmsr_free_wk); wiphy_lock(wdev->wiphy); cfg80211_pmsr_process_abort(wdev); wiphy_unlock(wdev->wiphy); } void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev) { struct cfg80211_pmsr_request *req; bool found = false; spin_lock_bh(&wdev->pmsr_lock); list_for_each_entry(req, &wdev->pmsr_list, list) { found = true; req->nl_portid = 0; } spin_unlock_bh(&wdev->pmsr_lock); if (found) cfg80211_pmsr_process_abort(wdev); WARN_ON(!list_empty(&wdev->pmsr_list)); } void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid) { struct cfg80211_pmsr_request *req; spin_lock_bh(&wdev->pmsr_lock); list_for_each_entry(req, &wdev->pmsr_list, list) { if (req->nl_portid == portid) { req->nl_portid = 0; schedule_work(&wdev->pmsr_free_wk); } } spin_unlock_bh(&wdev->pmsr_lock); }
30 30 28 30 30 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <linux/mutex.h> #include <net/sock.h> #include "nf_internals.h" /* Sockopts only registered and called from user context, so net locking would be overkill. Also, [gs]etsockopt calls may sleep. */ static DEFINE_MUTEX(nf_sockopt_mutex); static LIST_HEAD(nf_sockopts); /* Do exclusive ranges overlap? */ static inline int overlap(int min1, int max1, int min2, int max2) { return max1 > min2 && min1 < max2; } /* Functions to register sockopt ranges (exclusive). */ int nf_register_sockopt(struct nf_sockopt_ops *reg) { struct nf_sockopt_ops *ops; int ret = 0; mutex_lock(&nf_sockopt_mutex); list_for_each_entry(ops, &nf_sockopts, list) { if (ops->pf == reg->pf && (overlap(ops->set_optmin, ops->set_optmax, reg->set_optmin, reg->set_optmax) || overlap(ops->get_optmin, ops->get_optmax, reg->get_optmin, reg->get_optmax))) { pr_debug("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", ops->set_optmin, ops->set_optmax, ops->get_optmin, ops->get_optmax, reg->set_optmin, reg->set_optmax, reg->get_optmin, reg->get_optmax); ret = -EBUSY; goto out; } } list_add(&reg->list, &nf_sockopts); out: mutex_unlock(&nf_sockopt_mutex); return ret; } EXPORT_SYMBOL(nf_register_sockopt); void nf_unregister_sockopt(struct nf_sockopt_ops *reg) { mutex_lock(&nf_sockopt_mutex); list_del(&reg->list); mutex_unlock(&nf_sockopt_mutex); } EXPORT_SYMBOL(nf_unregister_sockopt); static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf, int val, int get) { struct nf_sockopt_ops *ops; mutex_lock(&nf_sockopt_mutex); list_for_each_entry(ops, &nf_sockopts, list) { if (ops->pf == pf) { if (!try_module_get(ops->owner)) goto out_nosup; if (get) { if (val >= ops->get_optmin && val < ops->get_optmax) goto out; } else { if (val >= ops->set_optmin && val < ops->set_optmax) goto out; } module_put(ops->owner); } } out_nosup: ops = ERR_PTR(-ENOPROTOOPT); out: mutex_unlock(&nf_sockopt_mutex); return ops; } int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, sockptr_t opt, unsigned int len) { struct nf_sockopt_ops *ops; int ret; ops = nf_sockopt_find(sk, pf, val, 0); if (IS_ERR(ops)) return PTR_ERR(ops); ret = ops->set(sk, val, opt, len); module_put(ops->owner); return ret; } EXPORT_SYMBOL(nf_setsockopt); int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int *len) { struct nf_sockopt_ops *ops; int ret; ops = nf_sockopt_find(sk, pf, val, 1); if (IS_ERR(ops)) return PTR_ERR(ops); ret = ops->get(sk, val, opt, len); module_put(ops->owner); return ret; } EXPORT_SYMBOL(nf_getsockopt);
4 916 1134 606 545 142 32 1137 1136 915 387 460 85 563 85 85 85 85 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NETFILTER_H #define __LINUX_NETFILTER_H #include <linux/init.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/if.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/wait.h> #include <linux/list.h> #include <linux/static_key.h> #include <linux/module.h> #include <linux/netfilter_defs.h> #include <linux/netdevice.h> #include <linux/sockptr.h> #include <net/net_namespace.h> static inline int NF_DROP_GETERR(int verdict) { return -(verdict >> NF_VERDICT_QBITS); } static __always_inline int NF_DROP_REASON(struct sk_buff *skb, enum skb_drop_reason reason, u32 err) { BUILD_BUG_ON(err > 0xffff); kfree_skb_reason(skb, reason); return ((err << 16) | NF_STOLEN); } static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ul2 = (const unsigned long *)a2; return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL; #else return a1->all[0] == a2->all[0] && a1->all[1] == a2->all[1] && a1->all[2] == a2->all[2] && a1->all[3] == a2->all[3]; #endif } static inline void nf_inet_addr_mask(const union nf_inet_addr *a1, union nf_inet_addr *result, const union nf_inet_addr *mask) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ua = (const unsigned long *)a1; unsigned long *ur = (unsigned long *)result; const unsigned long *um = (const unsigned long *)mask; ur[0] = ua[0] & um[0]; ur[1] = ua[1] & um[1]; #else result->all[0] = a1->all[0] & mask->all[0]; result->all[1] = a1->all[1] & mask->all[1]; result->all[2] = a1->all[2] & mask->all[2]; result->all[3] = a1->all[3] & mask->all[3]; #endif } int netfilter_init(void); struct sk_buff; struct nf_hook_ops; struct sock; struct nf_hook_state { u8 hook; u8 pf; struct net_device *in; struct net_device *out; struct sock *sk; struct net *net; int (*okfn)(struct net *, struct sock *, struct sk_buff *); }; typedef unsigned int nf_hookfn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); enum nf_hook_ops_type { NF_HOOK_OP_UNDEFINED, NF_HOOK_OP_NF_TABLES, NF_HOOK_OP_BPF, }; struct nf_hook_ops { /* User fills in from here down. */ nf_hookfn *hook; struct net_device *dev; void *priv; u8 pf; enum nf_hook_ops_type hook_ops_type:8; unsigned int hooknum; /* Hooks are ordered in ascending priority. */ int priority; }; struct nf_hook_entry { nf_hookfn *hook; void *priv; }; struct nf_hook_entries_rcu_head { struct rcu_head head; void *allocation; }; struct nf_hook_entries { u16 num_hook_entries; /* padding */ struct nf_hook_entry hooks[]; /* trailer: pointers to original orig_ops of each hook, * followed by rcu_head and scratch space used for freeing * the structure via call_rcu. * * This is not part of struct nf_hook_entry since its only * needed in slow path (hook register/unregister): * const struct nf_hook_ops *orig_ops[] * * For the same reason, we store this at end -- its * only needed when a hook is deleted, not during * packet path processing: * struct nf_hook_entries_rcu_head head */ }; #ifdef CONFIG_NETFILTER static inline struct nf_hook_ops **nf_hook_entries_get_hook_ops(const struct nf_hook_entries *e) { unsigned int n = e->num_hook_entries; const void *hook_end; hook_end = &e->hooks[n]; /* this is *past* ->hooks[]! */ return (struct nf_hook_ops **)hook_end; } static inline int nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb, struct nf_hook_state *state) { return entry->hook(entry->priv, skb, state); } static inline void nf_hook_state_init(struct nf_hook_state *p, unsigned int hook, u_int8_t pf, struct net_device *indev, struct net_device *outdev, struct sock *sk, struct net *net, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { p->hook = hook; p->pf = pf; p->in = indev; p->out = outdev; p->sk = sk; p->net = net; p->okfn = okfn; } struct nf_sockopt_ops { struct list_head list; u_int8_t pf; /* Non-inclusive ranges: use 0/0/NULL to never get called. */ int set_optmin; int set_optmax; int (*set)(struct sock *sk, int optval, sockptr_t arg, unsigned int len); int get_optmin; int get_optmax; int (*get)(struct sock *sk, int optval, void __user *user, int *len); /* Use the module struct to lock set/get code in place */ struct module *owner; }; /* Function to register/unregister hook points. */ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops); int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); /* Functions to register get/setsockopt ranges (non-inclusive). You need to check permissions yourself! */ int nf_register_sockopt(struct nf_sockopt_ops *reg); void nf_unregister_sockopt(struct nf_sockopt_ops *reg); #ifdef CONFIG_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #endif int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *e, unsigned int i); void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e); /** * nf_hook - call a netfilter hook * * Returns 1 if the hook has allowed the packet to pass. The function * okfn must be invoked by the caller in this case. Any other return * value indicates the packet has been consumed by the hook. */ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct nf_hook_entries *hook_head = NULL; int ret = 1; #ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && !static_key_false(&nf_hooks_needed[pf][hook])) return 1; #endif rcu_read_lock(); switch (pf) { case NFPROTO_IPV4: hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); break; case NFPROTO_IPV6: hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); break; case NFPROTO_ARP: #ifdef CONFIG_NETFILTER_FAMILY_ARP if (WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp))) break; hook_head = rcu_dereference(net->nf.hooks_arp[hook]); #endif break; case NFPROTO_BRIDGE: #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); #endif break; default: WARN_ON_ONCE(1); break; } if (hook_head) { struct nf_hook_state state; nf_hook_state_init(&state, hook, pf, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state, hook_head, 0); } rcu_read_unlock(); return ret; } /* Activate hook; either okfn or kfree_skb called, unless a hook returns NF_STOLEN (in which case, it's up to the hook to deal with the consequences). Returns -ERRNO if packet dropped. Zero means queued, stolen or accepted. */ /* RR: > I don't want nf_hook to return anything because people might forget > about async and trust the return value to mean "packet was ok". AK: Just document it clearly, then you can expect some sense from kernel coders :) */ static inline int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *), bool cond) { int ret; if (!cond || ((ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn)) == 1)) ret = okfn(net, sk, skb); return ret; } static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); if (ret == 1) ret = okfn(net, sk, skb); return ret; } static inline void NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct nf_hook_entries *hook_head = NULL; #ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && !static_key_false(&nf_hooks_needed[pf][hook])) return; #endif rcu_read_lock(); switch (pf) { case NFPROTO_IPV4: hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); break; case NFPROTO_IPV6: hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); break; default: WARN_ON_ONCE(1); break; } if (hook_head) { struct nf_hook_state state; nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn); nf_hook_slow_list(head, &state, hook_head); } rcu_read_unlock(); } /* Call setsockopt() */ int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, sockptr_t opt, unsigned int len); int nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int *len); struct flowi; struct nf_queue_entry; __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol, unsigned short family); __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u_int8_t protocol, unsigned short family); int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict, unsigned short family); #include <net/flow.h> struct nf_conn; enum nf_nat_manip_type; struct nlattr; struct nf_nat_hook { int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr); void (*decode_session)(struct sk_buff *skb, struct flowi *fl); void (*remove_nat_bysrc)(struct nf_conn *ct); }; extern const struct nf_nat_hook __rcu *nf_nat_hook; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { #if IS_ENABLED(CONFIG_NF_NAT) const struct nf_nat_hook *nat_hook; rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); if (nat_hook && nat_hook->decode_session) nat_hook->decode_session(skb, fl); rcu_read_unlock(); #endif } #else /* !CONFIG_NETFILTER */ static inline int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *), bool cond) { return okfn(net, sk, skb); } static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { return okfn(net, sk, skb); } static inline void NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { /* nothing to do */ } static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { return 1; } struct flowi; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { } #endif /*CONFIG_NETFILTER*/ #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <linux/netfilter/nf_conntrack_zones_common.h> void nf_ct_attach(struct sk_buff *, const struct sk_buff *); void nf_ct_set_closing(struct nf_conntrack *nfct); struct nf_conntrack_tuple; bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb); #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} static inline void nf_ct_set_closing(struct nf_conntrack *nfct) {} struct nf_conntrack_tuple; static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { return false; } #endif struct nf_conn; enum ip_conntrack_info; struct nf_ct_hook { int (*update)(struct net *net, struct sk_buff *skb); void (*destroy)(struct nf_conntrack *); bool (*get_tuple_skb)(struct nf_conntrack_tuple *, const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); void (*set_closing)(struct nf_conntrack *nfct); int (*confirm)(struct sk_buff *skb); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; struct nlattr; struct nfnl_ct_hook { size_t (*build_size)(const struct nf_conn *ct); int (*build)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u_int16_t ct_attr, u_int16_t ct_info_attr); int (*parse)(const struct nlattr *attr, struct nf_conn *ct); int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report); void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off); }; extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook; struct nf_defrag_hook { struct module *owner; int (*enable)(struct net *net); void (*disable)(struct net *net); }; extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook; extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook; /* * nf_skb_duplicated - TEE target has sent a packet * * When a xtables target sends a packet, the OUTPUT and POSTROUTING * hooks are traversed again, i.e. nft and xtables are invoked recursively. * * This is used by xtables TEE target to prevent the duplicated skb from * being duplicated again. */ DECLARE_PER_CPU(bool, nf_skb_duplicated); /* * Contains bitmask of ctnetlink event subscribers, if any. * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag. */ extern u8 nf_ctnetlink_has_listener; #endif /*__LINUX_NETFILTER_H*/
454 453 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 // SPDX-License-Identifier: GPL-2.0 /* * Wakeup statistics in sysfs * * Copyright (c) 2019 Linux Foundation * Copyright (c) 2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2019 Google Inc. */ #include <linux/device.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/slab.h> #include <linux/timekeeping.h> #include "power.h" static struct class *wakeup_class; #define wakeup_attr(_name) \ static ssize_t _name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct wakeup_source *ws = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lu\n", ws->_name); \ } \ static DEVICE_ATTR_RO(_name) wakeup_attr(active_count); wakeup_attr(event_count); wakeup_attr(wakeup_count); wakeup_attr(expire_count); static ssize_t active_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time = ws->active ? ktime_sub(ktime_get(), ws->last_time) : 0; return sysfs_emit(buf, "%lld\n", ktime_to_ms(active_time)); } static DEVICE_ATTR_RO(active_time_ms); static ssize_t total_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t total_time = ws->total_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); total_time = ktime_add(total_time, active_time); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(total_time)); } static DEVICE_ATTR_RO(total_time_ms); static ssize_t max_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t max_time = ws->max_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); if (active_time > max_time) max_time = active_time; } return sysfs_emit(buf, "%lld\n", ktime_to_ms(max_time)); } static DEVICE_ATTR_RO(max_time_ms); static ssize_t last_change_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%lld\n", ktime_to_ms(ws->last_time)); } static DEVICE_ATTR_RO(last_change_ms); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%s\n", ws->name); } static DEVICE_ATTR_RO(name); static ssize_t prevent_suspend_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t prevent_sleep_time = ws->prevent_sleep_time; if (ws->active && ws->autosleep_enabled) { prevent_sleep_time = ktime_add(prevent_sleep_time, ktime_sub(ktime_get(), ws->start_prevent_time)); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(prevent_sleep_time)); } static DEVICE_ATTR_RO(prevent_suspend_time_ms); static struct attribute *wakeup_source_attrs[] = { &dev_attr_name.attr, &dev_attr_active_count.attr, &dev_attr_event_count.attr, &dev_attr_wakeup_count.attr, &dev_attr_expire_count.attr, &dev_attr_active_time_ms.attr, &dev_attr_total_time_ms.attr, &dev_attr_max_time_ms.attr, &dev_attr_last_change_ms.attr, &dev_attr_prevent_suspend_time_ms.attr, NULL, }; ATTRIBUTE_GROUPS(wakeup_source); static void device_create_release(struct device *dev) { kfree(dev); } static struct device *wakeup_source_device_create(struct device *parent, struct wakeup_source *ws) { struct device *dev = NULL; int retval; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { retval = -ENOMEM; goto error; } device_initialize(dev); dev->devt = MKDEV(0, 0); dev->class = wakeup_class; dev->parent = parent; dev->groups = wakeup_source_groups; dev->release = device_create_release; dev_set_drvdata(dev, ws); device_set_pm_not_required(dev); retval = dev_set_name(dev, "wakeup%d", ws->id); if (retval) goto error; retval = device_add(dev); if (retval) goto error; return dev; error: put_device(dev); return ERR_PTR(retval); } /** * wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs. * @parent: Device given wakeup source is associated with (or NULL if virtual). * @ws: Wakeup source to be added in sysfs. */ int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws) { struct device *dev; dev = wakeup_source_device_create(parent, ws); if (IS_ERR(dev)) return PTR_ERR(dev); ws->dev = dev; return 0; } /** * pm_wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs * for a device if they're missing. * @parent: Device given wakeup source is associated with */ int pm_wakeup_source_sysfs_add(struct device *parent) { if (!parent->power.wakeup || parent->power.wakeup->dev) return 0; return wakeup_source_sysfs_add(parent, parent->power.wakeup); } /** * wakeup_source_sysfs_remove - Remove wakeup_source attributes from sysfs. * @ws: Wakeup source to be removed from sysfs. */ void wakeup_source_sysfs_remove(struct wakeup_source *ws) { device_unregister(ws->dev); } static int __init wakeup_sources_sysfs_init(void) { wakeup_class = class_create("wakeup"); return PTR_ERR_OR_ZERO(wakeup_class); } postcore_initcall(wakeup_sources_sysfs_init);
927 3835 6033 3836 3842 401 3837 225 6045 6042 225 3105 2538 726 3107 3733 3734 3726 3726 2503 5089 2217 2216 2213 94 23 115 116 952 955 145 15 15 15 15 4041 4041 3523 1708 266 169 328 169 398 3486 208 855 3334 399 399 2989 3003 399 4042 3835 3835 3835 3835 3840 3836 3839 500 423 201 3838 544 3841 3836 3841 3835 398 399 398 399 399 3836 197 195 198 197 153 74 76 67 26 73 26 72 72 72 75 75 75 500 503 500 503 503 501 2 2 3731 3730 3830 3837 2806 3840 2277 2622 2780 3837 3808 225 225 225 224 3839 3836 9 9 90 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2005 SGI, Christoph Lameter * Copyright (C) 2006 Nick Piggin * Copyright (C) 2012 Konstantin Khlebnikov * Copyright (C) 2016 Intel, Matthew Wilcox * Copyright (C) 2016 Intel, Ross Zwisler */ #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/cpu.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kmemleak.h> #include <linux/percpu.h> #include <linux/preempt.h> /* in_interrupt() */ #include <linux/radix-tree.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/xarray.h> #include "radix-tree.h" /* * Radix tree node cache. */ struct kmem_cache *radix_tree_node_cachep; /* * The radix tree is variable-height, so an insert operation not only has * to build the branch to its corresponding item, it also has to build the * branch to existing items if the size has to be increased (by * radix_tree_extend). * * The worst case is a zero height tree with just a single item at index 0, * and then inserting an item at index ULONG_MAX. This requires 2 new branches * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared. * Hence: */ #define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1) /* * The IDR does not have to be as high as the radix tree since it uses * signed integers, not unsigned longs. */ #define IDR_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(int) - 1) #define IDR_MAX_PATH (DIV_ROUND_UP(IDR_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) #define IDR_PRELOAD_SIZE (IDR_MAX_PATH * 2 - 1) /* * Per-cpu pool of preloaded nodes */ DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { .lock = INIT_LOCAL_LOCK(lock), }; EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads); static inline struct radix_tree_node *entry_to_node(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); } static inline void *node_to_entry(void *ptr) { return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); } #define RADIX_TREE_RETRY XA_RETRY_ENTRY static inline unsigned long get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot) { return parent ? slot - parent->slots : 0; } static unsigned int radix_tree_descend(const struct radix_tree_node *parent, struct radix_tree_node **nodep, unsigned long index) { unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK; void __rcu **entry = rcu_dereference_raw(parent->slots[offset]); *nodep = (void *)entry; return offset; } static inline gfp_t root_gfp_mask(const struct radix_tree_root *root) { return root->xa_flags & (__GFP_BITS_MASK & ~GFP_ZONEMASK); } static inline void tag_set(struct radix_tree_node *node, unsigned int tag, int offset) { __set_bit(offset, node->tags[tag]); } static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, int offset) { __clear_bit(offset, node->tags[tag]); } static inline int tag_get(const struct radix_tree_node *node, unsigned int tag, int offset) { return test_bit(offset, node->tags[tag]); } static inline void root_tag_set(struct radix_tree_root *root, unsigned tag) { root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag) { root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear_all(struct radix_tree_root *root) { root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1); } static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag) { return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT)); } static inline unsigned root_tags_get(const struct radix_tree_root *root) { return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT; } static inline bool is_idr(const struct radix_tree_root *root) { return !!(root->xa_flags & ROOT_IS_IDR); } /* * Returns 1 if any slot in the node has this tag set. * Otherwise returns 0. */ static inline int any_tag_set(const struct radix_tree_node *node, unsigned int tag) { unsigned idx; for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { if (node->tags[tag][idx]) return 1; } return 0; } static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag) { bitmap_fill(node->tags[tag], RADIX_TREE_MAP_SIZE); } /** * radix_tree_find_next_bit - find the next set bit in a memory region * * @node: where to begin the search * @tag: the tag index * @offset: the bitnumber to start searching at * * Unrollable variant of find_next_bit() for constant size arrays. * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero. * Returns next bit offset, or size if nothing found. */ static __always_inline unsigned long radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, unsigned long offset) { const unsigned long *addr = node->tags[tag]; if (offset < RADIX_TREE_MAP_SIZE) { unsigned long tmp; addr += offset / BITS_PER_LONG; tmp = *addr >> (offset % BITS_PER_LONG); if (tmp) return __ffs(tmp) + offset; offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); while (offset < RADIX_TREE_MAP_SIZE) { tmp = *++addr; if (tmp) return __ffs(tmp) + offset; offset += BITS_PER_LONG; } } return RADIX_TREE_MAP_SIZE; } static unsigned int iter_offset(const struct radix_tree_iter *iter) { return iter->index & RADIX_TREE_MAP_MASK; } /* * The maximum index which can be stored in a radix tree */ static inline unsigned long shift_maxindex(unsigned int shift) { return (RADIX_TREE_MAP_SIZE << shift) - 1; } static inline unsigned long node_maxindex(const struct radix_tree_node *node) { return shift_maxindex(node->shift); } static unsigned long next_index(unsigned long index, const struct radix_tree_node *node, unsigned long offset) { return (index & ~node_maxindex(node)) + (offset << node->shift); } /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the current CPU. */ static struct radix_tree_node * radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent, struct radix_tree_root *root, unsigned int shift, unsigned int offset, unsigned int count, unsigned int nr_values) { struct radix_tree_node *ret = NULL; /* * Preload code isn't irq safe and it doesn't make sense to use * preloading during an interrupt anyway as all the allocations have * to be atomic. So just do normal allocation when in interrupt. */ if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) { struct radix_tree_preload *rtp; /* * Even if the caller has preloaded, try to allocate from the * cache first for the new node to get accounted to the memory * cgroup. */ ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask | __GFP_NOWARN); if (ret) goto out; /* * Provided the caller has preloaded here, we will always * succeed in getting a node here (and never reach * kmem_cache_alloc) */ rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes; rtp->nodes = ret->parent; rtp->nr--; } /* * Update the allocation stack trace as this is more useful * for debugging. */ kmemleak_update_trace(ret); goto out; } ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); out: BUG_ON(radix_tree_is_internal_node(ret)); if (ret) { ret->shift = shift; ret->offset = offset; ret->count = count; ret->nr_values = nr_values; ret->parent = parent; ret->array = root; } return ret; } void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); /* * Must only free zeroed nodes into the slab. We can be left with * non-NULL entries by radix_tree_free_nodes, so clear the entries * and tags here. */ memset(node->slots, 0, sizeof(node->slots)); memset(node->tags, 0, sizeof(node->tags)); INIT_LIST_HEAD(&node->private_list); kmem_cache_free(radix_tree_node_cachep, node); } static inline void radix_tree_node_free(struct radix_tree_node *node) { call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On * success, return zero, with preemption disabled. On error, return -ENOMEM * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) { struct radix_tree_preload *rtp; struct radix_tree_node *node; int ret = -ENOMEM; /* * Nodes preloaded by one cgroup can be used by another cgroup, so * they should never be accounted to any particular memory cgroup. */ gfp_mask &= ~__GFP_ACCOUNT; local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); while (rtp->nr < nr) { local_unlock(&radix_tree_preloads.lock); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr < nr) { node->parent = rtp->nodes; rtp->nodes = node; rtp->nr++; } else { kmem_cache_free(radix_tree_node_cachep, node); } } ret = 0; out: return ret; } /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On * success, return zero, with preemption disabled. On error, return -ENOMEM * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ int radix_tree_preload(gfp_t gfp_mask) { /* Warn on non-sensical use... */ WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); } EXPORT_SYMBOL(radix_tree_preload); /* * The same as above function, except we don't guarantee preloading happens. * We do it, if we decide it helps. On success, return zero with preemption * disabled. On error, return -ENOMEM with preemption not disabled. */ int radix_tree_maybe_preload(gfp_t gfp_mask) { if (gfpflags_allow_blocking(gfp_mask)) return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); /* Preloading doesn't help anything with this gfp mask, skip it */ local_lock(&radix_tree_preloads.lock); return 0; } EXPORT_SYMBOL(radix_tree_maybe_preload); static unsigned radix_tree_load_root(const struct radix_tree_root *root, struct radix_tree_node **nodep, unsigned long *maxindex) { struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); *nodep = node; if (likely(radix_tree_is_internal_node(node))) { node = entry_to_node(node); *maxindex = node_maxindex(node); return node->shift + RADIX_TREE_MAP_SHIFT; } *maxindex = 0; return 0; } /* * Extend a radix tree so it can store key @index. */ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp, unsigned long index, unsigned int shift) { void *entry; unsigned int maxshift; int tag; /* Figure out what the shift should be. */ maxshift = shift; while (index > shift_maxindex(maxshift)) maxshift += RADIX_TREE_MAP_SHIFT; entry = rcu_dereference_raw(root->xa_head); if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE))) goto out; do { struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL, root, shift, 0, 1, 0); if (!node) return -ENOMEM; if (is_idr(root)) { all_tag_set(node, IDR_FREE); if (!root_tag_get(root, IDR_FREE)) { tag_clear(node, IDR_FREE, 0); root_tag_set(root, IDR_FREE); } } else { /* Propagate the aggregated tag info to the new child */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { if (root_tag_get(root, tag)) tag_set(node, tag, 0); } } BUG_ON(shift > BITS_PER_LONG); if (radix_tree_is_internal_node(entry)) { entry_to_node(entry)->parent = node; } else if (xa_is_value(entry)) { /* Moving a value entry root->xa_head to a node */ node->nr_values = 1; } /* * entry was already in the radix tree, so we do not need * rcu_assign_pointer here */ node->slots[0] = (void __rcu *)entry; entry = node_to_entry(node); rcu_assign_pointer(root->xa_head, entry); shift += RADIX_TREE_MAP_SHIFT; } while (shift <= maxshift); out: return maxshift + RADIX_TREE_MAP_SHIFT; } /** * radix_tree_shrink - shrink radix tree to minimum height * @root: radix tree root */ static inline bool radix_tree_shrink(struct radix_tree_root *root) { bool shrunk = false; for (;;) { struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); struct radix_tree_node *child; if (!radix_tree_is_internal_node(node)) break; node = entry_to_node(node); /* * The candidate node has more than one child, or its child * is not at the leftmost slot, we cannot shrink. */ if (node->count != 1) break; child = rcu_dereference_raw(node->slots[0]); if (!child) break; /* * For an IDR, we must not shrink entry 0 into the root in * case somebody calls idr_replace() with a pointer that * appears to be an internal entry */ if (!node->shift && is_idr(root)) break; if (radix_tree_is_internal_node(child)) entry_to_node(child)->parent = NULL; /* * We don't need rcu_assign_pointer(), since we are simply * moving the node from one part of the tree to another: if it * was safe to dereference the old pointer to it * (node->slots[0]), it will be safe to dereference the new * one (root->xa_head) as far as dependent read barriers go. */ root->xa_head = (void __rcu *)child; if (is_idr(root) && !tag_get(node, IDR_FREE, 0)) root_tag_clear(root, IDR_FREE); /* * We have a dilemma here. The node's slot[0] must not be * NULLed in case there are concurrent lookups expecting to * find the item. However if this was a bottom-level node, * then it may be subject to the slot pointer being visible * to callers dereferencing it. If item corresponding to * slot[0] is subsequently deleted, these callers would expect * their slot to become empty sooner or later. * * For example, lockless pagecache will look up a slot, deref * the page pointer, and if the page has 0 refcount it means it * was concurrently deleted from pagecache so try the deref * again. Fortunately there is already a requirement for logic * to retry the entire slot lookup -- the indirect pointer * problem (replacing direct root node with an indirect pointer * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ node->count = 0; if (!radix_tree_is_internal_node(child)) { node->slots[0] = (void __rcu *)RADIX_TREE_RETRY; } WARN_ON_ONCE(!list_empty(&node->private_list)); radix_tree_node_free(node); shrunk = true; } return shrunk; } static bool delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { bool deleted = false; do { struct radix_tree_node *parent; if (node->count) { if (node_to_entry(node) == rcu_dereference_raw(root->xa_head)) deleted |= radix_tree_shrink(root); return deleted; } parent = node->parent; if (parent) { parent->slots[node->offset] = NULL; parent->count--; } else { /* * Shouldn't the tags already have all been cleared * by the caller? */ if (!is_idr(root)) root_tag_clear_all(root); root->xa_head = NULL; } WARN_ON_ONCE(!list_empty(&node->private_list)); radix_tree_node_free(node); deleted = true; node = parent; } while (node); return deleted; } /** * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key * @nodep: returns node * @slotp: returns slot * * Create, if necessary, and return the node and slot for an item * at position @index in the radix tree @root. * * Until there is more than one item in the tree, no nodes are * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. * * Returns -ENOMEM, or 0 for success. */ static int __radix_tree_create(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp) { struct radix_tree_node *node = NULL, *child; void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex; unsigned int shift, offset = 0; unsigned long max = index; gfp_t gfp = root_gfp_mask(root); shift = radix_tree_load_root(root, &child, &maxindex); /* Make sure the tree is high enough. */ if (max > maxindex) { int error = radix_tree_extend(root, gfp, max, shift); if (error < 0) return error; shift = error; child = rcu_dereference_raw(root->xa_head); } while (shift > 0) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ child = radix_tree_node_alloc(gfp, node, root, shift, offset, 0, 0); if (!child) return -ENOMEM; rcu_assign_pointer(*slot, node_to_entry(child)); if (node) node->count++; } else if (!radix_tree_is_internal_node(child)) break; /* Go a level down */ node = entry_to_node(child); offset = radix_tree_descend(node, &child, index); slot = &node->slots[offset]; } if (nodep) *nodep = node; if (slotp) *slotp = slot; return 0; } /* * Free any nodes below this node. The tree is presumed to not need * shrinking, and any user data in the tree is presumed to not need a * destructor called on it. If we need to add a destructor, we can * add that functionality later. Note that we may not clear tags or * slots from the tree as an RCU walker may still have a pointer into * this subtree. We could replace the entries with RADIX_TREE_RETRY, * but we'll still have to clear those in rcu_free. */ static void radix_tree_free_nodes(struct radix_tree_node *node) { unsigned offset = 0; struct radix_tree_node *child = entry_to_node(node); for (;;) { void *entry = rcu_dereference_raw(child->slots[offset]); if (xa_is_node(entry) && child->shift) { child = entry_to_node(entry); offset = 0; continue; } offset++; while (offset == RADIX_TREE_MAP_SIZE) { struct radix_tree_node *old = child; offset = child->offset + 1; child = child->parent; WARN_ON_ONCE(!list_empty(&old->private_list)); radix_tree_node_free(old); if (old == entry_to_node(node)) return; } } } static inline int insert_entries(struct radix_tree_node *node, void __rcu **slot, void *item) { if (*slot) return -EEXIST; rcu_assign_pointer(*slot, item); if (node) { node->count++; if (xa_is_value(item)) node->nr_values++; } return 1; } /** * radix_tree_insert - insert into a radix tree * @root: radix tree root * @index: index key * @item: item to insert * * Insert an item into the radix tree at position @index. */ int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node; void __rcu **slot; int error; BUG_ON(radix_tree_is_internal_node(item)); error = __radix_tree_create(root, index, &node, &slot); if (error) return error; error = insert_entries(node, slot, item); if (error < 0) return error; if (node) { unsigned offset = get_slot_offset(node, slot); BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); } else { BUG_ON(root_tags_get(root)); } return 0; } EXPORT_SYMBOL(radix_tree_insert); /** * __radix_tree_lookup - lookup an item in a radix tree * @root: radix tree root * @index: index key * @nodep: returns node * @slotp: returns slot * * Lookup and return the item at position @index in the radix * tree @root. * * Until there is more than one item in the tree, no nodes are * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. */ void *__radix_tree_lookup(const struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp) { struct radix_tree_node *node, *parent; unsigned long maxindex; void __rcu **slot; restart: parent = NULL; slot = (void __rcu **)&root->xa_head; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); slot = parent->slots + offset; if (node == RADIX_TREE_RETRY) goto restart; if (parent->shift == 0) break; } if (nodep) *nodep = parent; if (slotp) *slotp = slot; return node; } /** * radix_tree_lookup_slot - lookup a slot in a radix tree * @root: radix tree root * @index: index key * * Returns: the slot corresponding to the position @index in the * radix tree @root. This is useful for update-if-exists operations. * * This function can be called under rcu_read_lock iff the slot is not * modified by radix_tree_replace_slot, otherwise it must be called * exclusive from other writers. Any dereference of the slot must be done * using radix_tree_deref_slot. */ void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root, unsigned long index) { void __rcu **slot; if (!__radix_tree_lookup(root, index, NULL, &slot)) return NULL; return slot; } EXPORT_SYMBOL(radix_tree_lookup_slot); /** * radix_tree_lookup - perform lookup operation on a radix tree * @root: radix tree root * @index: index key * * Lookup the item at the position @index in the radix tree @root. * * This function can be called under rcu_read_lock, however the caller * must manage lifetimes of leaf nodes (eg. RCU may also be used to free * them safely). No RCU barriers are required to access or modify the * returned item, however. */ void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index) { return __radix_tree_lookup(root, index, NULL, NULL); } EXPORT_SYMBOL(radix_tree_lookup); static void replace_slot(void __rcu **slot, void *item, struct radix_tree_node *node, int count, int values) { if (node && (count || values)) { node->count += count; node->nr_values += values; } rcu_assign_pointer(*slot, item); } static bool node_tag_get(const struct radix_tree_root *root, const struct radix_tree_node *node, unsigned int tag, unsigned int offset) { if (node) return tag_get(node, tag, offset); return root_tag_get(root, tag); } /* * IDR users want to be able to store NULL in the tree, so if the slot isn't * free, don't adjust the count, even if it's transitioning between NULL and * non-NULL. For the IDA, we mark slots as being IDR_FREE while they still * have empty bits, but it only stores NULL in slots when they're being * deleted. */ static int calculate_count(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item, void *old) { if (is_idr(root)) { unsigned offset = get_slot_offset(node, slot); bool free = node_tag_get(root, node, IDR_FREE, offset); if (!free) return 0; if (!old) return 1; } return !!item - !!old; } /** * __radix_tree_replace - replace item in a slot * @root: radix tree root * @node: pointer to tree node * @slot: pointer to slot in @node * @item: new item to store in the slot. * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. */ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item) { void *old = rcu_dereference_raw(*slot); int values = !!xa_is_value(item) - !!xa_is_value(old); int count = calculate_count(root, node, slot, item, old); /* * This function supports replacing value entries and * deleting entries, but that needs accounting against the * node unless the slot is root->xa_head. */ WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) && (count || values)); replace_slot(slot, item, node, count, values); if (!node) return; delete_node(root, node); } /** * radix_tree_replace_slot - replace item in a slot * @root: radix tree root * @slot: pointer to slot * @item: new item to store in the slot. * * For use with radix_tree_lookup_slot() and * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked * across slot lookup and replacement. * * NOTE: This cannot be used to switch between non-entries (empty slots), * regular entries, and value entries, as that requires accounting * inside the radix tree node. When switching from one type of entry or * deleting, use __radix_tree_lookup() and __radix_tree_replace() or * radix_tree_iter_replace(). */ void radix_tree_replace_slot(struct radix_tree_root *root, void __rcu **slot, void *item) { __radix_tree_replace(root, NULL, slot, item); } EXPORT_SYMBOL(radix_tree_replace_slot); /** * radix_tree_iter_replace - replace item in a slot * @root: radix tree root * @iter: iterator state * @slot: pointer to slot * @item: new item to store in the slot. * * For use with radix_tree_for_each_slot(). * Caller must hold tree write locked. */ void radix_tree_iter_replace(struct radix_tree_root *root, const struct radix_tree_iter *iter, void __rcu **slot, void *item) { __radix_tree_replace(root, iter->node, slot, item); } static void node_tag_set(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) { while (node) { if (tag_get(node, tag, offset)) return; tag_set(node, tag, offset); offset = node->offset; node = node->parent; } if (!root_tag_get(root, tag)) root_tag_set(root, tag); } /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index * * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. From * the root all the way down to the leaf node. * * Returns the address of the tagged item. Setting a tag on a not-present * item is a bug. */ void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; radix_tree_load_root(root, &node, &maxindex); BUG_ON(index > maxindex); while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); BUG_ON(!node); if (!tag_get(parent, tag, offset)) tag_set(parent, tag, offset); } /* set the root's tag bit */ if (!root_tag_get(root, tag)) root_tag_set(root, tag); return node; } EXPORT_SYMBOL(radix_tree_tag_set); static void node_tag_clear(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) { while (node) { if (!tag_get(node, tag, offset)) return; tag_clear(node, tag, offset); if (any_tag_set(node, tag)) return; offset = node->offset; node = node->parent; } /* clear the root's tag bit */ if (root_tag_get(root, tag)) root_tag_clear(root, tag); } /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index * * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. If this causes * the leaf node to have no tags set then clear the tag in the * next-to-leaf node, etc. * * Returns the address of the tagged item on success, else NULL. ie: * has the same return value and semantics as radix_tree_lookup(). */ void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; int offset = 0; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; parent = NULL; while (radix_tree_is_internal_node(node)) { parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); } if (node) node_tag_clear(root, parent, tag, offset); return node; } EXPORT_SYMBOL(radix_tree_tag_clear); /** * radix_tree_iter_tag_clear - clear a tag on the current iterator entry * @root: radix tree root * @iter: iterator state * @tag: tag to clear */ void radix_tree_iter_tag_clear(struct radix_tree_root *root, const struct radix_tree_iter *iter, unsigned int tag) { node_tag_clear(root, iter->node, tag, iter_offset(iter)); } /** * radix_tree_tag_get - get a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index (< RADIX_TREE_MAX_TAGS) * * Return values: * * 0: tag not present or not set * 1: tag set * * Note that the return value of this function may not be relied on, even if * the RCU lock is held, unless tag modification and node deletion are excluded * from concurrency. */ int radix_tree_tag_get(const struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; if (!root_tag_get(root, tag)) return 0; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return 0; while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); if (!tag_get(parent, tag, offset)) return 0; if (node == RADIX_TREE_RETRY) break; } return 1; } EXPORT_SYMBOL(radix_tree_tag_get); /* Construct iter->tags bit-mask from node->tags[tag] array */ static void set_iter_tags(struct radix_tree_iter *iter, struct radix_tree_node *node, unsigned offset, unsigned tag) { unsigned tag_long = offset / BITS_PER_LONG; unsigned tag_bit = offset % BITS_PER_LONG; if (!node) { iter->tags = 1; return; } iter->tags = node->tags[tag][tag_long] >> tag_bit; /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ if (tag_long < RADIX_TREE_TAG_LONGS - 1) { /* Pick tags from next element */ if (tag_bit) iter->tags |= node->tags[tag][tag_long + 1] << (BITS_PER_LONG - tag_bit); /* Clip chunk size, here only BITS_PER_LONG tags */ iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG); } } void __rcu **radix_tree_iter_resume(void __rcu **slot, struct radix_tree_iter *iter) { iter->index = __radix_tree_iter_add(iter, 1); iter->next_index = iter->index; iter->tags = 0; return NULL; } EXPORT_SYMBOL(radix_tree_iter_resume); /** * radix_tree_next_chunk - find next chunk of slots for iteration * * @root: radix tree root * @iter: iterator state * @flags: RADIX_TREE_ITER_* flags and tag index * Returns: pointer to chunk first slot, or NULL if iteration is over */ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned flags) { unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; struct radix_tree_node *node, *child; unsigned long index, offset, maxindex; if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) return NULL; /* * Catch next_index overflow after ~0UL. iter->index never overflows * during iterating; it can be zero only at the beginning. * And we cannot overflow iter->next_index in a single step, * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. * * This condition also used by radix_tree_next_slot() to stop * contiguous iterating, and forbid switching to the next chunk. */ index = iter->next_index; if (!index && iter->index) return NULL; restart: radix_tree_load_root(root, &child, &maxindex); if (index > maxindex) return NULL; if (!child) return NULL; if (!radix_tree_is_internal_node(child)) { /* Single-slot tree */ iter->index = index; iter->next_index = maxindex + 1; iter->tags = 1; iter->node = NULL; return (void __rcu **)&root->xa_head; } do { node = entry_to_node(child); offset = radix_tree_descend(node, &child, index); if ((flags & RADIX_TREE_ITER_TAGGED) ? !tag_get(node, tag, offset) : !child) { /* Hole detected */ if (flags & RADIX_TREE_ITER_CONTIG) return NULL; if (flags & RADIX_TREE_ITER_TAGGED) offset = radix_tree_find_next_bit(node, tag, offset + 1); else while (++offset < RADIX_TREE_MAP_SIZE) { void *slot = rcu_dereference_raw( node->slots[offset]); if (slot) break; } index &= ~node_maxindex(node); index += offset << node->shift; /* Overflow after ~0UL */ if (!index) return NULL; if (offset == RADIX_TREE_MAP_SIZE) goto restart; child = rcu_dereference_raw(node->slots[offset]); } if (!child) goto restart; if (child == RADIX_TREE_RETRY) break; } while (node->shift && radix_tree_is_internal_node(child)); /* Update the iterator state */ iter->index = (index &~ node_maxindex(node)) | offset; iter->next_index = (index | node_maxindex(node)) + 1; iter->node = node; if (flags & RADIX_TREE_ITER_TAGGED) set_iter_tags(iter, node, offset, tag); return node->slots + offset; } EXPORT_SYMBOL(radix_tree_next_chunk); /** * radix_tree_gang_lookup - perform multiple lookup on a radix tree * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * * Performs an index-ascending scan of the tree for present items. Places * them at *@results and returns the number of items which were placed at * *@results. * * The implementation is naive. * * Like radix_tree_lookup, radix_tree_gang_lookup may be called under * rcu_read_lock. In this case, rather than the returned results being * an atomic snapshot of the tree at a single point in time, the * semantics of an RCU protected gang lookup are as though multiple * radix_tree_lookups have been issued in individual locks, and results * stored in 'results'. */ unsigned int radix_tree_gang_lookup(const struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_slot(slot, root, &iter, first_index) { results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup); /** * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree * based on a tag * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * @tag: the tag index (< RADIX_TREE_MAX_TAGS) * * Performs an index-ascending scan of the tree for present items which * have the tag indexed by @tag set. Places the items at *@results and * returns the number of items which were placed at *@results. */ unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items, unsigned int tag) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag); /** * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a * radix tree based on a tag * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * @tag: the tag index (< RADIX_TREE_MAX_TAGS) * * Performs an index-ascending scan of the tree for present items which * have the tag indexed by @tag set. Places the slots at *@results and * returns the number of slots which were placed at *@results. */ unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root, void __rcu ***results, unsigned long first_index, unsigned int max_items, unsigned int tag) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { results[ret] = slot; if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); static bool __radix_tree_delete(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot) { void *old = rcu_dereference_raw(*slot); int values = xa_is_value(old) ? -1 : 0; unsigned offset = get_slot_offset(node, slot); int tag; if (is_idr(root)) node_tag_set(root, node, IDR_FREE, offset); else for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) node_tag_clear(root, node, tag, offset); replace_slot(slot, NULL, node, -1, values); return node && delete_node(root, node); } /** * radix_tree_iter_delete - delete the entry at this iterator position * @root: radix tree root * @iter: iterator state * @slot: pointer to slot * * Delete the entry at the position currently pointed to by the iterator. * This may result in the current node being freed; if it is, the iterator * is advanced so that it will not reference the freed memory. This * function may be called without any locking if there are no other threads * which can access this tree. */ void radix_tree_iter_delete(struct radix_tree_root *root, struct radix_tree_iter *iter, void __rcu **slot) { if (__radix_tree_delete(root, iter->node, slot)) iter->index = iter->next_index; } EXPORT_SYMBOL(radix_tree_iter_delete); /** * radix_tree_delete_item - delete an item from a radix tree * @root: radix tree root * @index: index key * @item: expected item * * Remove @item at @index from the radix tree rooted at @root. * * Return: the deleted entry, or %NULL if it was not present * or the entry at the given @index was not @item. */ void *radix_tree_delete_item(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node = NULL; void __rcu **slot = NULL; void *entry; entry = __radix_tree_lookup(root, index, &node, &slot); if (!slot) return NULL; if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE, get_slot_offset(node, slot)))) return NULL; if (item && entry != item) return NULL; __radix_tree_delete(root, node, slot); return entry; } EXPORT_SYMBOL(radix_tree_delete_item); /** * radix_tree_delete - delete an entry from a radix tree * @root: radix tree root * @index: index key * * Remove the entry at @index from the radix tree rooted at @root. * * Return: The deleted entry, or %NULL if it was not present. */ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { return radix_tree_delete_item(root, index, NULL); } EXPORT_SYMBOL(radix_tree_delete); /** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root * @tag: tag to test */ int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag) { return root_tag_get(root, tag); } EXPORT_SYMBOL(radix_tree_tagged); /** * idr_preload - preload for idr_alloc() * @gfp_mask: allocation mask to use for preloading * * Preallocate memory to use for the next call to idr_alloc(). This function * returns with preemption disabled. It will be enabled by idr_preload_end(). */ void idr_preload(gfp_t gfp_mask) { if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE)) local_lock(&radix_tree_preloads.lock); } EXPORT_SYMBOL(idr_preload); void __rcu **idr_get_free(struct radix_tree_root *root, struct radix_tree_iter *iter, gfp_t gfp, unsigned long max) { struct radix_tree_node *node = NULL, *child; void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex, start = iter->next_index; unsigned int shift, offset = 0; grow: shift = radix_tree_load_root(root, &child, &maxindex); if (!radix_tree_tagged(root, IDR_FREE)) start = max(start, maxindex + 1); if (start > max) return ERR_PTR(-ENOSPC); if (start > maxindex) { int error = radix_tree_extend(root, gfp, start, shift); if (error < 0) return ERR_PTR(error); shift = error; child = rcu_dereference_raw(root->xa_head); } if (start == 0 && shift == 0) shift = RADIX_TREE_MAP_SHIFT; while (shift) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ child = radix_tree_node_alloc(gfp, node, root, shift, offset, 0, 0); if (!child) return ERR_PTR(-ENOMEM); all_tag_set(child, IDR_FREE); rcu_assign_pointer(*slot, node_to_entry(child)); if (node) node->count++; } else if (!radix_tree_is_internal_node(child)) break; node = entry_to_node(child); offset = radix_tree_descend(node, &child, start); if (!tag_get(node, IDR_FREE, offset)) { offset = radix_tree_find_next_bit(node, IDR_FREE, offset + 1); start = next_index(start, node, offset); if (start > max || start == 0) return ERR_PTR(-ENOSPC); while (offset == RADIX_TREE_MAP_SIZE) { offset = node->offset + 1; node = node->parent; if (!node) goto grow; shift = node->shift; } child = rcu_dereference_raw(node->slots[offset]); } slot = &node->slots[offset]; } iter->index = start; if (node) iter->next_index = 1 + min(max, (start | node_maxindex(node))); else iter->next_index = 1; iter->node = node; set_iter_tags(iter, node, offset, IDR_FREE); return slot; } /** * idr_destroy - release all internal memory from an IDR * @idr: idr handle * * After this function is called, the IDR is empty, and may be reused or * the data structure containing it may be freed. * * A typical clean-up sequence for objects stored in an idr tree will use * idr_for_each() to free all objects, if necessary, then idr_destroy() to * free the memory used to keep track of those objects. */ void idr_destroy(struct idr *idr) { struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head); if (radix_tree_is_internal_node(node)) radix_tree_free_nodes(node); idr->idr_rt.xa_head = NULL; root_tag_set(&idr->idr_rt, IDR_FREE); } EXPORT_SYMBOL(idr_destroy); static void radix_tree_node_ctor(void *arg) { struct radix_tree_node *node = arg; memset(node, 0, sizeof(*node)); INIT_LIST_HEAD(&node->private_list); } static int radix_tree_cpu_dead(unsigned int cpu) { struct radix_tree_preload *rtp; struct radix_tree_node *node; /* Free per-cpu pool of preloaded nodes */ rtp = &per_cpu(radix_tree_preloads, cpu); while (rtp->nr) { node = rtp->nodes; rtp->nodes = node->parent; kmem_cache_free(radix_tree_node_cachep, node); rtp->nr--; } return 0; } void __init radix_tree_init(void) { int ret; BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK); BUILD_BUG_ON(XA_CHUNK_SIZE > 255); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead", NULL, radix_tree_cpu_dead); WARN_ON(ret < 0); }
89 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/dccp/timer.c * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/dccp.h> #include <linux/skbuff.h> #include <linux/export.h> #include "dccp.h" /* sysctl variables governing numbers of retransmission attempts */ int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES; int sysctl_dccp_retries1 __read_mostly = TCP_RETR1; int sysctl_dccp_retries2 __read_mostly = TCP_RETR2; static void dccp_write_err(struct sock *sk) { sk->sk_err = READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT; sk_error_report(sk); dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_done(sk); __DCCP_INC_STATS(DCCP_MIB_ABORTONTIMEOUT); } /* A write timeout has occurred. Process the after effects. */ static int dccp_write_timeout(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); int retry_until; if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { if (icsk->icsk_retransmits != 0) dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : sysctl_dccp_request_retries; } else { if (icsk->icsk_retransmits >= sysctl_dccp_retries1) { /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black hole detection. :-( It is place to make it. It is not made. I do not want to make it. It is disguisting. It does not work in any case. Let me to cite the same draft, which requires for us to implement this: "The one security concern raised by this memo is that ICMP black holes are often caused by over-zealous security administrators who block all ICMP messages. It is vitally important that those who design and deploy security systems understand the impact of strict filtering on upper-layer protocols. The safest web site in the world is worthless if most TCP implementations cannot transfer data from it. It would be far nicer to have all of the black holes fixed rather than fixing all of the TCP implementations." Golden words :-). */ dst_negative_advice(sk); } retry_until = sysctl_dccp_retries2; /* * FIXME: see tcp_write_timout and tcp_out_of_resources */ } if (icsk->icsk_retransmits >= retry_until) { /* Has it gone just too far? */ dccp_write_err(sk); return 1; } return 0; } /* * The DCCP retransmit timer. */ static void dccp_retransmit_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); /* * More than 4MSL (8 minutes) has passed, a RESET(aborted) was * sent, no need to retransmit, this sock is dead. */ if (dccp_write_timeout(sk)) return; /* * We want to know the number of packets retransmitted, not the * total number of retransmissions of clones of original packets. */ if (icsk->icsk_retransmits == 0) __DCCP_INC_STATS(DCCP_MIB_TIMEOUTS); if (dccp_retransmit_skb(sk) != 0) { /* * Retransmission failed because of local congestion, * do not backoff. */ if (--icsk->icsk_retransmits == 0) icsk->icsk_retransmits = 1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), DCCP_RTO_MAX); return; } icsk->icsk_backoff++; icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, DCCP_RTO_MAX); if (icsk->icsk_retransmits > sysctl_dccp_retries1) __sk_dst_reset(sk); } static void dccp_write_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; int event = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later */ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); goto out; } if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending) goto out; if (time_after(icsk->icsk_timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); goto out; } event = icsk->icsk_pending; icsk->icsk_pending = 0; switch (event) { case ICSK_TIME_RETRANS: dccp_retransmit_timer(sk); break; } out: bh_unlock_sock(sk); sock_put(sk); } static void dccp_keepalive_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); pr_err("dccp should not use a keepalive timer !\n"); sock_put(sk); } /* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ static void dccp_delack_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_delack_timer); struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); goto out; } if (sk->sk_state == DCCP_CLOSED || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) goto out; if (time_after(icsk->icsk_ack.timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); goto out; } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; if (inet_csk_ack_scheduled(sk)) { if (!inet_csk_in_pingpong_mode(sk)) { /* Delayed ACK missed: inflate ATO. */ icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, icsk->icsk_rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. */ inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } dccp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } out: bh_unlock_sock(sk); sock_put(sk); } /** * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface * @t: pointer to the tasklet associated with this handler * * See the comments above %ccid_dequeueing_decision for supported modes. */ static void dccp_write_xmitlet(struct tasklet_struct *t) { struct dccp_sock *dp = from_tasklet(dp, t, dccps_xmitlet); struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); else dccp_write_xmit(sk); bh_unlock_sock(sk); sock_put(sk); } static void dccp_write_xmit_timer(struct timer_list *t) { struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer); dccp_write_xmitlet(&dp->dccps_xmitlet); } void dccp_init_xmit_timers(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); tasklet_setup(&dp->dccps_xmitlet, dccp_write_xmitlet); timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); } static ktime_t dccp_timestamp_seed; /** * dccp_timestamp - 10s of microseconds time source * Returns the number of 10s of microseconds since loading DCCP. This is native * DCCP time difference format (RFC 4340, sec. 13). * Please note: This will wrap around about circa every 11.9 hours. */ u32 dccp_timestamp(void) { u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); do_div(delta, 10); return delta; } EXPORT_SYMBOL_GPL(dccp_timestamp); void __init dccp_timestamping_init(void) { dccp_timestamp_seed = ktime_get_real(); }
7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 // SPDX-License-Identifier: GPL-2.0 /* * devtmpfs - kernel-maintained tmpfs-based /dev * * Copyright (C) 2009, Kay Sievers <kay.sievers@vrfy.org> * * During bootup, before any driver core device is registered, * devtmpfs, a tmpfs-based filesystem is created. Every driver-core * device which requests a device node, will add a node in this * filesystem. * By default, all devices are named after the name of the device, * owned by root and have a default mode of 0600. Subsystems can * overwrite the default setting if needed. */ #define pr_fmt(fmt) "devtmpfs: " fmt #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/device.h> #include <linux/blkdev.h> #include <linux/namei.h> #include <linux/fs.h> #include <linux/shmem_fs.h> #include <linux/ramfs.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/kthread.h> #include <linux/init_syscalls.h> #include <uapi/linux/mount.h> #include "base.h" #ifdef CONFIG_DEVTMPFS_SAFE #define DEVTMPFS_MFLAGS (MS_SILENT | MS_NOEXEC | MS_NOSUID) #else #define DEVTMPFS_MFLAGS (MS_SILENT) #endif static struct task_struct *thread; static int __initdata mount_dev = IS_ENABLED(CONFIG_DEVTMPFS_MOUNT); static DEFINE_SPINLOCK(req_lock); static struct req { struct req *next; struct completion done; int err; const char *name; umode_t mode; /* 0 => delete */ kuid_t uid; kgid_t gid; struct device *dev; } *requests; static int __init mount_param(char *str) { mount_dev = simple_strtoul(str, NULL, 0); return 1; } __setup("devtmpfs.mount=", mount_param); static struct vfsmount *mnt; static struct dentry *public_dev_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct super_block *s = mnt->mnt_sb; int err; atomic_inc(&s->s_active); down_write(&s->s_umount); err = reconfigure_single(s, flags, data); if (err < 0) { deactivate_locked_super(s); return ERR_PTR(err); } return dget(s->s_root); } static struct file_system_type internal_fs_type = { .name = "devtmpfs", #ifdef CONFIG_TMPFS .init_fs_context = shmem_init_fs_context, #else .init_fs_context = ramfs_init_fs_context, #endif .kill_sb = kill_litter_super, }; static struct file_system_type dev_fs_type = { .name = "devtmpfs", .mount = public_dev_mount, }; static int devtmpfs_submit_req(struct req *req, const char *tmp) { init_completion(&req->done); spin_lock(&req_lock); req->next = requests; requests = req; spin_unlock(&req_lock); wake_up_process(thread); wait_for_completion(&req->done); kfree(tmp); return req->err; } int devtmpfs_create_node(struct device *dev) { const char *tmp = NULL; struct req req; if (!thread) return 0; req.mode = 0; req.uid = GLOBAL_ROOT_UID; req.gid = GLOBAL_ROOT_GID; req.name = device_get_devnode(dev, &req.mode, &req.uid, &req.gid, &tmp); if (!req.name) return -ENOMEM; if (req.mode == 0) req.mode = 0600; if (is_blockdev(dev)) req.mode |= S_IFBLK; else req.mode |= S_IFCHR; req.dev = dev; return devtmpfs_submit_req(&req, tmp); } int devtmpfs_delete_node(struct device *dev) { const char *tmp = NULL; struct req req; if (!thread) return 0; req.name = device_get_devnode(dev, NULL, NULL, NULL, &tmp); if (!req.name) return -ENOMEM; req.mode = 0; req.dev = dev; return devtmpfs_submit_req(&req, tmp); } static int dev_mkdir(const char *name, umode_t mode) { struct dentry *dentry; struct path path; int err; dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); err = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode); if (!err) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; done_path_create(&path, dentry); return err; } static int create_path(const char *nodepath) { char *path; char *s; int err = 0; /* parent directories do not exist, create them */ path = kstrdup(nodepath, GFP_KERNEL); if (!path) return -ENOMEM; s = path; for (;;) { s = strchr(s, '/'); if (!s) break; s[0] = '\0'; err = dev_mkdir(path, 0755); if (err && err != -EEXIST) break; s[0] = '/'; s++; } kfree(path); return err; } static int handle_create(const char *nodename, umode_t mode, kuid_t uid, kgid_t gid, struct device *dev) { struct dentry *dentry; struct path path; int err; dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); if (dentry == ERR_PTR(-ENOENT)) { create_path(nodename); dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); } if (IS_ERR(dentry)) return PTR_ERR(dentry); err = vfs_mknod(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, dev->devt); if (!err) { struct iattr newattrs; newattrs.ia_mode = mode; newattrs.ia_uid = uid; newattrs.ia_gid = gid; newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID; inode_lock(d_inode(dentry)); notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; } done_path_create(&path, dentry); return err; } static int dev_rmdir(const char *name) { struct path parent; struct dentry *dentry; int err; dentry = kern_path_locked(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_really_is_positive(dentry)) { if (d_inode(dentry)->i_private == &thread) err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry), dentry); else err = -EPERM; } else { err = -ENOENT; } dput(dentry); inode_unlock(d_inode(parent.dentry)); path_put(&parent); return err; } static int delete_path(const char *nodepath) { char *path; int err = 0; path = kstrdup(nodepath, GFP_KERNEL); if (!path) return -ENOMEM; for (;;) { char *base; base = strrchr(path, '/'); if (!base) break; base[0] = '\0'; err = dev_rmdir(path); if (err) break; } kfree(path); return err; } static int dev_mynode(struct device *dev, struct inode *inode, struct kstat *stat) { /* did we create it */ if (inode->i_private != &thread) return 0; /* does the dev_t match */ if (is_blockdev(dev)) { if (!S_ISBLK(stat->mode)) return 0; } else { if (!S_ISCHR(stat->mode)) return 0; } if (stat->rdev != dev->devt) return 0; /* ours */ return 1; } static int handle_remove(const char *nodename, struct device *dev) { struct path parent; struct dentry *dentry; int deleted = 0; int err; dentry = kern_path_locked(nodename, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_really_is_positive(dentry)) { struct kstat stat; struct path p = {.mnt = parent.mnt, .dentry = dentry}; err = vfs_getattr(&p, &stat, STATX_TYPE | STATX_MODE, AT_STATX_SYNC_AS_STAT); if (!err && dev_mynode(dev, d_inode(dentry), &stat)) { struct iattr newattrs; /* * before unlinking this node, reset permissions * of possible references like hardlinks */ newattrs.ia_uid = GLOBAL_ROOT_UID; newattrs.ia_gid = GLOBAL_ROOT_GID; newattrs.ia_mode = stat.mode & ~0777; newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; inode_lock(d_inode(dentry)); notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); err = vfs_unlink(&nop_mnt_idmap, d_inode(parent.dentry), dentry, NULL); if (!err || err == -ENOENT) deleted = 1; } } else { err = -ENOENT; } dput(dentry); inode_unlock(d_inode(parent.dentry)); path_put(&parent); if (deleted && strchr(nodename, '/')) delete_path(nodename); return err; } /* * If configured, or requested by the commandline, devtmpfs will be * auto-mounted after the kernel mounted the root filesystem. */ int __init devtmpfs_mount(void) { int err; if (!mount_dev) return 0; if (!thread) return 0; err = init_mount("devtmpfs", "dev", "devtmpfs", DEVTMPFS_MFLAGS, NULL); if (err) pr_info("error mounting %d\n", err); else pr_info("mounted\n"); return err; } static __initdata DECLARE_COMPLETION(setup_done); static int handle(const char *name, umode_t mode, kuid_t uid, kgid_t gid, struct device *dev) { if (mode) return handle_create(name, mode, uid, gid, dev); else return handle_remove(name, dev); } static void __noreturn devtmpfs_work_loop(void) { while (1) { spin_lock(&req_lock); while (requests) { struct req *req = requests; requests = NULL; spin_unlock(&req_lock); while (req) { struct req *next = req->next; req->err = handle(req->name, req->mode, req->uid, req->gid, req->dev); complete(&req->done); req = next; } spin_lock(&req_lock); } __set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&req_lock); schedule(); } } static noinline int __init devtmpfs_setup(void *p) { int err; err = ksys_unshare(CLONE_NEWNS); if (err) goto out; err = init_mount("devtmpfs", "/", "devtmpfs", DEVTMPFS_MFLAGS, NULL); if (err) goto out; init_chdir("/.."); /* will traverse into overmounted root */ init_chroot("."); out: *(int *)p = err; return err; } /* * The __ref is because devtmpfs_setup needs to be __init for the routines it * calls. That call is done while devtmpfs_init, which is marked __init, * synchronously waits for it to complete. */ static int __ref devtmpfsd(void *p) { int err = devtmpfs_setup(p); complete(&setup_done); if (err) return err; devtmpfs_work_loop(); return 0; } /* * Create devtmpfs instance, driver-core devices will add their device * nodes here. */ int __init devtmpfs_init(void) { char opts[] = "mode=0755"; int err; mnt = vfs_kern_mount(&internal_fs_type, 0, "devtmpfs", opts); if (IS_ERR(mnt)) { pr_err("unable to create devtmpfs %ld\n", PTR_ERR(mnt)); return PTR_ERR(mnt); } err = register_filesystem(&dev_fs_type); if (err) { pr_err("unable to register devtmpfs type %d\n", err); return err; } thread = kthread_run(devtmpfsd, &err, "kdevtmpfs"); if (!IS_ERR(thread)) { wait_for_completion(&setup_done); } else { err = PTR_ERR(thread); thread = NULL; } if (err) { pr_err("unable to create devtmpfs %d\n", err); unregister_filesystem(&dev_fs_type); thread = NULL; return err; } pr_info("initialized\n"); return 0; }
12 8 18 18 18 18 35 13 23 11 11 3 11 11 3 11 3 8 13 13 12 7 6 1 1 1 14 1 1 8 3 1 6 5 5 5 5 5 5 5 17 1 1 16 6 5 5 4 1 1 5 5 5 10 1 1 1 6 2 2 2 2 2 2 10 1 9 3 3 1 2 2 1 9 1 1 7 7 11 1 1 2 3 1 3 3 1 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/cls_api.c Packet classifier API. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Changes: * * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/idr.h> #include <linux/jhash.h> #include <linux/rculist.h> #include <linux/rhashtable.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_pedit.h> #include <net/tc_act/tc_mirred.h> #include <net/tc_act/tc_vlan.h> #include <net/tc_act/tc_tunnel_key.h> #include <net/tc_act/tc_csum.h> #include <net/tc_act/tc_gact.h> #include <net/tc_act/tc_police.h> #include <net/tc_act/tc_sample.h> #include <net/tc_act/tc_skbedit.h> #include <net/tc_act/tc_ct.h> #include <net/tc_act/tc_mpls.h> #include <net/tc_act/tc_gate.h> #include <net/flow_offload.h> #include <net/tc_wrapper.h> /* The list of all installed classifier types */ static LIST_HEAD(tcf_proto_base); /* Protects list of registered TC modules. It is pure SMP lock. */ static DEFINE_RWLOCK(cls_mod_lock); static struct xarray tcf_exts_miss_cookies_xa; struct tcf_exts_miss_cookie_node { const struct tcf_chain *chain; const struct tcf_proto *tp; const struct tcf_exts *exts; u32 chain_index; u32 tp_prio; u32 handle; u32 miss_cookie_base; struct rcu_head rcu; }; /* Each tc action entry cookie will be comprised of 32bit miss_cookie_base + * action index in the exts tc actions array. */ union tcf_exts_miss_cookie { struct { u32 miss_cookie_base; u32 act_index; }; u64 miss_cookie; }; #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static int tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp, u32 handle) { struct tcf_exts_miss_cookie_node *n; static u32 next; int err; if (WARN_ON(!handle || !tp->ops->get_exts)) return -EINVAL; n = kzalloc(sizeof(*n), GFP_KERNEL); if (!n) return -ENOMEM; n->chain_index = tp->chain->index; n->chain = tp->chain; n->tp_prio = tp->prio; n->tp = tp; n->exts = exts; n->handle = handle; err = xa_alloc_cyclic(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base, n, xa_limit_32b, &next, GFP_KERNEL); if (err) goto err_xa_alloc; exts->miss_cookie_node = n; return 0; err_xa_alloc: kfree(n); return err; } static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts) { struct tcf_exts_miss_cookie_node *n; if (!exts->miss_cookie_node) return; n = exts->miss_cookie_node; xa_erase(&tcf_exts_miss_cookies_xa, n->miss_cookie_base); kfree_rcu(n, rcu); } static struct tcf_exts_miss_cookie_node * tcf_exts_miss_cookie_lookup(u64 miss_cookie, int *act_index) { union tcf_exts_miss_cookie mc = { .miss_cookie = miss_cookie, }; *act_index = mc.act_index; return xa_load(&tcf_exts_miss_cookies_xa, mc.miss_cookie_base); } #else /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */ static int tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp, u32 handle) { return 0; } static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts) { } #endif /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */ static u64 tcf_exts_miss_cookie_get(u32 miss_cookie_base, int act_index) { union tcf_exts_miss_cookie mc = { .act_index = act_index, }; if (!miss_cookie_base) return 0; mc.miss_cookie_base = miss_cookie_base; return mc.miss_cookie; } #ifdef CONFIG_NET_CLS_ACT DEFINE_STATIC_KEY_FALSE(tc_skb_ext_tc); EXPORT_SYMBOL(tc_skb_ext_tc); void tc_skb_ext_tc_enable(void) { static_branch_inc(&tc_skb_ext_tc); } EXPORT_SYMBOL(tc_skb_ext_tc_enable); void tc_skb_ext_tc_disable(void) { static_branch_dec(&tc_skb_ext_tc); } EXPORT_SYMBOL(tc_skb_ext_tc_disable); #endif static u32 destroy_obj_hashfn(const struct tcf_proto *tp) { return jhash_3words(tp->chain->index, tp->prio, (__force __u32)tp->protocol, 0); } static void tcf_proto_signal_destroying(struct tcf_chain *chain, struct tcf_proto *tp) { struct tcf_block *block = chain->block; mutex_lock(&block->proto_destroy_lock); hash_add_rcu(block->proto_destroy_ht, &tp->destroy_ht_node, destroy_obj_hashfn(tp)); mutex_unlock(&block->proto_destroy_lock); } static bool tcf_proto_cmp(const struct tcf_proto *tp1, const struct tcf_proto *tp2) { return tp1->chain->index == tp2->chain->index && tp1->prio == tp2->prio && tp1->protocol == tp2->protocol; } static bool tcf_proto_exists_destroying(struct tcf_chain *chain, struct tcf_proto *tp) { u32 hash = destroy_obj_hashfn(tp); struct tcf_proto *iter; bool found = false; rcu_read_lock(); hash_for_each_possible_rcu(chain->block->proto_destroy_ht, iter, destroy_ht_node, hash) { if (tcf_proto_cmp(tp, iter)) { found = true; break; } } rcu_read_unlock(); return found; } static void tcf_proto_signal_destroyed(struct tcf_chain *chain, struct tcf_proto *tp) { struct tcf_block *block = chain->block; mutex_lock(&block->proto_destroy_lock); if (hash_hashed(&tp->destroy_ht_node)) hash_del_rcu(&tp->destroy_ht_node); mutex_unlock(&block->proto_destroy_lock); } /* Find classifier type by string name */ static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind) { const struct tcf_proto_ops *t, *res = NULL; if (kind) { read_lock(&cls_mod_lock); list_for_each_entry(t, &tcf_proto_base, head) { if (strcmp(kind, t->kind) == 0) { if (try_module_get(t->owner)) res = t; break; } } read_unlock(&cls_mod_lock); } return res; } static const struct tcf_proto_ops * tcf_proto_lookup_ops(const char *kind, bool rtnl_held, struct netlink_ext_ack *extack) { const struct tcf_proto_ops *ops; ops = __tcf_proto_lookup_ops(kind); if (ops) return ops; #ifdef CONFIG_MODULES if (rtnl_held) rtnl_unlock(); request_module(NET_CLS_ALIAS_PREFIX "%s", kind); if (rtnl_held) rtnl_lock(); ops = __tcf_proto_lookup_ops(kind); /* We dropped the RTNL semaphore in order to perform * the module load. So, even if we succeeded in loading * the module we have to replay the request. We indicate * this using -EAGAIN. */ if (ops) { module_put(ops->owner); return ERR_PTR(-EAGAIN); } #endif NL_SET_ERR_MSG(extack, "TC classifier not found"); return ERR_PTR(-ENOENT); } /* Register(unregister) new classifier type */ int register_tcf_proto_ops(struct tcf_proto_ops *ops) { struct tcf_proto_ops *t; int rc = -EEXIST; write_lock(&cls_mod_lock); list_for_each_entry(t, &tcf_proto_base, head) if (!strcmp(ops->kind, t->kind)) goto out; list_add_tail(&ops->head, &tcf_proto_base); rc = 0; out: write_unlock(&cls_mod_lock); return rc; } EXPORT_SYMBOL(register_tcf_proto_ops); static struct workqueue_struct *tc_filter_wq; void unregister_tcf_proto_ops(struct tcf_proto_ops *ops) { struct tcf_proto_ops *t; int rc = -ENOENT; /* Wait for outstanding call_rcu()s, if any, from a * tcf_proto_ops's destroy() handler. */ rcu_barrier(); flush_workqueue(tc_filter_wq); write_lock(&cls_mod_lock); list_for_each_entry(t, &tcf_proto_base, head) { if (t == ops) { list_del(&t->head); rc = 0; break; } } write_unlock(&cls_mod_lock); WARN(rc, "unregister tc filter kind(%s) failed %d\n", ops->kind, rc); } EXPORT_SYMBOL(unregister_tcf_proto_ops); bool tcf_queue_work(struct rcu_work *rwork, work_func_t func) { INIT_RCU_WORK(rwork, func); return queue_rcu_work(tc_filter_wq, rwork); } EXPORT_SYMBOL(tcf_queue_work); /* Select new prio value from the range, managed by kernel. */ static inline u32 tcf_auto_prio(struct tcf_proto *tp) { u32 first = TC_H_MAKE(0xC0000000U, 0U); if (tp) first = tp->prio - 1; return TC_H_MAJ(first); } static bool tcf_proto_check_kind(struct nlattr *kind, char *name) { if (kind) return nla_strscpy(name, kind, IFNAMSIZ) < 0; memset(name, 0, IFNAMSIZ); return false; } static bool tcf_proto_is_unlocked(const char *kind) { const struct tcf_proto_ops *ops; bool ret; if (strlen(kind) == 0) return false; ops = tcf_proto_lookup_ops(kind, false, NULL); /* On error return false to take rtnl lock. Proto lookup/create * functions will perform lookup again and properly handle errors. */ if (IS_ERR(ops)) return false; ret = !!(ops->flags & TCF_PROTO_OPS_DOIT_UNLOCKED); module_put(ops->owner); return ret; } static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, u32 prio, struct tcf_chain *chain, bool rtnl_held, struct netlink_ext_ack *extack) { struct tcf_proto *tp; int err; tp = kzalloc(sizeof(*tp), GFP_KERNEL); if (!tp) return ERR_PTR(-ENOBUFS); tp->ops = tcf_proto_lookup_ops(kind, rtnl_held, extack); if (IS_ERR(tp->ops)) { err = PTR_ERR(tp->ops); goto errout; } tp->classify = tp->ops->classify; tp->protocol = protocol; tp->prio = prio; tp->chain = chain; spin_lock_init(&tp->lock); refcount_set(&tp->refcnt, 1); err = tp->ops->init(tp); if (err) { module_put(tp->ops->owner); goto errout; } return tp; errout: kfree(tp); return ERR_PTR(err); } static void tcf_proto_get(struct tcf_proto *tp) { refcount_inc(&tp->refcnt); } static void tcf_maintain_bypass(struct tcf_block *block) { int filtercnt = atomic_read(&block->filtercnt); int skipswcnt = atomic_read(&block->skipswcnt); bool bypass_wanted = filtercnt > 0 && filtercnt == skipswcnt; if (bypass_wanted != block->bypass_wanted) { #ifdef CONFIG_NET_CLS_ACT if (bypass_wanted) static_branch_inc(&tcf_bypass_check_needed_key); else static_branch_dec(&tcf_bypass_check_needed_key); #endif block->bypass_wanted = bypass_wanted; } } static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted, bool add) { lockdep_assert_not_held(&block->cb_lock); down_write(&block->cb_lock); if (*counted != add) { if (add) { atomic_inc(&block->filtercnt); *counted = true; } else { atomic_dec(&block->filtercnt); *counted = false; } } tcf_maintain_bypass(block); up_write(&block->cb_lock); } static void tcf_chain_put(struct tcf_chain *chain); static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held, bool sig_destroy, struct netlink_ext_ack *extack) { tp->ops->destroy(tp, rtnl_held, extack); tcf_block_filter_cnt_update(tp->chain->block, &tp->counted, false); if (sig_destroy) tcf_proto_signal_destroyed(tp->chain, tp); tcf_chain_put(tp->chain); module_put(tp->ops->owner); kfree_rcu(tp, rcu); } static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { if (refcount_dec_and_test(&tp->refcnt)) tcf_proto_destroy(tp, rtnl_held, true, extack); } static bool tcf_proto_check_delete(struct tcf_proto *tp) { if (tp->ops->delete_empty) return tp->ops->delete_empty(tp); tp->deleting = true; return tp->deleting; } static void tcf_proto_mark_delete(struct tcf_proto *tp) { spin_lock(&tp->lock); tp->deleting = true; spin_unlock(&tp->lock); } static bool tcf_proto_is_deleting(struct tcf_proto *tp) { bool deleting; spin_lock(&tp->lock); deleting = tp->deleting; spin_unlock(&tp->lock); return deleting; } #define ASSERT_BLOCK_LOCKED(block) \ lockdep_assert_held(&(block)->lock) struct tcf_filter_chain_list_item { struct list_head list; tcf_chain_head_change_t *chain_head_change; void *chain_head_change_priv; }; static struct tcf_chain *tcf_chain_create(struct tcf_block *block, u32 chain_index) { struct tcf_chain *chain; ASSERT_BLOCK_LOCKED(block); chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (!chain) return NULL; list_add_tail_rcu(&chain->list, &block->chain_list); mutex_init(&chain->filter_chain_lock); chain->block = block; chain->index = chain_index; chain->refcnt = 1; if (!chain->index) block->chain0.chain = chain; return chain; } static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item, struct tcf_proto *tp_head) { if (item->chain_head_change) item->chain_head_change(tp_head, item->chain_head_change_priv); } static void tcf_chain0_head_change(struct tcf_chain *chain, struct tcf_proto *tp_head) { struct tcf_filter_chain_list_item *item; struct tcf_block *block = chain->block; if (chain->index) return; mutex_lock(&block->lock); list_for_each_entry(item, &block->chain0.filter_chain_list, list) tcf_chain_head_change_item(item, tp_head); mutex_unlock(&block->lock); } /* Returns true if block can be safely freed. */ static bool tcf_chain_detach(struct tcf_chain *chain) { struct tcf_block *block = chain->block; ASSERT_BLOCK_LOCKED(block); list_del_rcu(&chain->list); if (!chain->index) block->chain0.chain = NULL; if (list_empty(&block->chain_list) && refcount_read(&block->refcnt) == 0) return true; return false; } static void tcf_block_destroy(struct tcf_block *block) { mutex_destroy(&block->lock); mutex_destroy(&block->proto_destroy_lock); xa_destroy(&block->ports); kfree_rcu(block, rcu); } static void tcf_chain_destroy(struct tcf_chain *chain, bool free_block) { struct tcf_block *block = chain->block; mutex_destroy(&chain->filter_chain_lock); kfree_rcu(chain, rcu); if (free_block) tcf_block_destroy(block); } static void tcf_chain_hold(struct tcf_chain *chain) { ASSERT_BLOCK_LOCKED(chain->block); ++chain->refcnt; } static bool tcf_chain_held_by_acts_only(struct tcf_chain *chain) { ASSERT_BLOCK_LOCKED(chain->block); /* In case all the references are action references, this * chain should not be shown to the user. */ return chain->refcnt == chain->action_refcnt; } static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, u32 chain_index) { struct tcf_chain *chain; ASSERT_BLOCK_LOCKED(block); list_for_each_entry(chain, &block->chain_list, list) { if (chain->index == chain_index) return chain; } return NULL; } #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static struct tcf_chain *tcf_chain_lookup_rcu(const struct tcf_block *block, u32 chain_index) { struct tcf_chain *chain; list_for_each_entry_rcu(chain, &block->chain_list, list) { if (chain->index == chain_index) return chain; } return NULL; } #endif static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, u32 seq, u16 flags, int event, bool unicast, struct netlink_ext_ack *extack); static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create, bool by_act) { struct tcf_chain *chain = NULL; bool is_first_reference; mutex_lock(&block->lock); chain = tcf_chain_lookup(block, chain_index); if (chain) { tcf_chain_hold(chain); } else { if (!create) goto errout; chain = tcf_chain_create(block, chain_index); if (!chain) goto errout; } if (by_act) ++chain->action_refcnt; is_first_reference = chain->refcnt - chain->action_refcnt == 1; mutex_unlock(&block->lock); /* Send notification only in case we got the first * non-action reference. Until then, the chain acts only as * a placeholder for actions pointing to it and user ought * not know about them. */ if (is_first_reference && !by_act) tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, RTM_NEWCHAIN, false, NULL); return chain; errout: mutex_unlock(&block->lock); return chain; } static struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create) { return __tcf_chain_get(block, chain_index, create, false); } struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index) { return __tcf_chain_get(block, chain_index, true, true); } EXPORT_SYMBOL(tcf_chain_get_by_act); static void tc_chain_tmplt_del(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv); static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv, u32 chain_index, struct tcf_block *block, struct sk_buff *oskb, u32 seq, u16 flags); static void __tcf_chain_put(struct tcf_chain *chain, bool by_act, bool explicitly_created) { struct tcf_block *block = chain->block; const struct tcf_proto_ops *tmplt_ops; unsigned int refcnt, non_act_refcnt; bool free_block = false; void *tmplt_priv; mutex_lock(&block->lock); if (explicitly_created) { if (!chain->explicitly_created) { mutex_unlock(&block->lock); return; } chain->explicitly_created = false; } if (by_act) chain->action_refcnt--; /* tc_chain_notify_delete can't be called while holding block lock. * However, when block is unlocked chain can be changed concurrently, so * save these to temporary variables. */ refcnt = --chain->refcnt; non_act_refcnt = refcnt - chain->action_refcnt; tmplt_ops = chain->tmplt_ops; tmplt_priv = chain->tmplt_priv; if (non_act_refcnt == chain->explicitly_created && !by_act) { if (non_act_refcnt == 0) tc_chain_notify_delete(tmplt_ops, tmplt_priv, chain->index, block, NULL, 0, 0); /* Last reference to chain, no need to lock. */ chain->flushing = false; } if (refcnt == 0) free_block = tcf_chain_detach(chain); mutex_unlock(&block->lock); if (refcnt == 0) { tc_chain_tmplt_del(tmplt_ops, tmplt_priv); tcf_chain_destroy(chain, free_block); } } static void tcf_chain_put(struct tcf_chain *chain) { __tcf_chain_put(chain, false, false); } void tcf_chain_put_by_act(struct tcf_chain *chain) { __tcf_chain_put(chain, true, false); } EXPORT_SYMBOL(tcf_chain_put_by_act); static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) { __tcf_chain_put(chain, false, true); } static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held) { struct tcf_proto *tp, *tp_next; mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_dereference(chain->filter_chain, chain); while (tp) { tp_next = rcu_dereference_protected(tp->next, 1); tcf_proto_signal_destroying(chain, tp); tp = tp_next; } tp = tcf_chain_dereference(chain->filter_chain, chain); RCU_INIT_POINTER(chain->filter_chain, NULL); tcf_chain0_head_change(chain, NULL); chain->flushing = true; mutex_unlock(&chain->filter_chain_lock); while (tp) { tp_next = rcu_dereference_protected(tp->next, 1); tcf_proto_put(tp, rtnl_held, NULL); tp = tp_next; } } static int tcf_block_setup(struct tcf_block *block, struct flow_block_offload *bo); static void tcf_block_offload_init(struct flow_block_offload *bo, struct net_device *dev, struct Qdisc *sch, enum flow_block_command command, enum flow_block_binder_type binder_type, struct flow_block *flow_block, bool shared, struct netlink_ext_ack *extack) { bo->net = dev_net(dev); bo->command = command; bo->binder_type = binder_type; bo->block = flow_block; bo->block_shared = shared; bo->extack = extack; bo->sch = sch; bo->cb_list_head = &flow_block->cb_list; INIT_LIST_HEAD(&bo->cb_list); } static void tcf_block_unbind(struct tcf_block *block, struct flow_block_offload *bo); static void tc_block_indr_cleanup(struct flow_block_cb *block_cb) { struct tcf_block *block = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; struct Qdisc *sch = block_cb->indr.sch; struct netlink_ext_ack extack = {}; struct flow_block_offload bo = {}; tcf_block_offload_init(&bo, dev, sch, FLOW_BLOCK_UNBIND, block_cb->indr.binder_type, &block->flow_block, tcf_block_shared(block), &extack); rtnl_lock(); down_write(&block->cb_lock); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); tcf_block_unbind(block, &bo); up_write(&block->cb_lock); rtnl_unlock(); } static bool tcf_block_offload_in_use(struct tcf_block *block) { return atomic_read(&block->offloadcnt); } static int tcf_block_offload_cmd(struct tcf_block *block, struct net_device *dev, struct Qdisc *sch, struct tcf_block_ext_info *ei, enum flow_block_command command, struct netlink_ext_ack *extack) { struct flow_block_offload bo = {}; tcf_block_offload_init(&bo, dev, sch, command, ei->binder_type, &block->flow_block, tcf_block_shared(block), extack); if (dev->netdev_ops->ndo_setup_tc) { int err; err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) { if (err != -EOPNOTSUPP) NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); return err; } return tcf_block_setup(block, &bo); } flow_indr_dev_setup_offload(dev, sch, TC_SETUP_BLOCK, block, &bo, tc_block_indr_cleanup); tcf_block_setup(block, &bo); return -EOPNOTSUPP; } static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { struct net_device *dev = q->dev_queue->dev; int err; down_write(&block->cb_lock); /* If tc offload feature is disabled and the block we try to bind * to already has some offloaded filters, forbid to bind. */ if (dev->netdev_ops->ndo_setup_tc && !tc_can_offload(dev) && tcf_block_offload_in_use(block)) { NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled"); err = -EOPNOTSUPP; goto err_unlock; } err = tcf_block_offload_cmd(block, dev, q, ei, FLOW_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; if (err) goto err_unlock; up_write(&block->cb_lock); return 0; no_offload_dev_inc: if (tcf_block_offload_in_use(block)) goto err_unlock; err = 0; block->nooffloaddevcnt++; err_unlock: up_write(&block->cb_lock); return err; } static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { struct net_device *dev = q->dev_queue->dev; int err; down_write(&block->cb_lock); err = tcf_block_offload_cmd(block, dev, q, ei, FLOW_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; up_write(&block->cb_lock); return; no_offload_dev_dec: WARN_ON(block->nooffloaddevcnt-- == 0); up_write(&block->cb_lock); } static int tcf_chain0_head_change_cb_add(struct tcf_block *block, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { struct tcf_filter_chain_list_item *item; struct tcf_chain *chain0; item = kmalloc(sizeof(*item), GFP_KERNEL); if (!item) { NL_SET_ERR_MSG(extack, "Memory allocation for head change callback item failed"); return -ENOMEM; } item->chain_head_change = ei->chain_head_change; item->chain_head_change_priv = ei->chain_head_change_priv; mutex_lock(&block->lock); chain0 = block->chain0.chain; if (chain0) tcf_chain_hold(chain0); else list_add(&item->list, &block->chain0.filter_chain_list); mutex_unlock(&block->lock); if (chain0) { struct tcf_proto *tp_head; mutex_lock(&chain0->filter_chain_lock); tp_head = tcf_chain_dereference(chain0->filter_chain, chain0); if (tp_head) tcf_chain_head_change_item(item, tp_head); mutex_lock(&block->lock); list_add(&item->list, &block->chain0.filter_chain_list); mutex_unlock(&block->lock); mutex_unlock(&chain0->filter_chain_lock); tcf_chain_put(chain0); } return 0; } static void tcf_chain0_head_change_cb_del(struct tcf_block *block, struct tcf_block_ext_info *ei) { struct tcf_filter_chain_list_item *item; mutex_lock(&block->lock); list_for_each_entry(item, &block->chain0.filter_chain_list, list) { if ((!ei->chain_head_change && !ei->chain_head_change_priv) || (item->chain_head_change == ei->chain_head_change && item->chain_head_change_priv == ei->chain_head_change_priv)) { if (block->chain0.chain) tcf_chain_head_change_item(item, NULL); list_del(&item->list); mutex_unlock(&block->lock); kfree(item); return; } } mutex_unlock(&block->lock); WARN_ON(1); } struct tcf_net { spinlock_t idr_lock; /* Protects idr */ struct idr idr; }; static unsigned int tcf_net_id; static int tcf_block_insert(struct tcf_block *block, struct net *net, struct netlink_ext_ack *extack) { struct tcf_net *tn = net_generic(net, tcf_net_id); int err; idr_preload(GFP_KERNEL); spin_lock(&tn->idr_lock); err = idr_alloc_u32(&tn->idr, block, &block->index, block->index, GFP_NOWAIT); spin_unlock(&tn->idr_lock); idr_preload_end(); return err; } static void tcf_block_remove(struct tcf_block *block, struct net *net) { struct tcf_net *tn = net_generic(net, tcf_net_id); spin_lock(&tn->idr_lock); idr_remove(&tn->idr, block->index); spin_unlock(&tn->idr_lock); } static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, u32 block_index, struct netlink_ext_ack *extack) { struct tcf_block *block; block = kzalloc(sizeof(*block), GFP_KERNEL); if (!block) { NL_SET_ERR_MSG(extack, "Memory allocation for block failed"); return ERR_PTR(-ENOMEM); } mutex_init(&block->lock); mutex_init(&block->proto_destroy_lock); init_rwsem(&block->cb_lock); flow_block_init(&block->flow_block); INIT_LIST_HEAD(&block->chain_list); INIT_LIST_HEAD(&block->owner_list); INIT_LIST_HEAD(&block->chain0.filter_chain_list); refcount_set(&block->refcnt, 1); block->net = net; block->index = block_index; xa_init(&block->ports); /* Don't store q pointer for blocks which are shared */ if (!tcf_block_shared(block)) block->q = q; return block; } struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index) { struct tcf_net *tn = net_generic(net, tcf_net_id); return idr_find(&tn->idr, block_index); } EXPORT_SYMBOL(tcf_block_lookup); static struct tcf_block *tcf_block_refcnt_get(struct net *net, u32 block_index) { struct tcf_block *block; rcu_read_lock(); block = tcf_block_lookup(net, block_index); if (block && !refcount_inc_not_zero(&block->refcnt)) block = NULL; rcu_read_unlock(); return block; } static struct tcf_chain * __tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain) { mutex_lock(&block->lock); if (chain) chain = list_is_last(&chain->list, &block->chain_list) ? NULL : list_next_entry(chain, list); else chain = list_first_entry_or_null(&block->chain_list, struct tcf_chain, list); /* skip all action-only chains */ while (chain && tcf_chain_held_by_acts_only(chain)) chain = list_is_last(&chain->list, &block->chain_list) ? NULL : list_next_entry(chain, list); if (chain) tcf_chain_hold(chain); mutex_unlock(&block->lock); return chain; } /* Function to be used by all clients that want to iterate over all chains on * block. It properly obtains block->lock and takes reference to chain before * returning it. Users of this function must be tolerant to concurrent chain * insertion/deletion or ensure that no concurrent chain modification is * possible. Note that all netlink dump callbacks cannot guarantee to provide * consistent dump because rtnl lock is released each time skb is filled with * data and sent to user-space. */ struct tcf_chain * tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain) { struct tcf_chain *chain_next = __tcf_get_next_chain(block, chain); if (chain) tcf_chain_put(chain); return chain_next; } EXPORT_SYMBOL(tcf_get_next_chain); static struct tcf_proto * __tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp) { u32 prio = 0; ASSERT_RTNL(); mutex_lock(&chain->filter_chain_lock); if (!tp) { tp = tcf_chain_dereference(chain->filter_chain, chain); } else if (tcf_proto_is_deleting(tp)) { /* 'deleting' flag is set and chain->filter_chain_lock was * unlocked, which means next pointer could be invalid. Restart * search. */ prio = tp->prio + 1; tp = tcf_chain_dereference(chain->filter_chain, chain); for (; tp; tp = tcf_chain_dereference(tp->next, chain)) if (!tp->deleting && tp->prio >= prio) break; } else { tp = tcf_chain_dereference(tp->next, chain); } if (tp) tcf_proto_get(tp); mutex_unlock(&chain->filter_chain_lock); return tp; } /* Function to be used by all clients that want to iterate over all tp's on * chain. Users of this function must be tolerant to concurrent tp * insertion/deletion or ensure that no concurrent chain modification is * possible. Note that all netlink dump callbacks cannot guarantee to provide * consistent dump because rtnl lock is released each time skb is filled with * data and sent to user-space. */ struct tcf_proto * tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp) { struct tcf_proto *tp_next = __tcf_get_next_proto(chain, tp); if (tp) tcf_proto_put(tp, true, NULL); return tp_next; } EXPORT_SYMBOL(tcf_get_next_proto); static void tcf_block_flush_all_chains(struct tcf_block *block, bool rtnl_held) { struct tcf_chain *chain; /* Last reference to block. At this point chains cannot be added or * removed concurrently. */ for (chain = tcf_get_next_chain(block, NULL); chain; chain = tcf_get_next_chain(block, chain)) { tcf_chain_put_explicitly_created(chain); tcf_chain_flush(chain, rtnl_held); } } /* Lookup Qdisc and increments its reference counter. * Set parent, if necessary. */ static int __tcf_qdisc_find(struct net *net, struct Qdisc **q, u32 *parent, int ifindex, bool rtnl_held, struct netlink_ext_ack *extack) { const struct Qdisc_class_ops *cops; struct net_device *dev; int err = 0; if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) return 0; rcu_read_lock(); /* Find link */ dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { rcu_read_unlock(); return -ENODEV; } /* Find qdisc */ if (!*parent) { *q = rcu_dereference(dev->qdisc); *parent = (*q)->handle; } else { *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent)); if (!*q) { NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); err = -EINVAL; goto errout_rcu; } } *q = qdisc_refcount_inc_nz(*q); if (!*q) { NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); err = -EINVAL; goto errout_rcu; } /* Is it classful? */ cops = (*q)->ops->cl_ops; if (!cops) { NL_SET_ERR_MSG(extack, "Qdisc not classful"); err = -EINVAL; goto errout_qdisc; } if (!cops->tcf_block) { NL_SET_ERR_MSG(extack, "Class doesn't support blocks"); err = -EOPNOTSUPP; goto errout_qdisc; } errout_rcu: /* At this point we know that qdisc is not noop_qdisc, * which means that qdisc holds a reference to net_device * and we hold a reference to qdisc, so it is safe to release * rcu read lock. */ rcu_read_unlock(); return err; errout_qdisc: rcu_read_unlock(); if (rtnl_held) qdisc_put(*q); else qdisc_put_unlocked(*q); *q = NULL; return err; } static int __tcf_qdisc_cl_find(struct Qdisc *q, u32 parent, unsigned long *cl, int ifindex, struct netlink_ext_ack *extack) { if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) return 0; /* Do we search for filter, attached to class? */ if (TC_H_MIN(parent)) { const struct Qdisc_class_ops *cops = q->ops->cl_ops; *cl = cops->find(q, parent); if (*cl == 0) { NL_SET_ERR_MSG(extack, "Specified class doesn't exist"); return -ENOENT; } } return 0; } static struct tcf_block *__tcf_block_find(struct net *net, struct Qdisc *q, unsigned long cl, int ifindex, u32 block_index, struct netlink_ext_ack *extack) { struct tcf_block *block; if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) { block = tcf_block_refcnt_get(net, block_index); if (!block) { NL_SET_ERR_MSG(extack, "Block of given index was not found"); return ERR_PTR(-EINVAL); } } else { const struct Qdisc_class_ops *cops = q->ops->cl_ops; block = cops->tcf_block(q, cl, extack); if (!block) return ERR_PTR(-EINVAL); if (tcf_block_shared(block)) { NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters"); return ERR_PTR(-EOPNOTSUPP); } /* Always take reference to block in order to support execution * of rules update path of cls API without rtnl lock. Caller * must release block when it is finished using it. 'if' block * of this conditional obtain reference to block by calling * tcf_block_refcnt_get(). */ refcount_inc(&block->refcnt); } return block; } static void __tcf_block_put(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei, bool rtnl_held) { if (refcount_dec_and_mutex_lock(&block->refcnt, &block->lock)) { /* Flushing/putting all chains will cause the block to be * deallocated when last chain is freed. However, if chain_list * is empty, block has to be manually deallocated. After block * reference counter reached 0, it is no longer possible to * increment it or add new chains to block. */ bool free_block = list_empty(&block->chain_list); mutex_unlock(&block->lock); if (tcf_block_shared(block)) tcf_block_remove(block, block->net); if (q) tcf_block_offload_unbind(block, q, ei); if (free_block) tcf_block_destroy(block); else tcf_block_flush_all_chains(block, rtnl_held); } else if (q) { tcf_block_offload_unbind(block, q, ei); } } static void tcf_block_refcnt_put(struct tcf_block *block, bool rtnl_held) { __tcf_block_put(block, NULL, NULL, rtnl_held); } /* Find tcf block. * Set q, parent, cl when appropriate. */ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, u32 *parent, unsigned long *cl, int ifindex, u32 block_index, struct netlink_ext_ack *extack) { struct tcf_block *block; int err = 0; ASSERT_RTNL(); err = __tcf_qdisc_find(net, q, parent, ifindex, true, extack); if (err) goto errout; err = __tcf_qdisc_cl_find(*q, *parent, cl, ifindex, extack); if (err) goto errout_qdisc; block = __tcf_block_find(net, *q, *cl, ifindex, block_index, extack); if (IS_ERR(block)) { err = PTR_ERR(block); goto errout_qdisc; } return block; errout_qdisc: if (*q) qdisc_put(*q); errout: *q = NULL; return ERR_PTR(err); } static void tcf_block_release(struct Qdisc *q, struct tcf_block *block, bool rtnl_held) { if (!IS_ERR_OR_NULL(block)) tcf_block_refcnt_put(block, rtnl_held); if (q) { if (rtnl_held) qdisc_put(q); else qdisc_put_unlocked(q); } } struct tcf_block_owner_item { struct list_head list; struct Qdisc *q; enum flow_block_binder_type binder_type; }; static void tcf_block_owner_netif_keep_dst(struct tcf_block *block, struct Qdisc *q, enum flow_block_binder_type binder_type) { if (block->keep_dst && binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS && binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) netif_keep_dst(qdisc_dev(q)); } void tcf_block_netif_keep_dst(struct tcf_block *block) { struct tcf_block_owner_item *item; block->keep_dst = true; list_for_each_entry(item, &block->owner_list, list) tcf_block_owner_netif_keep_dst(block, item->q, item->binder_type); } EXPORT_SYMBOL(tcf_block_netif_keep_dst); static int tcf_block_owner_add(struct tcf_block *block, struct Qdisc *q, enum flow_block_binder_type binder_type) { struct tcf_block_owner_item *item; item = kmalloc(sizeof(*item), GFP_KERNEL); if (!item) return -ENOMEM; item->q = q; item->binder_type = binder_type; list_add(&item->list, &block->owner_list); return 0; } static void tcf_block_owner_del(struct tcf_block *block, struct Qdisc *q, enum flow_block_binder_type binder_type) { struct tcf_block_owner_item *item; list_for_each_entry(item, &block->owner_list, list) { if (item->q == q && item->binder_type == binder_type) { list_del(&item->list); kfree(item); return; } } WARN_ON(1); } static bool tcf_block_tracks_dev(struct tcf_block *block, struct tcf_block_ext_info *ei) { return tcf_block_shared(block) && (ei->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS || ei->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS); } int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(q); struct net *net = qdisc_net(q); struct tcf_block *block = NULL; int err; if (ei->block_index) /* block_index not 0 means the shared block is requested */ block = tcf_block_refcnt_get(net, ei->block_index); if (!block) { block = tcf_block_create(net, q, ei->block_index, extack); if (IS_ERR(block)) return PTR_ERR(block); if (tcf_block_shared(block)) { err = tcf_block_insert(block, net, extack); if (err) goto err_block_insert; } } err = tcf_block_owner_add(block, q, ei->binder_type); if (err) goto err_block_owner_add; tcf_block_owner_netif_keep_dst(block, q, ei->binder_type); err = tcf_chain0_head_change_cb_add(block, ei, extack); if (err) goto err_chain0_head_change_cb_add; err = tcf_block_offload_bind(block, q, ei, extack); if (err) goto err_block_offload_bind; if (tcf_block_tracks_dev(block, ei)) { err = xa_insert(&block->ports, dev->ifindex, dev, GFP_KERNEL); if (err) { NL_SET_ERR_MSG(extack, "block dev insert failed"); goto err_dev_insert; } } *p_block = block; return 0; err_dev_insert: tcf_block_offload_unbind(block, q, ei); err_block_offload_bind: tcf_chain0_head_change_cb_del(block, ei); err_chain0_head_change_cb_add: tcf_block_owner_del(block, q, ei->binder_type); err_block_owner_add: err_block_insert: tcf_block_refcnt_put(block, true); return err; } EXPORT_SYMBOL(tcf_block_get_ext); static void tcf_chain_head_change_dflt(struct tcf_proto *tp_head, void *priv) { struct tcf_proto __rcu **p_filter_chain = priv; rcu_assign_pointer(*p_filter_chain, tp_head); } int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack) { struct tcf_block_ext_info ei = { .chain_head_change = tcf_chain_head_change_dflt, .chain_head_change_priv = p_filter_chain, }; WARN_ON(!p_filter_chain); return tcf_block_get_ext(p_block, q, &ei, extack); } EXPORT_SYMBOL(tcf_block_get); /* XXX: Standalone actions are not allowed to jump to any chain, and bound * actions should be all removed after flushing. */ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { struct net_device *dev = qdisc_dev(q); if (!block) return; if (tcf_block_tracks_dev(block, ei)) xa_erase(&block->ports, dev->ifindex); tcf_chain0_head_change_cb_del(block, ei); tcf_block_owner_del(block, q, ei->binder_type); __tcf_block_put(block, q, ei, true); } EXPORT_SYMBOL(tcf_block_put_ext); void tcf_block_put(struct tcf_block *block) { struct tcf_block_ext_info ei = {0, }; if (!block) return; tcf_block_put_ext(block, block->q, &ei); } EXPORT_SYMBOL(tcf_block_put); static int tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb, void *cb_priv, bool add, bool offload_in_use, struct netlink_ext_ack *extack) { struct tcf_chain *chain, *chain_prev; struct tcf_proto *tp, *tp_prev; int err; lockdep_assert_held(&block->cb_lock); for (chain = __tcf_get_next_chain(block, NULL); chain; chain_prev = chain, chain = __tcf_get_next_chain(block, chain), tcf_chain_put(chain_prev)) { if (chain->tmplt_ops && add) chain->tmplt_ops->tmplt_reoffload(chain, true, cb, cb_priv); for (tp = __tcf_get_next_proto(chain, NULL); tp; tp_prev = tp, tp = __tcf_get_next_proto(chain, tp), tcf_proto_put(tp_prev, true, NULL)) { if (tp->ops->reoffload) { err = tp->ops->reoffload(tp, add, cb, cb_priv, extack); if (err && add) goto err_playback_remove; } else if (add && offload_in_use) { err = -EOPNOTSUPP; NL_SET_ERR_MSG(extack, "Filter HW offload failed - classifier without re-offloading support"); goto err_playback_remove; } } if (chain->tmplt_ops && !add) chain->tmplt_ops->tmplt_reoffload(chain, false, cb, cb_priv); } return 0; err_playback_remove: tcf_proto_put(tp, true, NULL); tcf_chain_put(chain); tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use, extack); return err; } static int tcf_block_bind(struct tcf_block *block, struct flow_block_offload *bo) { struct flow_block_cb *block_cb, *next; int err, i = 0; lockdep_assert_held(&block->cb_lock); list_for_each_entry(block_cb, &bo->cb_list, list) { err = tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, true, tcf_block_offload_in_use(block), bo->extack); if (err) goto err_unroll; if (!bo->unlocked_driver_cb) block->lockeddevcnt++; i++; } list_splice(&bo->cb_list, &block->flow_block.cb_list); return 0; err_unroll: list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->driver_list); if (i-- > 0) { list_del(&block_cb->list); tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, false, tcf_block_offload_in_use(block), NULL); if (!bo->unlocked_driver_cb) block->lockeddevcnt--; } flow_block_cb_free(block_cb); } return err; } static void tcf_block_unbind(struct tcf_block *block, struct flow_block_offload *bo) { struct flow_block_cb *block_cb, *next; lockdep_assert_held(&block->cb_lock); list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, false, tcf_block_offload_in_use(block), NULL); list_del(&block_cb->list); flow_block_cb_free(block_cb); if (!bo->unlocked_driver_cb) block->lockeddevcnt--; } } static int tcf_block_setup(struct tcf_block *block, struct flow_block_offload *bo) { int err; switch (bo->command) { case FLOW_BLOCK_BIND: err = tcf_block_bind(block, bo); break; case FLOW_BLOCK_UNBIND: err = 0; tcf_block_unbind(block, bo); break; default: WARN_ON_ONCE(1); err = -EOPNOTSUPP; } return err; } /* Main classifier routine: scans classifier chain attached * to this qdisc, (optionally) tests for protocol and asks * specific classifiers. */ static inline int __tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, const struct tcf_proto *orig_tp, struct tcf_result *res, bool compat_mode, struct tcf_exts_miss_cookie_node *n, int act_index, u32 *last_executed_chain) { #ifdef CONFIG_NET_CLS_ACT const int max_reclassify_loop = 16; const struct tcf_proto *first_tp; int limit = 0; reclassify: #endif for (; tp; tp = rcu_dereference_bh(tp->next)) { __be16 protocol = skb_protocol(skb, false); int err = 0; if (n) { struct tcf_exts *exts; if (n->tp_prio != tp->prio) continue; /* We re-lookup the tp and chain based on index instead * of having hard refs and locks to them, so do a sanity * check if any of tp,chain,exts was replaced by the * time we got here with a cookie from hardware. */ if (unlikely(n->tp != tp || n->tp->chain != n->chain || !tp->ops->get_exts)) { tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_COOKIE_ERROR); return TC_ACT_SHOT; } exts = tp->ops->get_exts(tp, n->handle); if (unlikely(!exts || n->exts != exts)) { tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_COOKIE_ERROR); return TC_ACT_SHOT; } n = NULL; err = tcf_exts_exec_ex(skb, exts, act_index, res); } else { if (tp->protocol != protocol && tp->protocol != htons(ETH_P_ALL)) continue; err = tc_classify(skb, tp, res); } #ifdef CONFIG_NET_CLS_ACT if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) { first_tp = orig_tp; *last_executed_chain = first_tp->chain->index; goto reset; } else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) { first_tp = res->goto_tp; *last_executed_chain = err & TC_ACT_EXT_VAL_MASK; goto reset; } #endif if (err >= 0) return err; } if (unlikely(n)) { tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_COOKIE_ERROR); return TC_ACT_SHOT; } return TC_ACT_UNSPEC; /* signal: continue lookup */ #ifdef CONFIG_NET_CLS_ACT reset: if (unlikely(limit++ >= max_reclassify_loop)) { net_notice_ratelimited("%u: reclassify loop, rule prio %u, protocol %02x\n", tp->chain->block->index, tp->prio & 0xffff, ntohs(tp->protocol)); tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); return TC_ACT_SHOT; } tp = first_tp; goto reclassify; #endif } int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) u32 last_executed_chain = 0; return __tcf_classify(skb, tp, tp, res, compat_mode, NULL, 0, &last_executed_chain); #else u32 last_executed_chain = tp ? tp->chain->index : 0; struct tcf_exts_miss_cookie_node *n = NULL; const struct tcf_proto *orig_tp = tp; struct tc_skb_ext *ext; int act_index = 0; int ret; if (block) { ext = skb_ext_find(skb, TC_SKB_EXT); if (ext && (ext->chain || ext->act_miss)) { struct tcf_chain *fchain; u32 chain; if (ext->act_miss) { n = tcf_exts_miss_cookie_lookup(ext->act_miss_cookie, &act_index); if (!n) { tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_COOKIE_ERROR); return TC_ACT_SHOT; } chain = n->chain_index; } else { chain = ext->chain; } fchain = tcf_chain_lookup_rcu(block, chain); if (!fchain) { tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_CHAIN_NOTFOUND); return TC_ACT_SHOT; } /* Consume, so cloned/redirect skbs won't inherit ext */ skb_ext_del(skb, TC_SKB_EXT); tp = rcu_dereference_bh(fchain->filter_chain); last_executed_chain = fchain->index; } } ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, n, act_index, &last_executed_chain); if (tc_skb_ext_tc_enabled()) { /* If we missed on some chain */ if (ret == TC_ACT_UNSPEC && last_executed_chain) { struct tc_skb_cb *cb = tc_skb_cb(skb); ext = tc_skb_ext_alloc(skb); if (WARN_ON_ONCE(!ext)) { tcf_set_drop_reason(skb, SKB_DROP_REASON_NOMEM); return TC_ACT_SHOT; } ext->chain = last_executed_chain; ext->mru = cb->mru; ext->post_ct = cb->post_ct; ext->post_ct_snat = cb->post_ct_snat; ext->post_ct_dnat = cb->post_ct_dnat; ext->zone = cb->zone; } } return ret; #endif } EXPORT_SYMBOL(tcf_classify); struct tcf_chain_info { struct tcf_proto __rcu **pprev; struct tcf_proto __rcu *next; }; static struct tcf_proto *tcf_chain_tp_prev(struct tcf_chain *chain, struct tcf_chain_info *chain_info) { return tcf_chain_dereference(*chain_info->pprev, chain); } static int tcf_chain_tp_insert(struct tcf_chain *chain, struct tcf_chain_info *chain_info, struct tcf_proto *tp) { if (chain->flushing) return -EAGAIN; RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain, chain_info)); if (*chain_info->pprev == chain->filter_chain) tcf_chain0_head_change(chain, tp); tcf_proto_get(tp); rcu_assign_pointer(*chain_info->pprev, tp); return 0; } static void tcf_chain_tp_remove(struct tcf_chain *chain, struct tcf_chain_info *chain_info, struct tcf_proto *tp) { struct tcf_proto *next = tcf_chain_dereference(chain_info->next, chain); tcf_proto_mark_delete(tp); if (tp == chain->filter_chain) tcf_chain0_head_change(chain, next); RCU_INIT_POINTER(*chain_info->pprev, next); } static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, struct tcf_chain_info *chain_info, u32 protocol, u32 prio, bool prio_allocate, struct netlink_ext_ack *extack); /* Try to insert new proto. * If proto with specified priority already exists, free new proto * and return existing one. */ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain, struct tcf_proto *tp_new, u32 protocol, u32 prio, bool rtnl_held) { struct tcf_chain_info chain_info; struct tcf_proto *tp; int err = 0; mutex_lock(&chain->filter_chain_lock); if (tcf_proto_exists_destroying(chain, tp_new)) { mutex_unlock(&chain->filter_chain_lock); tcf_proto_destroy(tp_new, rtnl_held, false, NULL); return ERR_PTR(-EAGAIN); } tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false, NULL); if (!tp) err = tcf_chain_tp_insert(chain, &chain_info, tp_new); mutex_unlock(&chain->filter_chain_lock); if (tp) { tcf_proto_destroy(tp_new, rtnl_held, false, NULL); tp_new = tp; } else if (err) { tcf_proto_destroy(tp_new, rtnl_held, false, NULL); tp_new = ERR_PTR(err); } return tp_new; } static void tcf_chain_tp_delete_empty(struct tcf_chain *chain, struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct tcf_chain_info chain_info; struct tcf_proto *tp_iter; struct tcf_proto **pprev; struct tcf_proto *next; mutex_lock(&chain->filter_chain_lock); /* Atomically find and remove tp from chain. */ for (pprev = &chain->filter_chain; (tp_iter = tcf_chain_dereference(*pprev, chain)); pprev = &tp_iter->next) { if (tp_iter == tp) { chain_info.pprev = pprev; chain_info.next = tp_iter->next; WARN_ON(tp_iter->deleting); break; } } /* Verify that tp still exists and no new filters were inserted * concurrently. * Mark tp for deletion if it is empty. */ if (!tp_iter || !tcf_proto_check_delete(tp)) { mutex_unlock(&chain->filter_chain_lock); return; } tcf_proto_signal_destroying(chain, tp); next = tcf_chain_dereference(chain_info.next, chain); if (tp == chain->filter_chain) tcf_chain0_head_change(chain, next); RCU_INIT_POINTER(*chain_info.pprev, next); mutex_unlock(&chain->filter_chain_lock); tcf_proto_put(tp, rtnl_held, extack); } static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, struct tcf_chain_info *chain_info, u32 protocol, u32 prio, bool prio_allocate, struct netlink_ext_ack *extack) { struct tcf_proto **pprev; struct tcf_proto *tp; /* Check the chain for existence of proto-tcf with this priority */ for (pprev = &chain->filter_chain; (tp = tcf_chain_dereference(*pprev, chain)); pprev = &tp->next) { if (tp->prio >= prio) { if (tp->prio == prio) { if (prio_allocate) { NL_SET_ERR_MSG(extack, "Lowest ID from auto-alloc range already in use"); return ERR_PTR(-ENOSPC); } if (tp->protocol != protocol && protocol) { NL_SET_ERR_MSG(extack, "Protocol mismatch for filter with specified priority"); return ERR_PTR(-EINVAL); } } else { tp = NULL; } break; } } chain_info->pprev = pprev; if (tp) { chain_info->next = tp->next; tcf_proto_get(tp); } else { chain_info->next = NULL; } return tp; } static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, u32 portid, u32 seq, u16 flags, int event, bool terse_dump, bool rtnl_held, struct netlink_ext_ack *extack) { struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) goto out_nlmsg_trim; tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; if (q) { tcm->tcm_ifindex = qdisc_dev(q)->ifindex; tcm->tcm_parent = parent; } else { tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK; tcm->tcm_block_index = block->index; } tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) goto nla_put_failure; if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index)) goto nla_put_failure; if (!fh) { tcm->tcm_handle = 0; } else if (terse_dump) { if (tp->ops->terse_dump) { if (tp->ops->terse_dump(net, tp, fh, skb, tcm, rtnl_held) < 0) goto nla_put_failure; } else { goto cls_op_not_supp; } } else { if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0) goto nla_put_failure; } if (extack && extack->_msg && nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) goto nla_put_failure; nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; out_nlmsg_trim: nla_put_failure: cls_op_not_supp: nlmsg_trim(skb, b); return -1; } static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, int event, bool unicast, bool rtnl_held, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; int err = 0; if (!unicast && !rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event, false, rtnl_held, extack) <= 0) { kfree_skb(skb); return -EINVAL; } if (unicast) err = rtnl_unicast(skb, net, portid); else err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); return err; } static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; int err; if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return tp->ops->delete(tp, fh, last, rtnl_held, extack); skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, false, rtnl_held, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); kfree_skb(skb); return -EINVAL; } err = tp->ops->delete(tp, fh, last, rtnl_held, extack); if (err) { kfree_skb(skb); return err; } err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send filter delete notification"); return err; } static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, struct tcf_block *block, struct Qdisc *q, u32 parent, struct nlmsghdr *n, struct tcf_chain *chain, int event, struct netlink_ext_ack *extack) { struct tcf_proto *tp; for (tp = tcf_get_next_proto(chain, NULL); tp; tp = tcf_get_next_proto(chain, tp)) tfilter_notify(net, oskb, n, tp, block, q, parent, NULL, event, false, true, extack); } static void tfilter_put(struct tcf_proto *tp, void *fh) { if (tp->ops->put && fh) tp->ops->put(tp, fh); } static bool is_qdisc_ingress(__u32 classid) { return (TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS)); } static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; bool prio_allocate; u32 parent; u32 chain_index; struct Qdisc *q; struct tcf_chain_info chain_info; struct tcf_chain *chain; struct tcf_block *block; struct tcf_proto *tp; unsigned long cl; void *fh; int err; int tp_created; bool rtnl_held = false; u32 flags; replay: tp_created = 0; err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); prio = TC_H_MAJ(t->tcm_info); prio_allocate = false; parent = t->tcm_parent; tp = NULL; cl = 0; block = NULL; q = NULL; chain = NULL; flags = 0; if (prio == 0) { /* If no priority is provided by the user, * we allocate one. */ if (n->nlmsg_flags & NLM_F_CREATE) { prio = TC_H_MAKE(0x80000000U, 0U); prio_allocate = true; } else { NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero"); return -ENOENT; } } /* Find head of filter chain. */ err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack); if (err) return err; if (tcf_proto_check_kind(tca[TCA_KIND], name)) { NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); err = -EINVAL; goto errout; } /* Take rtnl mutex if rtnl_held was set to true on previous iteration, * block is shared (no qdisc found), qdisc is not unlocked, classifier * type is not specified, classifier is not unlocked. */ if (rtnl_held || (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack); if (err) goto errout; block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index, extack); if (IS_ERR(block)) { err = PTR_ERR(block); goto errout; } block->classid = parent; chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; goto errout; } chain = tcf_chain_get(block, chain_index, true); if (!chain) { NL_SET_ERR_MSG(extack, "Cannot create specified filter chain"); err = -ENOMEM; goto errout; } mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, prio_allocate, extack); if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout_locked; } if (tp == NULL) { struct tcf_proto *tp_new = NULL; if (chain->flushing) { err = -EAGAIN; goto errout_locked; } /* Proto-tcf does not exist, create new one */ if (tca[TCA_KIND] == NULL || !protocol) { NL_SET_ERR_MSG(extack, "Filter kind and protocol must be specified"); err = -EINVAL; goto errout_locked; } if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter"); err = -ENOENT; goto errout_locked; } if (prio_allocate) prio = tcf_auto_prio(tcf_chain_tp_prev(chain, &chain_info)); mutex_unlock(&chain->filter_chain_lock); tp_new = tcf_proto_create(name, protocol, prio, chain, rtnl_held, extack); if (IS_ERR(tp_new)) { err = PTR_ERR(tp_new); goto errout_tp; } tp_created = 1; tp = tcf_chain_tp_insert_unique(chain, tp_new, protocol, prio, rtnl_held); if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout_tp; } } else { mutex_unlock(&chain->filter_chain_lock); } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); err = -EINVAL; goto errout; } fh = tp->ops->get(tp, t->tcm_handle); if (!fh) { if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter"); err = -ENOENT; goto errout; } } else if (n->nlmsg_flags & NLM_F_EXCL) { tfilter_put(tp, fh); NL_SET_ERR_MSG(extack, "Filter already exists"); err = -EEXIST; goto errout; } if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) { tfilter_put(tp, fh); NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind"); err = -EINVAL; goto errout; } if (!(n->nlmsg_flags & NLM_F_CREATE)) flags |= TCA_ACT_FLAGS_REPLACE; if (!rtnl_held) flags |= TCA_ACT_FLAGS_NO_RTNL; if (is_qdisc_ingress(parent)) flags |= TCA_ACT_FLAGS_AT_INGRESS; err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, flags, extack); if (err == 0) { tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false, rtnl_held, extack); tfilter_put(tp, fh); tcf_block_filter_cnt_update(block, &tp->counted, true); /* q pointer is NULL for shared blocks */ if (q) q->flags &= ~TCQ_F_CAN_BYPASS; } errout: if (err && tp_created) tcf_chain_tp_delete_empty(chain, tp, rtnl_held, NULL); errout_tp: if (chain) { if (tp && !IS_ERR(tp)) tcf_proto_put(tp, rtnl_held, NULL); if (!tp_created) tcf_chain_put(chain); } tcf_block_release(q, block, rtnl_held); if (rtnl_held) rtnl_unlock(); if (err == -EAGAIN) { /* Take rtnl lock in case EAGAIN is caused by concurrent flush * of target chain. */ rtnl_held = true; /* Replay the request. */ goto replay; } return err; errout_locked: mutex_unlock(&chain->filter_chain_lock); goto errout; } static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; u32 parent; u32 chain_index; struct Qdisc *q = NULL; struct tcf_chain_info chain_info; struct tcf_chain *chain = NULL; struct tcf_block *block = NULL; struct tcf_proto *tp = NULL; unsigned long cl = 0; void *fh = NULL; int err; bool rtnl_held = false; err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); prio = TC_H_MAJ(t->tcm_info); parent = t->tcm_parent; if (prio == 0 && (protocol || t->tcm_handle || tca[TCA_KIND])) { NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set"); return -ENOENT; } /* Find head of filter chain. */ err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack); if (err) return err; if (tcf_proto_check_kind(tca[TCA_KIND], name)) { NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); err = -EINVAL; goto errout; } /* Take rtnl mutex if flushing whole chain, block is shared (no qdisc * found), qdisc is not unlocked, classifier type is not specified, * classifier is not unlocked. */ if (!prio || (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack); if (err) goto errout; block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index, extack); if (IS_ERR(block)) { err = PTR_ERR(block); goto errout; } chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; goto errout; } chain = tcf_chain_get(block, chain_index, false); if (!chain) { /* User requested flush on non-existent chain. Nothing to do, * so just return success. */ if (prio == 0) { err = 0; goto errout; } NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); err = -ENOENT; goto errout; } if (prio == 0) { tfilter_notify_chain(net, skb, block, q, parent, n, chain, RTM_DELTFILTER, extack); tcf_chain_flush(chain, rtnl_held); err = 0; goto errout; } mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false, extack); if (!tp) { err = -ENOENT; NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); goto errout_locked; } else if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout_locked; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); err = -EINVAL; goto errout_locked; } else if (t->tcm_handle == 0) { tcf_proto_signal_destroying(chain, tp); tcf_chain_tp_remove(chain, &chain_info, tp); mutex_unlock(&chain->filter_chain_lock); tcf_proto_put(tp, rtnl_held, NULL); tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_DELTFILTER, false, rtnl_held, extack); err = 0; goto errout; } mutex_unlock(&chain->filter_chain_lock); fh = tp->ops->get(tp, t->tcm_handle); if (!fh) { NL_SET_ERR_MSG(extack, "Specified filter handle not found"); err = -ENOENT; } else { bool last; err = tfilter_del_notify(net, skb, n, tp, block, q, parent, fh, &last, rtnl_held, extack); if (err) goto errout; if (last) tcf_chain_tp_delete_empty(chain, tp, rtnl_held, extack); } errout: if (chain) { if (tp && !IS_ERR(tp)) tcf_proto_put(tp, rtnl_held, NULL); tcf_chain_put(chain); } tcf_block_release(q, block, rtnl_held); if (rtnl_held) rtnl_unlock(); return err; errout_locked: mutex_unlock(&chain->filter_chain_lock); goto errout; } static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; u32 parent; u32 chain_index; struct Qdisc *q = NULL; struct tcf_chain_info chain_info; struct tcf_chain *chain = NULL; struct tcf_block *block = NULL; struct tcf_proto *tp = NULL; unsigned long cl = 0; void *fh = NULL; int err; bool rtnl_held = false; err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); prio = TC_H_MAJ(t->tcm_info); parent = t->tcm_parent; if (prio == 0) { NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero"); return -ENOENT; } /* Find head of filter chain. */ err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack); if (err) return err; if (tcf_proto_check_kind(tca[TCA_KIND], name)) { NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); err = -EINVAL; goto errout; } /* Take rtnl mutex if block is shared (no qdisc found), qdisc is not * unlocked, classifier type is not specified, classifier is not * unlocked. */ if ((q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack); if (err) goto errout; block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index, extack); if (IS_ERR(block)) { err = PTR_ERR(block); goto errout; } chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; goto errout; } chain = tcf_chain_get(block, chain_index, false); if (!chain) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); err = -EINVAL; goto errout; } mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false, extack); mutex_unlock(&chain->filter_chain_lock); if (!tp) { err = -ENOENT; NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); goto errout; } else if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); err = -EINVAL; goto errout; } fh = tp->ops->get(tp, t->tcm_handle); if (!fh) { NL_SET_ERR_MSG(extack, "Specified filter handle not found"); err = -ENOENT; } else { err = tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, true, rtnl_held, NULL); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send filter notify message"); } tfilter_put(tp, fh); errout: if (chain) { if (tp && !IS_ERR(tp)) tcf_proto_put(tp, rtnl_held, NULL); tcf_chain_put(chain); } tcf_block_release(q, block, rtnl_held); if (rtnl_held) rtnl_unlock(); return err; } struct tcf_dump_args { struct tcf_walker w; struct sk_buff *skb; struct netlink_callback *cb; struct tcf_block *block; struct Qdisc *q; u32 parent; bool terse_dump; }; static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) { struct tcf_dump_args *a = (void *)arg; struct net *net = sock_net(a->skb->sk); return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER, a->terse_dump, true, NULL); } static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, struct sk_buff *skb, struct netlink_callback *cb, long index_start, long *p_index, bool terse) { struct net *net = sock_net(skb->sk); struct tcf_block *block = chain->block; struct tcmsg *tcm = nlmsg_data(cb->nlh); struct tcf_proto *tp, *tp_prev; struct tcf_dump_args arg; for (tp = __tcf_get_next_proto(chain, NULL); tp; tp_prev = tp, tp = __tcf_get_next_proto(chain, tp), tcf_proto_put(tp_prev, true, NULL), (*p_index)++) { if (*p_index < index_start) continue; if (TC_H_MAJ(tcm->tcm_info) && TC_H_MAJ(tcm->tcm_info) != tp->prio) continue; if (TC_H_MIN(tcm->tcm_info) && TC_H_MIN(tcm->tcm_info) != tp->protocol) continue; if (*p_index > index_start) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (cb->args[1] == 0) { if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER, false, true, NULL) <= 0) goto errout; cb->args[1] = 1; } if (!tp->ops->walk) continue; arg.w.fn = tcf_node_dump; arg.skb = skb; arg.cb = cb; arg.block = block; arg.q = q; arg.parent = parent; arg.w.stop = 0; arg.w.skip = cb->args[1] - 1; arg.w.count = 0; arg.w.cookie = cb->args[2]; arg.terse_dump = terse; tp->ops->walk(tp, &arg.w, true); cb->args[2] = arg.w.cookie; cb->args[1] = arg.w.count + 1; if (arg.w.stop) goto errout; } return true; errout: tcf_proto_put(tp, true, NULL); return false; } static const struct nla_policy tcf_tfilter_dump_policy[TCA_MAX + 1] = { [TCA_CHAIN] = { .type = NLA_U32 }, [TCA_DUMP_FLAGS] = NLA_POLICY_BITFIELD32(TCA_DUMP_FLAGS_TERSE), }; /* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { struct tcf_chain *chain, *chain_prev; struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; struct Qdisc *q = NULL; struct tcf_block *block; struct tcmsg *tcm = nlmsg_data(cb->nlh); bool terse_dump = false; long index_start; long index; u32 parent; int err; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX, tcf_tfilter_dump_policy, cb->extack); if (err) return err; if (tca[TCA_DUMP_FLAGS]) { struct nla_bitfield32 flags = nla_get_bitfield32(tca[TCA_DUMP_FLAGS]); terse_dump = flags.value & TCA_DUMP_FLAGS_TERSE; } if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) goto out; /* If we work with block index, q is NULL and parent value * will never be used in the following code. The check * in tcf_fill_node prevents it. However, compiler does not * see that far, so set parent to zero to silence the warning * about parent being uninitialized. */ parent = 0; } else { const struct Qdisc_class_ops *cops; struct net_device *dev; unsigned long cl = 0; dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return skb->len; parent = tcm->tcm_parent; if (!parent) q = rtnl_dereference(dev->qdisc); else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); if (!q) goto out; cops = q->ops->cl_ops; if (!cops) goto out; if (!cops->tcf_block) goto out; if (TC_H_MIN(tcm->tcm_parent)) { cl = cops->find(q, tcm->tcm_parent); if (cl == 0) goto out; } block = cops->tcf_block(q, cl, NULL); if (!block) goto out; parent = block->classid; if (tcf_block_shared(block)) q = NULL; } index_start = cb->args[0]; index = 0; for (chain = __tcf_get_next_chain(block, NULL); chain; chain_prev = chain, chain = __tcf_get_next_chain(block, chain), tcf_chain_put(chain_prev)) { if (tca[TCA_CHAIN] && nla_get_u32(tca[TCA_CHAIN]) != chain->index) continue; if (!tcf_chain_dump(chain, q, parent, skb, cb, index_start, &index, terse_dump)) { tcf_chain_put(chain); err = -EMSGSIZE; break; } } if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) tcf_block_refcnt_put(block, true); cb->args[0] = index; out: /* If we did no progress, the error (EMSGSIZE) is real */ if (skb->len == 0 && err) return err; return skb->len; } static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv, u32 chain_index, struct net *net, struct sk_buff *skb, struct tcf_block *block, u32 portid, u32 seq, u16 flags, int event, struct netlink_ext_ack *extack) { unsigned char *b = skb_tail_pointer(skb); const struct tcf_proto_ops *ops; struct nlmsghdr *nlh; struct tcmsg *tcm; void *priv; ops = tmplt_ops; priv = tmplt_priv; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) goto out_nlmsg_trim; tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; tcm->tcm_handle = 0; if (block->q) { tcm->tcm_ifindex = qdisc_dev(block->q)->ifindex; tcm->tcm_parent = block->q->handle; } else { tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK; tcm->tcm_block_index = block->index; } if (nla_put_u32(skb, TCA_CHAIN, chain_index)) goto nla_put_failure; if (ops) { if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; if (ops->tmplt_dump(skb, net, priv) < 0) goto nla_put_failure; } if (extack && extack->_msg && nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) goto out_nlmsg_trim; nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; out_nlmsg_trim: nla_put_failure: nlmsg_trim(skb, b); return -EMSGSIZE; } static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, u32 seq, u16 flags, int event, bool unicast, struct netlink_ext_ack *extack) { u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; struct tcf_block *block = chain->block; struct net *net = block->net; struct sk_buff *skb; int err = 0; if (!unicast && !rtnl_notify_needed(net, flags, RTNLGRP_TC)) return 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv, chain->index, net, skb, block, portid, seq, flags, event, extack) <= 0) { kfree_skb(skb); return -EINVAL; } if (unicast) err = rtnl_unicast(skb, net, portid); else err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); return err; } static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv, u32 chain_index, struct tcf_block *block, struct sk_buff *oskb, u32 seq, u16 flags) { u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; struct net *net = block->net; struct sk_buff *skb; if (!rtnl_notify_needed(net, flags, RTNLGRP_TC)) return 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tc_chain_fill_node(tmplt_ops, tmplt_priv, chain_index, net, skb, block, portid, seq, flags, RTM_DELCHAIN, NULL) <= 0) { kfree_skb(skb); return -EINVAL; } return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); } static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, struct nlattr **tca, struct netlink_ext_ack *extack) { const struct tcf_proto_ops *ops; char name[IFNAMSIZ]; void *tmplt_priv; /* If kind is not set, user did not specify template. */ if (!tca[TCA_KIND]) return 0; if (tcf_proto_check_kind(tca[TCA_KIND], name)) { NL_SET_ERR_MSG(extack, "Specified TC chain template name too long"); return -EINVAL; } ops = tcf_proto_lookup_ops(name, true, extack); if (IS_ERR(ops)) return PTR_ERR(ops); if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump || !ops->tmplt_reoffload) { NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier"); module_put(ops->owner); return -EOPNOTSUPP; } tmplt_priv = ops->tmplt_create(net, chain, tca, extack); if (IS_ERR(tmplt_priv)) { module_put(ops->owner); return PTR_ERR(tmplt_priv); } chain->tmplt_ops = ops; chain->tmplt_priv = tmplt_priv; return 0; } static void tc_chain_tmplt_del(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv) { /* If template ops are set, no work to do for us. */ if (!tmplt_ops) return; tmplt_ops->tmplt_destroy(tmplt_priv); module_put(tmplt_ops->owner); } /* Add/delete/get a chain */ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; struct tcmsg *t; u32 parent; u32 chain_index; struct Qdisc *q; struct tcf_chain *chain; struct tcf_block *block; unsigned long cl; int err; replay: q = NULL; err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; t = nlmsg_data(n); parent = t->tcm_parent; cl = 0; block = tcf_block_find(net, &q, &parent, &cl, t->tcm_ifindex, t->tcm_block_index, extack); if (IS_ERR(block)) return PTR_ERR(block); chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; goto errout_block; } mutex_lock(&block->lock); chain = tcf_chain_lookup(block, chain_index); if (n->nlmsg_type == RTM_NEWCHAIN) { if (chain) { if (tcf_chain_held_by_acts_only(chain)) { /* The chain exists only because there is * some action referencing it. */ tcf_chain_hold(chain); } else { NL_SET_ERR_MSG(extack, "Filter chain already exists"); err = -EEXIST; goto errout_block_locked; } } else { if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); err = -ENOENT; goto errout_block_locked; } chain = tcf_chain_create(block, chain_index); if (!chain) { NL_SET_ERR_MSG(extack, "Failed to create filter chain"); err = -ENOMEM; goto errout_block_locked; } } } else { if (!chain || tcf_chain_held_by_acts_only(chain)) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); err = -EINVAL; goto errout_block_locked; } tcf_chain_hold(chain); } if (n->nlmsg_type == RTM_NEWCHAIN) { /* Modifying chain requires holding parent block lock. In case * the chain was successfully added, take a reference to the * chain. This ensures that an empty chain does not disappear at * the end of this function. */ tcf_chain_hold(chain); chain->explicitly_created = true; } mutex_unlock(&block->lock); switch (n->nlmsg_type) { case RTM_NEWCHAIN: err = tc_chain_tmplt_add(chain, net, tca, extack); if (err) { tcf_chain_put_explicitly_created(chain); goto errout; } tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, RTM_NEWCHAIN, false, extack); break; case RTM_DELCHAIN: tfilter_notify_chain(net, skb, block, q, parent, n, chain, RTM_DELTFILTER, extack); /* Flush the chain first as the user requested chain removal. */ tcf_chain_flush(chain, true); /* In case the chain was successfully deleted, put a reference * to the chain previously taken during addition. */ tcf_chain_put_explicitly_created(chain); break; case RTM_GETCHAIN: err = tc_chain_notify(chain, skb, n->nlmsg_seq, n->nlmsg_flags, n->nlmsg_type, true, extack); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send chain notify message"); break; default: err = -EOPNOTSUPP; NL_SET_ERR_MSG(extack, "Unsupported message type"); goto errout; } errout: tcf_chain_put(chain); errout_block: tcf_block_release(q, block, true); if (err == -EAGAIN) /* Replay the request. */ goto replay; return err; errout_block_locked: mutex_unlock(&block->lock); goto errout_block; } /* called with RTNL */ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; struct Qdisc *q = NULL; struct tcf_block *block; struct tcmsg *tcm = nlmsg_data(cb->nlh); struct tcf_chain *chain; long index_start; long index; int err; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, cb->extack); if (err) return err; if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) goto out; } else { const struct Qdisc_class_ops *cops; struct net_device *dev; unsigned long cl = 0; dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return skb->len; if (!tcm->tcm_parent) q = rtnl_dereference(dev->qdisc); else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); if (!q) goto out; cops = q->ops->cl_ops; if (!cops) goto out; if (!cops->tcf_block) goto out; if (TC_H_MIN(tcm->tcm_parent)) { cl = cops->find(q, tcm->tcm_parent); if (cl == 0) goto out; } block = cops->tcf_block(q, cl, NULL); if (!block) goto out; if (tcf_block_shared(block)) q = NULL; } index_start = cb->args[0]; index = 0; mutex_lock(&block->lock); list_for_each_entry(chain, &block->chain_list, list) { if ((tca[TCA_CHAIN] && nla_get_u32(tca[TCA_CHAIN]) != chain->index)) continue; if (index < index_start) { index++; continue; } if (tcf_chain_held_by_acts_only(chain)) continue; err = tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv, chain->index, net, skb, block, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWCHAIN, NULL); if (err <= 0) break; index++; } mutex_unlock(&block->lock); if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) tcf_block_refcnt_put(block, true); cb->args[0] = index; out: /* If we did no progress, the error (EMSGSIZE) is real */ if (skb->len == 0 && err) return err; return skb->len; } int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, int police, struct tcf_proto *tp, u32 handle, bool use_action_miss) { int err = 0; #ifdef CONFIG_NET_CLS_ACT exts->type = 0; exts->nr_actions = 0; exts->miss_cookie_node = NULL; /* Note: we do not own yet a reference on net. * This reference might be taken later from tcf_exts_get_net(). */ exts->net = net; exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *), GFP_KERNEL); if (!exts->actions) return -ENOMEM; #endif exts->action = action; exts->police = police; if (!use_action_miss) return 0; err = tcf_exts_miss_cookie_base_alloc(exts, tp, handle); if (err) goto err_miss_alloc; return 0; err_miss_alloc: tcf_exts_destroy(exts); #ifdef CONFIG_NET_CLS_ACT exts->actions = NULL; #endif return err; } EXPORT_SYMBOL(tcf_exts_init_ex); void tcf_exts_destroy(struct tcf_exts *exts) { tcf_exts_miss_cookie_base_destroy(exts); #ifdef CONFIG_NET_CLS_ACT if (exts->actions) { tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); kfree(exts->actions); } exts->nr_actions = 0; #endif } EXPORT_SYMBOL(tcf_exts_destroy); int tcf_exts_validate_ex(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack) { #ifdef CONFIG_NET_CLS_ACT { int init_res[TCA_ACT_MAX_PRIO] = {}; struct tc_action *act; size_t attr_size = 0; if (exts->police && tb[exts->police]) { struct tc_action_ops *a_o; flags |= TCA_ACT_FLAGS_POLICE | TCA_ACT_FLAGS_BIND; a_o = tc_action_load_ops(tb[exts->police], flags, extack); if (IS_ERR(a_o)) return PTR_ERR(a_o); act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, a_o, init_res, flags, extack); module_put(a_o->owner); if (IS_ERR(act)) return PTR_ERR(act); act->type = exts->type = TCA_OLD_COMPAT; exts->actions[0] = act; exts->nr_actions = 1; tcf_idr_insert_many(exts->actions, init_res); } else if (exts->action && tb[exts->action]) { int err; flags |= TCA_ACT_FLAGS_BIND; err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, exts->actions, init_res, &attr_size, flags, fl_flags, extack); if (err < 0) return err; exts->nr_actions = err; } } #else if ((exts->action && tb[exts->action]) || (exts->police && tb[exts->police])) { NL_SET_ERR_MSG(extack, "Classifier actions are not supported per compile options (CONFIG_NET_CLS_ACT)"); return -EOPNOTSUPP; } #endif return 0; } EXPORT_SYMBOL(tcf_exts_validate_ex); int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, struct netlink_ext_ack *extack) { return tcf_exts_validate_ex(net, tp, tb, rate_tlv, exts, flags, 0, extack); } EXPORT_SYMBOL(tcf_exts_validate); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src) { #ifdef CONFIG_NET_CLS_ACT struct tcf_exts old = *dst; *dst = *src; tcf_exts_destroy(&old); #endif } EXPORT_SYMBOL(tcf_exts_change); #ifdef CONFIG_NET_CLS_ACT static struct tc_action *tcf_exts_first_act(struct tcf_exts *exts) { if (exts->nr_actions == 0) return NULL; else return exts->actions[0]; } #endif int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT struct nlattr *nest; if (exts->action && tcf_exts_has_actions(exts)) { /* * again for backward compatible mode - we want * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */ if (exts->type != TCA_OLD_COMPAT) { nest = nla_nest_start_noflag(skb, exts->action); if (nest == NULL) goto nla_put_failure; if (tcf_action_dump(skb, exts->actions, 0, 0, false) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (exts->police) { struct tc_action *act = tcf_exts_first_act(exts); nest = nla_nest_start_noflag(skb, exts->police); if (nest == NULL || !act) goto nla_put_failure; if (tcf_action_dump_old(skb, act, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } } return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -1; #else return 0; #endif } EXPORT_SYMBOL(tcf_exts_dump); int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT struct nlattr *nest; if (!exts->action || !tcf_exts_has_actions(exts)) return 0; nest = nla_nest_start_noflag(skb, exts->action); if (!nest) goto nla_put_failure; if (tcf_action_dump(skb, exts->actions, 0, 0, true) < 0) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -1; #else return 0; #endif } EXPORT_SYMBOL(tcf_exts_terse_dump); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT struct tc_action *a = tcf_exts_first_act(exts); if (a != NULL && tcf_action_copy_stats(skb, a, 1) < 0) return -1; #endif return 0; } EXPORT_SYMBOL(tcf_exts_dump_stats); static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) { if (*flags & TCA_CLS_FLAGS_IN_HW) return; *flags |= TCA_CLS_FLAGS_IN_HW; if (tc_skip_sw(*flags)) atomic_inc(&block->skipswcnt); atomic_inc(&block->offloadcnt); } static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) { if (!(*flags & TCA_CLS_FLAGS_IN_HW)) return; *flags &= ~TCA_CLS_FLAGS_IN_HW; if (tc_skip_sw(*flags)) atomic_dec(&block->skipswcnt); atomic_dec(&block->offloadcnt); } static void tc_cls_offload_cnt_update(struct tcf_block *block, struct tcf_proto *tp, u32 *cnt, u32 *flags, u32 diff, bool add) { lockdep_assert_held(&block->cb_lock); spin_lock(&tp->lock); if (add) { if (!*cnt) tcf_block_offload_inc(block, flags); *cnt += diff; } else { *cnt -= diff; if (!*cnt) tcf_block_offload_dec(block, flags); } spin_unlock(&tp->lock); } static void tc_cls_offload_cnt_reset(struct tcf_block *block, struct tcf_proto *tp, u32 *cnt, u32 *flags) { lockdep_assert_held(&block->cb_lock); spin_lock(&tp->lock); tcf_block_offload_dec(block, flags); *cnt = 0; spin_unlock(&tp->lock); } static int __tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop) { struct flow_block_cb *block_cb; int ok_count = 0; int err; list_for_each_entry(block_cb, &block->flow_block.cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err) { if (err_stop) return err; } else { ok_count++; } } return ok_count; } int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop, bool rtnl_held) { bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held; int ok_count; retry: if (take_rtnl) rtnl_lock(); down_read(&block->cb_lock); /* Need to obtain rtnl lock if block is bound to devs that require it. * In block bind code cb_lock is obtained while holding rtnl, so we must * obtain the locks in same order here. */ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) { up_read(&block->cb_lock); take_rtnl = true; goto retry; } ok_count = __tc_setup_cb_call(block, type, type_data, err_stop); up_read(&block->cb_lock); if (take_rtnl) rtnl_unlock(); return ok_count; } EXPORT_SYMBOL(tc_setup_cb_call); /* Non-destructive filter add. If filter that wasn't already in hardware is * successfully offloaded, increment block offloads counter. On failure, * previously offloaded filter is considered to be intact and offloads counter * is not decremented. */ int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held) { bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held; int ok_count; retry: if (take_rtnl) rtnl_lock(); down_read(&block->cb_lock); /* Need to obtain rtnl lock if block is bound to devs that require it. * In block bind code cb_lock is obtained while holding rtnl, so we must * obtain the locks in same order here. */ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) { up_read(&block->cb_lock); take_rtnl = true; goto retry; } /* Make sure all netdevs sharing this block are offload-capable. */ if (block->nooffloaddevcnt && err_stop) { ok_count = -EOPNOTSUPP; goto err_unlock; } ok_count = __tc_setup_cb_call(block, type, type_data, err_stop); if (ok_count < 0) goto err_unlock; if (tp->ops->hw_add) tp->ops->hw_add(tp, type_data); if (ok_count > 0) tc_cls_offload_cnt_update(block, tp, in_hw_count, flags, ok_count, true); err_unlock: up_read(&block->cb_lock); if (take_rtnl) rtnl_unlock(); return min(ok_count, 0); } EXPORT_SYMBOL(tc_setup_cb_add); /* Destructive filter replace. If filter that wasn't already in hardware is * successfully offloaded, increment block offload counter. On failure, * previously offloaded filter is considered to be destroyed and offload counter * is decremented. */ int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *old_flags, unsigned int *old_in_hw_count, u32 *new_flags, unsigned int *new_in_hw_count, bool rtnl_held) { bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held; int ok_count; retry: if (take_rtnl) rtnl_lock(); down_read(&block->cb_lock); /* Need to obtain rtnl lock if block is bound to devs that require it. * In block bind code cb_lock is obtained while holding rtnl, so we must * obtain the locks in same order here. */ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) { up_read(&block->cb_lock); take_rtnl = true; goto retry; } /* Make sure all netdevs sharing this block are offload-capable. */ if (block->nooffloaddevcnt && err_stop) { ok_count = -EOPNOTSUPP; goto err_unlock; } tc_cls_offload_cnt_reset(block, tp, old_in_hw_count, old_flags); if (tp->ops->hw_del) tp->ops->hw_del(tp, type_data); ok_count = __tc_setup_cb_call(block, type, type_data, err_stop); if (ok_count < 0) goto err_unlock; if (tp->ops->hw_add) tp->ops->hw_add(tp, type_data); if (ok_count > 0) tc_cls_offload_cnt_update(block, tp, new_in_hw_count, new_flags, ok_count, true); err_unlock: up_read(&block->cb_lock); if (take_rtnl) rtnl_unlock(); return min(ok_count, 0); } EXPORT_SYMBOL(tc_setup_cb_replace); /* Destroy filter and decrement block offload counter, if filter was previously * offloaded. */ int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held) { bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held; int ok_count; retry: if (take_rtnl) rtnl_lock(); down_read(&block->cb_lock); /* Need to obtain rtnl lock if block is bound to devs that require it. * In block bind code cb_lock is obtained while holding rtnl, so we must * obtain the locks in same order here. */ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) { up_read(&block->cb_lock); take_rtnl = true; goto retry; } ok_count = __tc_setup_cb_call(block, type, type_data, err_stop); tc_cls_offload_cnt_reset(block, tp, in_hw_count, flags); if (tp->ops->hw_del) tp->ops->hw_del(tp, type_data); up_read(&block->cb_lock); if (take_rtnl) rtnl_unlock(); return min(ok_count, 0); } EXPORT_SYMBOL(tc_setup_cb_destroy); int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, enum tc_setup_type type, void *type_data, void *cb_priv, u32 *flags, unsigned int *in_hw_count) { int err = cb(type, type_data, cb_priv); if (err) { if (add && tc_skip_sw(*flags)) return err; } else { tc_cls_offload_cnt_update(block, tp, in_hw_count, flags, 1, add); } return 0; } EXPORT_SYMBOL(tc_setup_cb_reoffload); static int tcf_act_get_user_cookie(struct flow_action_entry *entry, const struct tc_action *act) { struct tc_cookie *user_cookie; int err = 0; rcu_read_lock(); user_cookie = rcu_dereference(act->user_cookie); if (user_cookie) { entry->user_cookie = flow_action_cookie_create(user_cookie->data, user_cookie->len, GFP_ATOMIC); if (!entry->user_cookie) err = -ENOMEM; } rcu_read_unlock(); return err; } static void tcf_act_put_user_cookie(struct flow_action_entry *entry) { flow_action_cookie_destroy(entry->user_cookie); } void tc_cleanup_offload_action(struct flow_action *flow_action) { struct flow_action_entry *entry; int i; flow_action_for_each(i, entry, flow_action) { tcf_act_put_user_cookie(entry); if (entry->destructor) entry->destructor(entry->destructor_priv); } } EXPORT_SYMBOL(tc_cleanup_offload_action); static int tc_setup_offload_act(struct tc_action *act, struct flow_action_entry *entry, u32 *index_inc, struct netlink_ext_ack *extack) { #ifdef CONFIG_NET_CLS_ACT if (act->ops->offload_act_setup) { return act->ops->offload_act_setup(act, entry, index_inc, true, extack); } else { NL_SET_ERR_MSG(extack, "Action does not support offload"); return -EOPNOTSUPP; } #else return 0; #endif } int tc_setup_action(struct flow_action *flow_action, struct tc_action *actions[], u32 miss_cookie_base, struct netlink_ext_ack *extack) { int i, j, k, index, err = 0; struct tc_action *act; BUILD_BUG_ON(TCA_ACT_HW_STATS_ANY != FLOW_ACTION_HW_STATS_ANY); BUILD_BUG_ON(TCA_ACT_HW_STATS_IMMEDIATE != FLOW_ACTION_HW_STATS_IMMEDIATE); BUILD_BUG_ON(TCA_ACT_HW_STATS_DELAYED != FLOW_ACTION_HW_STATS_DELAYED); if (!actions) return 0; j = 0; tcf_act_for_each_action(i, act, actions) { struct flow_action_entry *entry; entry = &flow_action->entries[j]; spin_lock_bh(&act->tcfa_lock); err = tcf_act_get_user_cookie(entry, act); if (err) goto err_out_locked; index = 0; err = tc_setup_offload_act(act, entry, &index, extack); if (err) goto err_out_locked; for (k = 0; k < index ; k++) { entry[k].hw_stats = tc_act_hw_stats(act->hw_stats); entry[k].hw_index = act->tcfa_index; entry[k].cookie = (unsigned long)act; entry[k].miss_cookie = tcf_exts_miss_cookie_get(miss_cookie_base, i); } j += index; spin_unlock_bh(&act->tcfa_lock); } err_out: if (err) tc_cleanup_offload_action(flow_action); return err; err_out_locked: spin_unlock_bh(&act->tcfa_lock); goto err_out; } int tc_setup_offload_action(struct flow_action *flow_action, const struct tcf_exts *exts, struct netlink_ext_ack *extack) { #ifdef CONFIG_NET_CLS_ACT u32 miss_cookie_base; if (!exts) return 0; miss_cookie_base = exts->miss_cookie_node ? exts->miss_cookie_node->miss_cookie_base : 0; return tc_setup_action(flow_action, exts->actions, miss_cookie_base, extack); #else return 0; #endif } EXPORT_SYMBOL(tc_setup_offload_action); unsigned int tcf_exts_num_actions(struct tcf_exts *exts) { unsigned int num_acts = 0; struct tc_action *act; int i; tcf_exts_for_each_action(i, act, exts) { if (is_tcf_pedit(act)) num_acts += tcf_pedit_nkeys(act); else num_acts++; } return num_acts; } EXPORT_SYMBOL(tcf_exts_num_actions); #ifdef CONFIG_NET_CLS_ACT static int tcf_qevent_parse_block_index(struct nlattr *block_index_attr, u32 *p_block_index, struct netlink_ext_ack *extack) { *p_block_index = nla_get_u32(block_index_attr); if (!*p_block_index) { NL_SET_ERR_MSG(extack, "Block number may not be zero"); return -EINVAL; } return 0; } int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { u32 block_index; int err; if (!block_index_attr) return 0; err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack); if (err) return err; qe->info.binder_type = binder_type; qe->info.chain_head_change = tcf_chain_head_change_dflt; qe->info.chain_head_change_priv = &qe->filter_chain; qe->info.block_index = block_index; return tcf_block_get_ext(&qe->block, sch, &qe->info, extack); } EXPORT_SYMBOL(tcf_qevent_init); void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch) { if (qe->info.block_index) tcf_block_put_ext(qe->block, sch, &qe->info); } EXPORT_SYMBOL(tcf_qevent_destroy); int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { u32 block_index; int err; if (!block_index_attr) return 0; err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack); if (err) return err; /* Bounce newly-configured block or change in block. */ if (block_index != qe->info.block_index) { NL_SET_ERR_MSG(extack, "Change of blocks is not supported"); return -EINVAL; } return 0; } EXPORT_SYMBOL(tcf_qevent_validate_change); struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret) { struct tcf_result cl_res; struct tcf_proto *fl; if (!qe->info.block_index) return skb; fl = rcu_dereference_bh(qe->filter_chain); switch (tcf_classify(skb, NULL, fl, &cl_res, false)) { case TC_ACT_SHOT: qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); *ret = __NET_XMIT_BYPASS; return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: __qdisc_drop(skb, to_free); *ret = __NET_XMIT_STOLEN; return NULL; case TC_ACT_REDIRECT: skb_do_redirect(skb); *ret = __NET_XMIT_STOLEN; return NULL; } return skb; } EXPORT_SYMBOL(tcf_qevent_handle); int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe) { if (!qe->info.block_index) return 0; return nla_put_u32(skb, attr_name, qe->info.block_index); } EXPORT_SYMBOL(tcf_qevent_dump); #endif static __net_init int tcf_net_init(struct net *net) { struct tcf_net *tn = net_generic(net, tcf_net_id); spin_lock_init(&tn->idr_lock); idr_init(&tn->idr); return 0; } static void __net_exit tcf_net_exit(struct net *net) { struct tcf_net *tn = net_generic(net, tcf_net_id); idr_destroy(&tn->idr); } static struct pernet_operations tcf_net_ops = { .init = tcf_net_init, .exit = tcf_net_exit, .id = &tcf_net_id, .size = sizeof(struct tcf_net), }; static const struct rtnl_msg_handler tc_filter_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWTFILTER, .doit = tc_new_tfilter, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.msgtype = RTM_DELTFILTER, .doit = tc_del_tfilter, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.msgtype = RTM_GETTFILTER, .doit = tc_get_tfilter, .dumpit = tc_dump_tfilter, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.msgtype = RTM_NEWCHAIN, .doit = tc_ctl_chain}, {.msgtype = RTM_DELCHAIN, .doit = tc_ctl_chain}, {.msgtype = RTM_GETCHAIN, .doit = tc_ctl_chain, .dumpit = tc_dump_chain}, }; static int __init tc_filter_init(void) { int err; tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0); if (!tc_filter_wq) return -ENOMEM; err = register_pernet_subsys(&tcf_net_ops); if (err) goto err_register_pernet_subsys; xa_init_flags(&tcf_exts_miss_cookies_xa, XA_FLAGS_ALLOC1); rtnl_register_many(tc_filter_rtnl_msg_handlers); return 0; err_register_pernet_subsys: destroy_workqueue(tc_filter_wq); return err; } subsys_initcall(tc_filter_init);
1633 1630 64 65 65 55 15 65 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/kthread.h> #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/cgroup.h> #include <linux/syscalls.h> #include <linux/signal.h> #include <linux/posix-timers.h> #include <linux/cn_proc.h> #include <linux/mutex.h> #include <linux/futex.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> #include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/task_work.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> #include <linux/hw_breakpoint.h> #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> #include <linux/kmsan.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> #include <linux/rethook.h> #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/uaccess.h> #include <uapi/linux/wait.h> #include <asm/unistd.h> #include <asm/mmu_context.h> #include "exit.h" /* * The default value should be high enough to not crash a system that randomly * crashes its kernel from time to time, but low enough to at least not permit * overflowing 32-bit refcounts or the ldsem writer count. */ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL static struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, .maxlen = sizeof(oops_limit), .mode = 0644, .proc_handler = proc_douintvec, }, }; static __init int kernel_exit_sysctls_init(void) { register_sysctl_init("kernel", kern_exit_table); return 0; } late_initcall(kernel_exit_sysctls_init); #endif static atomic_t oops_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); } static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); static __init int kernel_exit_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); return 0; } late_initcall(kernel_exit_sysfs_init); #endif static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; detach_pid(p, PIDTYPE_PID); if (group_dead) { detach_pid(p, PIDTYPE_TGID); detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_node); } /* * This function expects the tasklist_lock write-locked. */ static void __exit_signal(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) posix_cpu_timers_exit_group(tsk); #endif if (group_dead) { tty = sig->tty; sig->tty = NULL; } else { /* * If there is any task waiting for the group exit * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); } add_device_randomness((const void*) &tsk->se.sum_exec_runtime, sizeof(unsigned long long)); /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, * but we want to avoid the race with thread_group_cputime() which can * see the empty ->thread_head list. */ task_cputime(tsk, &utime, &stime); write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. */ flush_sigqueue(&tsk->pending); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); } } static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); kprobe_flush_task(tsk); rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } void put_task_struct_rcu_user(struct task_struct *task) { if (refcount_dec_and_test(&task->rcu_users)) call_rcu(&task->rcu, delayed_put_task_struct); } void __weak release_thread(struct task_struct *dead_task) { } void release_task(struct task_struct *p) { struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); cgroup_release(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); thread_pid = get_pid(p->thread_pid); __exit_signal(p); /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the * group leader's parent process. (if it wants notification.) */ zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. */ zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) goto repeat; } int rcuwait_wake_up(struct rcuwait *w) { int ret = 0; struct task_struct *task; rcu_read_lock(); /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE * [S] tsk = current [S] cond = true * MB (A) MB (B) * [L] cond [L] tsk */ smp_mb(); /* (B) */ task = rcu_dereference(w->task); if (task) ret = wake_up_process(task); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are * to receive a SIGHUP and a SIGCONT. * * "I ask you, have you ever known what it is to be an orphan?" */ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if ((p == ignored_task) || (p->exit_state && thread_group_empty(p)) || is_global_init(p->real_parent)) continue; if (task_pgrp(p->real_parent) != pgrp && task_session(p->real_parent) == task_session(p)) return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return 1; } int is_current_pgrp_orphaned(void) { int retval; read_lock(&tasklist_lock); retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); read_unlock(&tasklist_lock); return retval; } static bool has_stopped_jobs(struct pid *pgrp) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p->signal->flags & SIGNAL_STOP_STOPPED) return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return false; } /* * Check to see if any process groups have become orphaned as * a result of our exiting, and if they have any stopped jobs, * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) { struct pid *pgrp = task_pgrp(tsk); struct task_struct *ignored_task = tsk; if (!parent) /* exit: our father is in a different pgrp than * we are and we were the only connection outside. */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than * we are, and it was the only connection outside. */ ignored_task = NULL; if (task_pgrp(parent) != pgrp && task_session(parent) == task_session(tsk) && will_become_orphaned_pgrp(pgrp, ignored_task) && has_stopped_jobs(pgrp)) { __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } static void coredump_task_exit(struct task_struct *tsk) { struct core_state *core_state; /* * Serialize with any possible pending coredump. * We must hold siglock around checking core_state * and setting PF_POSTCOREDUMP. The core-inducing thread * will increment ->nr_threads for each thread in the * group without PF_POSTCOREDUMP set. */ spin_lock_irq(&tsk->sighand->siglock); tsk->flags |= PF_POSTCOREDUMP; core_state = tsk->signal->core_state; spin_unlock_irq(&tsk->sighand->siglock); if (core_state) { struct core_thread self; self.task = current; if (self.task->flags & PF_SIGNALED) self.next = xchg(&core_state->dumper.next, &self); else self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. */ if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); for (;;) { set_current_state(TASK_IDLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; schedule(); } __set_current_state(TASK_RUNNING); } } #ifdef CONFIG_MEMCG /* drops tasklist_lock if succeeds */ static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) { bool ret = false; task_lock(tsk); if (likely(tsk->mm == mm)) { /* tsk can't pass exit_mm/exec_mmap and exit */ read_unlock(&tasklist_lock); WRITE_ONCE(mm->owner, tsk); lru_gen_migrate_mm(mm); ret = true; } task_unlock(tsk); return ret; } static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) { struct task_struct *t; for_each_thread(g, t) { struct mm_struct *t_mm = READ_ONCE(t->mm); if (t_mm == mm) { if (__try_to_set_owner(t, mm)) return true; } else if (t_mm) break; } return false; } /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *g, *p = current; /* * If the exiting or execing task is not the owner, it's * someone else's problem. */ if (mm->owner != p) return; /* * The current owner is exiting/execing and there are no other * candidates. Do not leave the mm pointing to a possibly * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { WRITE_ONCE(mm->owner, NULL); return; } read_lock(&tasklist_lock); /* * Search in the children */ list_for_each_entry(g, &p->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search in the siblings */ list_for_each_entry(g, &p->real_parent->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search through everything else, we should not get here often. */ for_each_process(g) { if (atomic_read(&mm->mm_users) <= 1) break; if (g->flags & PF_KTHREAD) continue; if (try_to_set_owner(g, mm)) goto ret; } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ WRITE_ONCE(mm->owner, NULL); ret: return; } #endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we * aren't already.. */ static void exit_mm(void) { struct mm_struct *mm = current->mm; exit_mm_release(current, mm); if (!mm) return; mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); /* * When a thread stops operating on an address space, the loop * in membarrier_private_expedited() may not observe that * tsk->mm, and the loop in membarrier_global_expedited() may * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED * rq->membarrier_state, so those would not issue an IPI. * Membarrier requires a memory barrier after accessing * user-space memory, before clearing tsk->mm or the * rq->membarrier_state. */ smp_mb__after_spinlock(); local_irq_disable(); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); local_irq_enable(); task_unlock(current); mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) { struct task_struct *t; for_each_thread(p, t) { if (!(t->flags & PF_EXITING)) return t; } return NULL; } static struct task_struct *find_child_reaper(struct task_struct *father, struct list_head *dead) __releases(&tasklist_lock) __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *reaper = pid_ns->child_reaper; struct task_struct *p, *n; if (likely(reaper != father)) return reaper; reaper = find_alive_thread(father); if (reaper) { pid_ns->child_reaper = reaper; return reaper; } write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); return father; } /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists * 2. give it to the first ancestor process which prctl'd itself as a * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father, struct task_struct *child_reaper) { struct task_struct *thread, *reaper; thread = find_alive_thread(father); if (thread) return thread; if (father->signal->has_child_subreaper) { unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. * We can't check reaper != child_reaper to ensure we do not * cross the namespaces, the exiting parent could be injected * by setns() + fork(). * We check pid->level, this is slightly more efficient than * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ for (reaper = father->real_parent; task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; thread = find_alive_thread(reaper); if (thread) return thread; } } return child_reaper; } /* * Any that need to be release_task'd are put on the @dead list. */ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } /* * This does two things: * * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) { struct task_struct *p, *t, *reaper; if (unlikely(!list_empty(&father->ptraced))) exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { RCU_INIT_POINTER(t->real_parent, reaper); BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t, PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. */ if (!same_thread_group(reaper, father)) reparent_leader(father, p, dead); } list_splice_tail_init(&father->children, &reaper->children); } /* * Send signals to all our closest relatives so that they know * to properly mourn us.. */ static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; struct task_struct *p, *n; LIST_HEAD(dead); write_lock_irq(&tasklist_lock); forget_original_parent(tsk, &dead); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; /* * sub-thread or delay_group_leader(), wake up the * PIDFD_THREAD waiters. */ if (!thread_group_empty(tsk)) do_notify_pidfd(tsk); if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && !ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD; autoreap = do_notify_parent(tsk, sig); } else if (thread_group_leader(tsk)) { autoreap = thread_group_empty(tsk) && do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; } if (autoreap) { tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } } #ifdef CONFIG_DEBUG_STACK_USAGE unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ # ifdef CONFIG_STACK_GROWSUP n--; # else n++; # endif } while (!*n); # ifdef CONFIG_STACK_GROWSUP return (unsigned long)end_of_stack(p) - (unsigned long)n; # else return (unsigned long)n - (unsigned long)end_of_stack(p); # endif } /* Count the maximum pages reached in kernel stacks */ static inline void kstack_histogram(unsigned long used_stack) { #ifdef CONFIG_VM_EVENT_COUNTERS if (used_stack <= 1024) count_vm_event(KSTACK_1K); #if THREAD_SIZE > 1024 else if (used_stack <= 2048) count_vm_event(KSTACK_2K); #endif #if THREAD_SIZE > 2048 else if (used_stack <= 4096) count_vm_event(KSTACK_4K); #endif #if THREAD_SIZE > 4096 else if (used_stack <= 8192) count_vm_event(KSTACK_8K); #endif #if THREAD_SIZE > 8192 else if (used_stack <= 16384) count_vm_event(KSTACK_16K); #endif #if THREAD_SIZE > 16384 else if (used_stack <= 32768) count_vm_event(KSTACK_32K); #endif #if THREAD_SIZE > 32768 else if (used_stack <= 65536) count_vm_event(KSTACK_64K); #endif #if THREAD_SIZE > 65536 else count_vm_event(KSTACK_REST); #endif #endif /* CONFIG_VM_EVENT_COUNTERS */ } static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; free = stack_not_used(current); kstack_histogram(THREAD_SIZE - free); if (free >= lowest_to_date) return; spin_lock(&low_water_lock); if (free < lowest_to_date) { pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); } #else static inline void check_stack_usage(void) {} #endif static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; spin_lock_irq(&sighand->siglock); signal->quick_threads--; if ((signal->quick_threads == 0) && !(signal->flags & SIGNAL_GROUP_EXIT)) { signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = code; signal->group_stop_count = 0; } spin_unlock_irq(&sighand->siglock); } void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; WARN_ON(irqs_disabled()); synchronize_group_exit(tsk, code); WARN_ON(tsk->plug); kcov_task_exit(tsk); kmsan_task_exit(tsk); coredump_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ seccomp_filter_release(tsk); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { /* * If the last thread of global init has exited, panic * immediately to get a useable coredump. */ if (unlikely(is_global_init(tsk))) panic("Attempted to kill init! exitcode=0x%08x\n", tsk->signal->group_exit_code ?: (int)code); #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(); if (group_dead) acct_process(); trace_sched_process_exit(tsk); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); exit_fs(tsk); if (group_dead) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. * * because of cgroup mode, must be called before cgroup_exit() */ perf_event_exit_task(tsk); sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(); if (tsk->io_context) exit_io_context(tsk); if (tsk->splice_pipe) free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); exit_tasks_rcu_finish(); lockdep_free_task(tsk); do_task_dead(); } void __noreturn make_task_dead(int signr) { /* * Take the task off the cpu after something catastrophic has * happened. * * We can get here from a kernel oops, sometimes with preemption off. * Start by checking for critical errors. * Then fix up important state like USER_DS and preemption. * Then do everything else. */ struct task_struct *tsk = current; unsigned int limit; if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); if (unlikely(irqs_disabled())) { pr_info("note: %s[%d] exited with irqs disabled\n", current->comm, task_pid_nr(current)); local_irq_enable(); } if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); preempt_count_set(PREEMPT_ENABLED); } /* * Every time the system oopses, if the oops happens while a reference * to an object was held, the reference leaks. * If the oops doesn't also leak memory, repeated oopsing can cause * reference counters to wrap around (if they're not using refcount_t). * This means that repeated oopsing can make unexploitable-looking bugs * exploitable through repeated oopsing. * To make sure this can't happen, place an upper bound on how often the * kernel may oops without panic(). */ limit = READ_ONCE(oops_limit); if (atomic_inc_return(&oops_count) >= limit && limit) panic("Oopsed too often (kernel.oops_limit is %d)", limit); /* * We're taking recursive faults here in make_task_dead. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); futex_exit_recursive(tsk); tsk->exit_state = EXIT_DEAD; refcount_inc(&tsk->rcu_users); do_task_dead(); } do_exit(signr); } SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); } do_exit(exit_code); /* NOTREACHED */ } /* * this kills every thread in the thread group. Note that any externally * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ return 0; } static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || task_pid_type(p, wo->wo_type) == wo->wo_pid; } static int eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; /* * Wait for all children (clone and not) if __WALL is set or * if it is traced by us. */ if (ptrace || (wo->wo_flags & __WALL)) return 1; /* * Otherwise, wait for clone children *only* if __WCLONE is set; * otherwise, wait for non-clone children *only*. * * Note: a "clone" child here is one that reports to its parent * using a signal other than SIGCHLD, or a non-leader thread which * we can only see if it is traced by us. */ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; } /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); goto out_info; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ state = (ptrace_reparented(p) && thread_group_leader(p)) ? EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* * We own this thread, nobody else can reap it. */ read_unlock(&tasklist_lock); sched_annotate_sleep(); /* * Check thread_group_leader() to exclude the traced sub-threads. */ if (state == EXIT_DEAD && thread_group_leader(p)) { struct signal_struct *sig = p->signal; struct signal_struct *psig = current->signal; unsigned long maxrss; u64 tgutime, tgstime; /* * The resource counters for the group leader are in its * own task_struct. Those for dead threads in the group * are in its signal_struct, as are those for the child * processes it has previously reaped. All these * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these * p->signal fields because the whole thread group is dead * and nobody can change them. * * psig->stats_lock also protects us from our sub-threads * which can reap other children at the same time. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += p->maj_flt + sig->maj_flt + sig->cmaj_flt; psig->cnvcsw += p->nvcsw + sig->nvcsw + sig->cnvcsw; psig->cnivcsw += p->nivcsw + sig->nivcsw + sig->cnivcsw; psig->cinblock += task_io_get_inblock(p) + sig->inblock + sig->cinblock; psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; maxrss = max(sig->maxrss, sig->cmaxrss); if (psig->cmaxrss < maxrss) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); write_sequnlock_irq(&psig->stats_lock); } if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* If parent wants a zombie, don't release it now */ state = EXIT_ZOMBIE; if (do_notify_parent(p, p->exit_signal)) state = EXIT_DEAD; p->exit_state = state; write_unlock_irq(&tasklist_lock); } if (state == EXIT_DEAD) release_task(p); out_info: infop = wo->wo_info; if (infop) { if ((status & 0x7f) == 0) { infop->cause = CLD_EXITED; infop->status = status >> 8; } else { infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; infop->status = status & 0x7f; } infop->pid = pid; infop->uid = uid; } return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) return &p->signal->group_exit_code; } return NULL; } /** * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED * @wo: wait options * @ptrace: is the wait for ptrace * @p: task to wait for * * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. * * CONTEXT: * read_lock(&tasklist_lock), which is released if return value is * non-zero. Also, grabs and releases @p->sighand->siglock. * * RETURNS: * 0 if wait condition didn't exist and search for other wait conditions * should continue. Non-zero return, -errno on failure and @p's pid on * success, implies that tasklist_lock is released and wait condition * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { struct waitid_info *infop; int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; /* * Traditionally we see ptrace'd stopped tasks regardless of options. */ if (!ptrace && !(wo->wo_flags & WUNTRACED)) return 0; if (!task_stopped_code(p, ptrace)) return 0; exit_code = 0; spin_lock_irq(&p->sighand->siglock); p_code = task_stopped_code(p, ptrace); if (unlikely(!p_code)) goto unlock_sig; exit_code = *p_code; if (!exit_code) goto unlock_sig; if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; uid = from_kuid_munged(current_user_ns(), task_uid(p)); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) return 0; /* * Now we are pretty sure this task is interesting. * Make sure it doesn't get reaped out from under us while we * give up the lock and then examine it below. We don't want to * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ get_task_struct(p); pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); if (likely(!(wo->wo_flags & WNOWAIT))) wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { infop->cause = why; infop->status = exit_code; infop->pid = pid; infop->uid = uid; } return pid; } /* * Handle do_wait work for one task in a live, non-stopped state. * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { struct waitid_info *infop; pid_t pid; uid_t uid; if (!unlikely(wo->wo_flags & WCONTINUED)) return 0; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; spin_lock_irq(&p->sighand->siglock); /* Re-check with the lock held. */ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { spin_unlock_irq(&p->sighand->siglock); return 0; } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; if (!infop) { wo->wo_stat = 0xffff; } else { infop->cause = CLD_CONTINUED; infop->pid = pid; infop->uid = uid; infop->status = SIGCONT; } return pid; } /* * Consider @p for a wait by @parent. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { /* * We can race with wait_task_zombie() from another thread. * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, ptrace, p); if (!ret) return ret; if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. */ if (likely(!ptrace)) wo->notask_error = 0; return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { /* * If it is traced by its real parent's group, just pretend * the caller is ptrace_do_wait() and reap this child if it * is zombie. * * This also hides group stop state from real parent; otherwise * a single stop can be reported twice as group and ptrace stop. * If a ptracer wants to distinguish these two events for its * own children it should create a separate process which takes * the role of real parent. */ if (!ptrace_reparented(p)) ptrace = 1; } /* slay zombie? */ if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* * A zombie ptracee is only visible to its ptracer. * Notification and reaping will be cascaded to the * real parent when the ptracer detaches. */ if (unlikely(ptrace) || likely(!p->ptrace)) return wait_task_zombie(wo, p); } /* * Allow access to stopped/continued state via zombie by * falling through. Clearing of notask_error is complex. * * When !@ptrace: * * If WEXITED is set, notask_error should naturally be * cleared. If not, subset of WSTOPPED|WCONTINUED is set, * so, if there are live subthreads, there are events to * wait for. If all subthreads are dead, it's still safe * to clear - this function will be called again in finite * amount time once all the subthreads are released and * will then return without clearing. * * When @ptrace: * * Stopped state is per-task and thus can't change once the * target task dies. Only continued and exited can happen. * Clear notask_error if WCONTINUED | WEXITED. */ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ wo->notask_error = 0; } /* * Wait for stopped. Depending on @ptrace, different stopped state * is used and the two don't interact with each other. */ ret = wait_task_stopped(wo, ptrace, p); if (ret) return ret; /* * Wait for continued. There's only one continued state and the * ptracer can consume it which can confuse the real parent. Don't * use WCONTINUED from ptracer. You don't need or want it. */ return wait_task_continued(wo, p); } /* * Do the work of do_wait() for one thread in the group, @tsk. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } return 0; } static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } return 0; } bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) { if (!eligible_pid(wo, p)) return false; if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) return false; return true; } static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct task_struct *p = key; if (pid_child_should_wake(wo, p)) return default_wake_function(wait, mode, sync, key); return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, TASK_INTERRUPTIBLE, p); } static bool is_effectively_child(struct wait_opts *wo, bool ptrace, struct task_struct *target) { struct task_struct *parent = !ptrace ? target->real_parent : target->parent; return current == parent || (!(wo->wo_flags & __WNOTHREAD) && same_thread_group(current, parent)); } /* * Optimization for waiting on PIDTYPE_PID. No need to iterate through child * and tracee lists to find the target task. */ static int do_wait_pid(struct wait_opts *wo) { bool ptrace; struct task_struct *target; int retval; ptrace = false; target = pid_task(wo->wo_pid, PIDTYPE_TGID); if (target && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } ptrace = true; target = pid_task(wo->wo_pid, PIDTYPE_PID); if (target && target->ptrace && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } return 0; } long __do_wait(struct wait_opts *wo) { long retval; /* * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that * might later match our criteria, even if we are not able to reap * it yet. */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; read_lock(&tasklist_lock); if (wo->wo_type == PIDTYPE_PID) { retval = do_wait_pid(wo); if (retval) return retval; } else { struct task_struct *tsk = current; do { retval = do_wait_thread(wo, tsk); if (retval) return retval; retval = ptrace_do_wait(wo, tsk); if (retval) return retval; if (wo->wo_flags & __WNOTHREAD) break; } while_each_thread(current, tsk); } read_unlock(&tasklist_lock); notask: retval = wo->notask_error; if (!retval && !(wo->wo_flags & WNOHANG)) return -ERESTARTSYS; return retval; } static long do_wait(struct wait_opts *wo) { int retval; trace_sched_process_wait(wo->wo_pid); init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); wo->child_wait.private = current; add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); do { set_current_state(TASK_INTERRUPTIBLE); retval = __do_wait(wo); if (retval != -ERESTARTSYS) break; if (signal_pending(current)) break; schedule(); } while (1); __set_current_state(TASK_RUNNING); remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); return retval; } int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { unsigned int f_flags = 0; struct pid *pid = NULL; enum pid_type type; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; switch (which) { case P_ALL: type = PIDTYPE_MAX; break; case P_PID: type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; pid = find_get_pid(upid); break; case P_PGID: type = PIDTYPE_PGID; if (upid < 0) return -EINVAL; if (upid) pid = find_get_pid(upid); else pid = get_task_pid(current, PIDTYPE_PGID); break; case P_PIDFD: type = PIDTYPE_PID; if (upid < 0) return -EINVAL; pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); break; default: return -EINVAL; } wo->wo_type = type; wo->wo_pid = pid; wo->wo_flags = options; wo->wo_info = infop; wo->wo_rusage = ru; if (f_flags & O_NONBLOCK) wo->wo_flags |= WNOHANG; return 0; } static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { struct wait_opts wo; long ret; ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru); if (ret) return ret; ret = do_wait(&wo); if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG)) ret = -EAGAIN; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, infop, int, options, struct rusage __user *, ru) { struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } long kernel_wait4(pid_t upid, int __user *stat_addr, int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; enum pid_type type; long ret; if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; /* -INT_MIN is not defined */ if (upid == INT_MIN) return -ESRCH; if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { type = PIDTYPE_PGID; pid = find_get_pid(-upid); } else if (upid == 0) { type = PIDTYPE_PGID; pid = get_task_pid(current, PIDTYPE_PGID); } else /* upid > 0 */ { type = PIDTYPE_PID; pid = find_get_pid(upid); } wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) ret = -EFAULT; return ret; } int kernel_wait(pid_t pid, int *stat) { struct wait_opts wo = { .wo_type = PIDTYPE_PID, .wo_pid = find_get_pid(pid), .wo_flags = WEXITED, }; int ret; ret = do_wait(&wo); if (ret > 0 && wo.wo_stat) *stat = wo.wo_stat; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { struct rusage r; long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } return err; } #ifdef __ARCH_WANT_SYS_WAITPID /* * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return kernel_wait4(pid, stat_addr, options, NULL); } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(wait4, compat_pid_t, pid, compat_uint_t __user *, stat_addr, int, options, struct compat_rusage __user *, ru) { struct rusage r; long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && put_compat_rusage(&r, ru)) return -EFAULT; } return err; } COMPAT_SYSCALL_DEFINE5(waitid, int, which, compat_pid_t, pid, struct compat_siginfo __user *, infop, int, options, struct compat_rusage __user *, uru) { struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (uru) { /* kernel_waitid() overwrites everything in ru */ if (COMPAT_USE_64BIT_TIME) err = copy_to_user(uru, &ru, sizeof(ru)); else err = put_compat_rusage(&ru, uru); if (err) return -EFAULT; } } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } #endif /* * This needs to be __function_aligned as GCC implicitly makes any * implementation of abort() cold and drops alignment specified by * -falign-functions=N. * * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11 */ __weak __function_aligned void abort(void) { BUG(); /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } EXPORT_SYMBOL(abort);
6 7 7 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 // SPDX-License-Identifier: GPL-2.0-or-later /* * SNAP data link layer. Derived from 802.2 * * Alan Cox <alan@lxorguk.ukuu.org.uk>, * from the 802.2 layer by Greg Page. * Merged in additions from Greg Page's psnap.c. */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/datalink.h> #include <net/llc.h> #include <net/psnap.h> #include <linux/mm.h> #include <linux/in.h> #include <linux/init.h> #include <linux/rculist.h> static LIST_HEAD(snap_list); static DEFINE_SPINLOCK(snap_lock); static struct llc_sap *snap_sap; /* * Find a snap client by matching the 5 bytes. */ static struct datalink_proto *find_snap_client(const unsigned char *desc) { struct datalink_proto *proto = NULL, *p; list_for_each_entry_rcu(p, &snap_list, node, lockdep_is_held(&snap_lock)) { if (!memcmp(p->type, desc, 5)) { proto = p; break; } } return proto; } /* * A SNAP packet has arrived */ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { int rc = 1; struct datalink_proto *proto; static struct packet_type snap_packet_type = { .type = cpu_to_be16(ETH_P_SNAP), }; if (unlikely(!pskb_may_pull(skb, 5))) goto drop; rcu_read_lock(); proto = find_snap_client(skb_transport_header(skb)); if (proto) { /* Pass the frame on. */ skb->transport_header += 5; skb_pull_rcsum(skb, 5); rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); } rcu_read_unlock(); if (unlikely(!proto)) goto drop; out: return rc; drop: kfree_skb(skb); goto out; } /* * Put a SNAP header on a frame and pass to 802.2 */ static int snap_request(struct datalink_proto *dl, struct sk_buff *skb, const u8 *dest) { memcpy(skb_push(skb, 5), dl->type, 5); llc_build_and_send_ui_pkt(snap_sap, skb, dest, snap_sap->laddr.lsap); return 0; } /* * Set up the SNAP layer */ EXPORT_SYMBOL(register_snap_client); EXPORT_SYMBOL(unregister_snap_client); static const char snap_err_msg[] __initconst = KERN_CRIT "SNAP - unable to register with 802.2\n"; static int __init snap_init(void) { snap_sap = llc_sap_open(0xAA, snap_rcv); if (!snap_sap) { printk(snap_err_msg); return -EBUSY; } return 0; } module_init(snap_init); static void __exit snap_exit(void) { llc_sap_put(snap_sap); } module_exit(snap_exit); /* * Register SNAP clients. We don't yet use this for IP. */ struct datalink_proto *register_snap_client(const unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *)) { struct datalink_proto *proto = NULL; spin_lock_bh(&snap_lock); if (find_snap_client(desc)) goto out; proto = kmalloc(sizeof(*proto), GFP_ATOMIC); if (proto) { memcpy(proto->type, desc, 5); proto->rcvfunc = rcvfunc; proto->header_length = 5 + 3; /* snap + 802.2 */ proto->request = snap_request; list_add_rcu(&proto->node, &snap_list); } out: spin_unlock_bh(&snap_lock); return proto; } /* * Unregister SNAP clients. Protocols no longer want to play with us ... */ void unregister_snap_client(struct datalink_proto *proto) { spin_lock_bh(&snap_lock); list_del_rcu(&proto->node); spin_unlock_bh(&snap_lock); synchronize_net(); kfree(proto); } MODULE_DESCRIPTION("SNAP data link layer. Derived from 802.2"); MODULE_LICENSE("GPL");
12 12 15 10 9 2 12 12 12 12 12 12 12 12 12 12 5 7 12 12 2 12 12 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "bat_iv_ogm.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/pkt_sched.h> #include <linux/printk.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/string_choices.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bitarray.h" #include "gateway_client.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "network-coding.h" #include "originator.h" #include "routing.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work); /** * enum batadv_dup_status - duplicate status */ enum batadv_dup_status { /** @BATADV_NO_DUP: the packet is no duplicate */ BATADV_NO_DUP = 0, /** * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for * the neighbor) */ BATADV_ORIG_DUP, /** @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor */ BATADV_NEIGH_DUP, /** * @BATADV_PROTECTED: originator is currently protected (after reboot) */ BATADV_PROTECTED, }; /** * batadv_ring_buffer_set() - update the ring buffer with the given value * @lq_recv: pointer to the ring buffer * @lq_index: index to store the value at * @value: value to store in the ring buffer */ static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value) { lq_recv[*lq_index] = value; *lq_index = (*lq_index + 1) % BATADV_TQ_GLOBAL_WINDOW_SIZE; } /** * batadv_ring_buffer_avg() - compute the average of all non-zero values stored * in the given ring buffer * @lq_recv: pointer to the ring buffer * * Return: computed average value. */ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) { const u8 *ptr; u16 count = 0; u16 i = 0; u16 sum = 0; ptr = lq_recv; while (i < BATADV_TQ_GLOBAL_WINDOW_SIZE) { if (*ptr != 0) { count++; sum += *ptr; } i++; ptr++; } if (count == 0) return 0; return (u8)(sum / count); } /** * batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an * originator * @bat_priv: the bat priv with all the soft interface information * @addr: mac address of the originator * * Return: the originator object corresponding to the passed mac address or NULL * on failure. * If the object does not exist, it is created and initialised. */ static struct batadv_orig_node * batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; int hash_added; orig_node = batadv_orig_hash_find(bat_priv, addr); if (orig_node) return orig_node; orig_node = batadv_orig_node_new(bat_priv, addr); if (!orig_node) return NULL; spin_lock_init(&orig_node->bat_iv.ogm_cnt_lock); kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) goto free_orig_node_hash; return orig_node; free_orig_node_hash: /* reference for batadv_hash_add */ batadv_orig_node_put(orig_node); /* reference from batadv_orig_node_new */ batadv_orig_node_put(orig_node); return NULL; } static struct batadv_neigh_node * batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, const u8 *neigh_addr, struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh) { struct batadv_neigh_node *neigh_node; neigh_node = batadv_neigh_node_get_or_create(orig_node, hard_iface, neigh_addr); if (!neigh_node) goto out; neigh_node->orig_node = orig_neigh; out: return neigh_node; } static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; unsigned char *ogm_buff; u32 random_seqno; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&hard_iface->bat_iv.ogm_seqno, random_seqno); hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN; ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC); if (!ogm_buff) { mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); return -ENOMEM; } hard_iface->bat_iv.ogm_buff = ogm_buff; batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; batadv_ogm_packet->packet_type = BATADV_IV_OGM; batadv_ogm_packet->version = BATADV_COMPAT_VERSION; batadv_ogm_packet->ttl = 2; batadv_ogm_packet->flags = BATADV_NO_FLAGS; batadv_ogm_packet->reserved = 0; batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE; mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); return 0; } static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); kfree(hard_iface->bat_iv.ogm_buff); hard_iface->bat_iv.ogm_buff = NULL; mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; void *ogm_buff; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); ogm_buff = hard_iface->bat_iv.ogm_buff; if (!ogm_buff) goto unlock; batadv_ogm_packet = ogm_buff; ether_addr_copy(batadv_ogm_packet->orig, hard_iface->net_dev->dev_addr); ether_addr_copy(batadv_ogm_packet->prev_sender, hard_iface->net_dev->dev_addr); unlock: mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; void *ogm_buff; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); ogm_buff = hard_iface->bat_iv.ogm_buff; if (!ogm_buff) goto unlock; batadv_ogm_packet = ogm_buff; batadv_ogm_packet->ttl = BATADV_TTL; unlock: mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } /* when do we schedule our own ogm to be sent */ static unsigned long batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) { unsigned int msecs; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; msecs += get_random_u32_below(2 * BATADV_JITTER); return jiffies + msecs_to_jiffies(msecs); } /* when do we schedule a ogm packet to be sent */ static unsigned long batadv_iv_ogm_fwd_send_time(void) { return jiffies + msecs_to_jiffies(get_random_u32_below(BATADV_JITTER / 2)); } /* apply hop penalty for a normal link */ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv) { int hop_penalty = atomic_read(&bat_priv->hop_penalty); int new_tq; new_tq = tq * (BATADV_TQ_MAX_VALUE - hop_penalty); new_tq /= BATADV_TQ_MAX_VALUE; return new_tq; } /** * batadv_iv_ogm_aggr_packet() - checks if there is another OGM attached * @buff_pos: current position in the skb * @packet_len: total length of the skb * @ogm_packet: potential OGM in buffer * * Return: true if there is enough space for another OGM, false otherwise. */ static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, const struct batadv_ogm_packet *ogm_packet) { int next_buff_pos = 0; /* check if there is enough space for the header */ next_buff_pos += buff_pos + sizeof(*ogm_packet); if (next_buff_pos > packet_len) return false; /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm_packet->tvlv_len); return (next_buff_pos <= packet_len) && (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); } /* send a batman ogm to a given interface */ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); const char *fwd_str; u8 packet_num; s16 buff_pos; struct batadv_ogm_packet *batadv_ogm_packet; struct sk_buff *skb; u8 *packet_pos; if (hard_iface->if_status != BATADV_IF_ACTIVE) return; packet_num = 0; buff_pos = 0; packet_pos = forw_packet->skb->data; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos; /* adjust all flags and log packets */ while (batadv_iv_ogm_aggr_packet(buff_pos, forw_packet->packet_len, batadv_ogm_packet)) { /* we might have aggregated direct link packets with an * ordinary base packet */ if (forw_packet->direct_link_flags & BIT(packet_num) && forw_packet->if_incoming == hard_iface) batadv_ogm_packet->flags |= BATADV_DIRECTLINK; else batadv_ogm_packet->flags &= ~BATADV_DIRECTLINK; if (packet_num > 0 || !forw_packet->own) fwd_str = "Forwarding"; else fwd_str = "Sending own"; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s %spacket (originator %pM, seqno %u, TQ %d, TTL %d, IDF %s) on interface %s [%pM]\n", fwd_str, (packet_num > 0 ? "aggregated " : ""), batadv_ogm_packet->orig, ntohl(batadv_ogm_packet->seqno), batadv_ogm_packet->tq, batadv_ogm_packet->ttl, str_on_off(batadv_ogm_packet->flags & BATADV_DIRECTLINK), hard_iface->net_dev->name, hard_iface->net_dev->dev_addr); buff_pos += BATADV_OGM_HLEN; buff_pos += ntohs(batadv_ogm_packet->tvlv_len); packet_num++; packet_pos = forw_packet->skb->data + buff_pos; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos; } /* create clone because function is called more than once */ skb = skb_clone(forw_packet->skb, GFP_ATOMIC); if (skb) { batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, skb->len + ETH_HLEN); batadv_send_broadcast_skb(skb, hard_iface); } } /* send a batman ogm packet */ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) { struct net_device *soft_iface; if (!forw_packet->if_incoming) { pr_err("Error - can't forward packet: incoming iface not specified\n"); return; } soft_iface = forw_packet->if_incoming->soft_iface; if (WARN_ON(!forw_packet->if_outgoing)) return; if (forw_packet->if_outgoing->soft_iface != soft_iface) { pr_warn("%s: soft interface switch for queued OGM\n", __func__); return; } if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE) return; /* only for one specific outgoing interface */ batadv_iv_ogm_send_to_if(forw_packet, forw_packet->if_outgoing); } /** * batadv_iv_ogm_can_aggregate() - find out if an OGM can be aggregated on an * existing forward packet * @new_bat_ogm_packet: OGM packet to be aggregated * @bat_priv: the bat priv with all the soft interface information * @packet_len: (total) length of the OGM * @send_time: timestamp (jiffies) when the packet is to be sent * @directlink: true if this is a direct link packet * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @forw_packet: the forwarded packet which should be checked * * Return: true if new_packet can be aggregated with forw_packet */ static bool batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, struct batadv_priv *bat_priv, int packet_len, unsigned long send_time, bool directlink, const struct batadv_hard_iface *if_incoming, const struct batadv_hard_iface *if_outgoing, const struct batadv_forw_packet *forw_packet) { struct batadv_ogm_packet *batadv_ogm_packet; int aggregated_bytes = forw_packet->packet_len + packet_len; struct batadv_hard_iface *primary_if = NULL; bool res = false; unsigned long aggregation_end_time; batadv_ogm_packet = (struct batadv_ogm_packet *)forw_packet->skb->data; aggregation_end_time = send_time; aggregation_end_time += msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS); /* we can aggregate the current packet to this aggregated packet * if: * * - the send time is within our MAX_AGGREGATION_MS time * - the resulting packet won't be bigger than * MAX_AGGREGATION_BYTES * otherwise aggregation is not possible */ if (!time_before(send_time, forw_packet->send_time) || !time_after_eq(aggregation_end_time, forw_packet->send_time)) return false; if (aggregated_bytes > BATADV_MAX_AGGREGATION_BYTES) return false; /* packet is not leaving on the same interface. */ if (forw_packet->if_outgoing != if_outgoing) return false; /* check aggregation compatibility * -> direct link packets are broadcasted on * their interface only * -> aggregate packet if the current packet is * a "global" packet as well as the base * packet */ primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return false; /* packets without direct link flag and high TTL * are flooded through the net */ if (!directlink && !(batadv_ogm_packet->flags & BATADV_DIRECTLINK) && batadv_ogm_packet->ttl != 1 && /* own packets originating non-primary * interfaces leave only that interface */ (!forw_packet->own || forw_packet->if_incoming == primary_if)) { res = true; goto out; } /* if the incoming packet is sent via this one * interface only - we still can aggregate */ if (directlink && new_bat_ogm_packet->ttl == 1 && forw_packet->if_incoming == if_incoming && /* packets from direct neighbors or * own secondary interface packets * (= secondary interface packets in general) */ (batadv_ogm_packet->flags & BATADV_DIRECTLINK || (forw_packet->own && forw_packet->if_incoming != primary_if))) { res = true; goto out; } out: batadv_hardif_put(primary_if); return res; } /** * batadv_iv_ogm_aggregate_new() - create a new aggregated packet and add this * packet to it. * @packet_buff: pointer to the OGM * @packet_len: (total) length of the OGM * @send_time: timestamp (jiffies) when the packet is to be sent * @direct_link: whether this OGM has direct link status * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @own_packet: true if it is a self-generated ogm */ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, int packet_len, unsigned long send_time, bool direct_link, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, int own_packet) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_forw_packet *forw_packet_aggr; struct sk_buff *skb; unsigned char *skb_buff; unsigned int skb_size; atomic_t *queue_left = own_packet ? NULL : &bat_priv->batman_queue_left; if (atomic_read(&bat_priv->aggregated_ogms) && packet_len < BATADV_MAX_AGGREGATION_BYTES) skb_size = BATADV_MAX_AGGREGATION_BYTES; else skb_size = packet_len; skb_size += ETH_HLEN; skb = netdev_alloc_skb_ip_align(NULL, skb_size); if (!skb) return; forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing, queue_left, bat_priv, skb); if (!forw_packet_aggr) { kfree_skb(skb); return; } forw_packet_aggr->skb->priority = TC_PRIO_CONTROL; skb_reserve(forw_packet_aggr->skb, ETH_HLEN); skb_buff = skb_put(forw_packet_aggr->skb, packet_len); forw_packet_aggr->packet_len = packet_len; memcpy(skb_buff, packet_buff, packet_len); forw_packet_aggr->own = own_packet; forw_packet_aggr->direct_link_flags = BATADV_NO_FLAGS; forw_packet_aggr->send_time = send_time; /* save packet direct link flag status */ if (direct_link) forw_packet_aggr->direct_link_flags |= 1; INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work, batadv_iv_send_outstanding_bat_ogm_packet); batadv_forw_packet_ogmv1_queue(bat_priv, forw_packet_aggr, send_time); } /* aggregate a new packet into the existing ogm packet */ static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr, const unsigned char *packet_buff, int packet_len, bool direct_link) { unsigned long new_direct_link_flag; skb_put_data(forw_packet_aggr->skb, packet_buff, packet_len); forw_packet_aggr->packet_len += packet_len; forw_packet_aggr->num_packets++; /* save packet direct link flag status */ if (direct_link) { new_direct_link_flag = BIT(forw_packet_aggr->num_packets); forw_packet_aggr->direct_link_flags |= new_direct_link_flag; } } /** * batadv_iv_ogm_queue_add() - queue up an OGM for transmission * @bat_priv: the bat priv with all the soft interface information * @packet_buff: pointer to the OGM * @packet_len: (total) length of the OGM * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @own_packet: true if it is a self-generated ogm * @send_time: timestamp (jiffies) when the packet is to be sent */ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, unsigned char *packet_buff, int packet_len, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, int own_packet, unsigned long send_time) { /* _aggr -> pointer to the packet we want to aggregate with * _pos -> pointer to the position in the queue */ struct batadv_forw_packet *forw_packet_aggr = NULL; struct batadv_forw_packet *forw_packet_pos = NULL; struct batadv_ogm_packet *batadv_ogm_packet; bool direct_link; unsigned long max_aggregation_jiffies; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff; direct_link = !!(batadv_ogm_packet->flags & BATADV_DIRECTLINK); max_aggregation_jiffies = msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS); /* find position for the packet in the forward queue */ spin_lock_bh(&bat_priv->forw_bat_list_lock); /* own packets are not to be aggregated */ if (atomic_read(&bat_priv->aggregated_ogms) && !own_packet) { hlist_for_each_entry(forw_packet_pos, &bat_priv->forw_bat_list, list) { if (batadv_iv_ogm_can_aggregate(batadv_ogm_packet, bat_priv, packet_len, send_time, direct_link, if_incoming, if_outgoing, forw_packet_pos)) { forw_packet_aggr = forw_packet_pos; break; } } } /* nothing to aggregate with - either aggregation disabled or no * suitable aggregation packet found */ if (!forw_packet_aggr) { /* the following section can run without the lock */ spin_unlock_bh(&bat_priv->forw_bat_list_lock); /* if we could not aggregate this packet with one of the others * we hold it back for a while, so that it might be aggregated * later on */ if (!own_packet && atomic_read(&bat_priv->aggregated_ogms)) send_time += max_aggregation_jiffies; batadv_iv_ogm_aggregate_new(packet_buff, packet_len, send_time, direct_link, if_incoming, if_outgoing, own_packet); } else { batadv_iv_ogm_aggregate(forw_packet_aggr, packet_buff, packet_len, direct_link); spin_unlock_bh(&bat_priv->forw_bat_list_lock); } } static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, const struct ethhdr *ethhdr, struct batadv_ogm_packet *batadv_ogm_packet, bool is_single_hop_neigh, bool is_from_best_next_hop, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); u16 tvlv_len; if (batadv_ogm_packet->ttl <= 1) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); return; } if (!is_from_best_next_hop) { /* Mark the forwarded packet when it is not coming from our * best next hop. We still need to forward the packet for our * neighbor link quality detection to work in case the packet * originated from a single hop neighbor. Otherwise we can * simply drop the ogm. */ if (is_single_hop_neigh) batadv_ogm_packet->flags |= BATADV_NOT_BEST_NEXT_HOP; else return; } tvlv_len = ntohs(batadv_ogm_packet->tvlv_len); batadv_ogm_packet->ttl--; ether_addr_copy(batadv_ogm_packet->prev_sender, ethhdr->h_source); /* apply hop penalty */ batadv_ogm_packet->tq = batadv_hop_penalty(batadv_ogm_packet->tq, bat_priv); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: tq: %i, ttl: %i\n", batadv_ogm_packet->tq, batadv_ogm_packet->ttl); if (is_single_hop_neigh) batadv_ogm_packet->flags |= BATADV_DIRECTLINK; else batadv_ogm_packet->flags &= ~BATADV_DIRECTLINK; batadv_iv_ogm_queue_add(bat_priv, (unsigned char *)batadv_ogm_packet, BATADV_OGM_HLEN + tvlv_len, if_incoming, if_outgoing, 0, batadv_iv_ogm_fwd_send_time()); } /** * batadv_iv_ogm_slide_own_bcast_window() - bitshift own OGM broadcast windows * for the given interface * @hard_iface: the interface for which the windows have to be shifted */ static void batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; unsigned long *word; u32 i; u8 *w; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { hlist_for_each_entry_rcu(orig_ifinfo, &orig_node->ifinfo_list, list) { if (orig_ifinfo->if_outgoing != hard_iface) continue; spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); word = orig_ifinfo->bat_iv.bcast_own; batadv_bit_get_packet(bat_priv, word, 1, 0); w = &orig_ifinfo->bat_iv.bcast_own_sum; *w = bitmap_weight(word, BATADV_TQ_LOCAL_WINDOW_SIZE); spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); } } rcu_read_unlock(); } } /** * batadv_iv_ogm_schedule_buff() - schedule submission of hardif ogm buffer * @hard_iface: interface whose ogm buffer should be transmitted */ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); unsigned char **ogm_buff = &hard_iface->bat_iv.ogm_buff; struct batadv_ogm_packet *batadv_ogm_packet; struct batadv_hard_iface *primary_if, *tmp_hard_iface; int *ogm_buff_len = &hard_iface->bat_iv.ogm_buff_len; u32 seqno; u16 tvlv_len = 0; unsigned long send_time; lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex); /* interface already disabled by batadv_iv_ogm_iface_disable */ if (!*ogm_buff) return; /* the interface gets activated here to avoid race conditions between * the moment of activating the interface in * hardif_activate_interface() where the originator mac is set and * outdated packets (especially uninitialized mac addresses) in the * packet queue */ if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) hard_iface->if_status = BATADV_IF_ACTIVE; primary_if = batadv_primary_if_get_selected(bat_priv); if (hard_iface == primary_if) { /* tt changes have to be committed before the tvlv data is * appended as it may alter the tt tvlv container */ batadv_tt_local_commit_changes(bat_priv); tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff, ogm_buff_len, BATADV_OGM_HLEN); } batadv_ogm_packet = (struct batadv_ogm_packet *)(*ogm_buff); batadv_ogm_packet->tvlv_len = htons(tvlv_len); /* change sequence number to network order */ seqno = (u32)atomic_read(&hard_iface->bat_iv.ogm_seqno); batadv_ogm_packet->seqno = htonl(seqno); atomic_inc(&hard_iface->bat_iv.ogm_seqno); batadv_iv_ogm_slide_own_bcast_window(hard_iface); send_time = batadv_iv_ogm_emit_send_time(bat_priv); if (hard_iface != primary_if) { /* OGMs from secondary interfaces are only scheduled on their * respective interfaces. */ batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, hard_iface, hard_iface, 1, send_time); goto out; } /* OGMs from primary interfaces are scheduled on all * interfaces. */ rcu_read_lock(); list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) { if (tmp_hard_iface->soft_iface != hard_iface->soft_iface) continue; if (!kref_get_unless_zero(&tmp_hard_iface->refcount)) continue; batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, hard_iface, tmp_hard_iface, 1, send_time); batadv_hardif_put(tmp_hard_iface); } rcu_read_unlock(); out: batadv_hardif_put(primary_if); } static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) { if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) return; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); batadv_iv_ogm_schedule_buff(hard_iface); mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } /** * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface * @orig_node: originator which reproadcasted the OGMs directly * @if_outgoing: interface which transmitted the original OGM and received the * direct rebroadcast * * Return: Number of replied (rebroadcasted) OGMs which were transmitted by * an originator and directly (without intermediate hop) received by a specific * interface */ static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *orig_ifinfo; u8 sum; orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing); if (!orig_ifinfo) return 0; spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); sum = orig_ifinfo->bat_iv.bcast_own_sum; spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_ifinfo_put(orig_ifinfo); return sum; } /** * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an * originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: the orig node who originally emitted the ogm packet * @orig_ifinfo: ifinfo for the outgoing interface of the orig_node * @ethhdr: Ethernet header of the OGM * @batadv_ogm_packet: the ogm packet * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @dup_status: the duplicate status of this ogm packet. */ static void batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_orig_ifinfo *orig_ifinfo, const struct ethhdr *ethhdr, const struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, enum batadv_dup_status dup_status) { struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *neigh_node = NULL; struct batadv_neigh_node *tmp_neigh_node = NULL; struct batadv_neigh_node *router = NULL; u8 sum_orig, sum_neigh; u8 *neigh_addr; u8 tq_avg; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s(): Searching and updating originator entry of received packet\n", __func__); rcu_read_lock(); hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { neigh_addr = tmp_neigh_node->addr; if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && tmp_neigh_node->if_incoming == if_incoming && kref_get_unless_zero(&tmp_neigh_node->refcount)) { if (WARN(neigh_node, "too many matching neigh_nodes")) batadv_neigh_node_put(neigh_node); neigh_node = tmp_neigh_node; continue; } if (dup_status != BATADV_NO_DUP) continue; /* only update the entry for this outgoing interface */ neigh_ifinfo = batadv_neigh_ifinfo_get(tmp_neigh_node, if_outgoing); if (!neigh_ifinfo) continue; spin_lock_bh(&tmp_neigh_node->ifinfo_lock); batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv, &neigh_ifinfo->bat_iv.tq_index, 0); tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv); neigh_ifinfo->bat_iv.tq_avg = tq_avg; spin_unlock_bh(&tmp_neigh_node->ifinfo_lock); batadv_neigh_ifinfo_put(neigh_ifinfo); neigh_ifinfo = NULL; } if (!neigh_node) { struct batadv_orig_node *orig_tmp; orig_tmp = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_tmp) goto unlock; neigh_node = batadv_iv_ogm_neigh_new(if_incoming, ethhdr->h_source, orig_node, orig_tmp); batadv_orig_node_put(orig_tmp); if (!neigh_node) goto unlock; } else { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Updating existing last-hop neighbor of originator\n"); } rcu_read_unlock(); neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (!neigh_ifinfo) goto out; neigh_node->last_seen = jiffies; spin_lock_bh(&neigh_node->ifinfo_lock); batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv, &neigh_ifinfo->bat_iv.tq_index, batadv_ogm_packet->tq); tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv); neigh_ifinfo->bat_iv.tq_avg = tq_avg; spin_unlock_bh(&neigh_node->ifinfo_lock); if (dup_status == BATADV_NO_DUP) { orig_ifinfo->last_ttl = batadv_ogm_packet->ttl; neigh_ifinfo->last_ttl = batadv_ogm_packet->ttl; } /* if this neighbor already is our next hop there is nothing * to change */ router = batadv_orig_router_get(orig_node, if_outgoing); if (router == neigh_node) goto out; if (router) { router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); if (!router_ifinfo) goto out; /* if this neighbor does not offer a better TQ we won't * consider it */ if (router_ifinfo->bat_iv.tq_avg > neigh_ifinfo->bat_iv.tq_avg) goto out; } /* if the TQ is the same and the link not more symmetric we * won't consider it either */ if (router_ifinfo && neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) { sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node, router->if_incoming); sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node, neigh_node->if_incoming); if (sum_orig >= sum_neigh) goto out; } batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); goto out; unlock: rcu_read_unlock(); out: batadv_neigh_node_put(neigh_node); batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(neigh_ifinfo); batadv_neigh_ifinfo_put(router_ifinfo); } /** * batadv_iv_ogm_calc_tq() - calculate tq for current received ogm packet * @orig_node: the orig node who originally emitted the ogm packet * @orig_neigh_node: the orig node struct of the neighbor who sent the packet * @batadv_ogm_packet: the ogm packet * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * * Return: true if the link can be considered bidirectional, false otherwise */ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int tq_iface_hop_penalty = BATADV_TQ_MAX_VALUE; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; bool ret = false; /* find corresponding one hop neighbor */ rcu_read_lock(); hlist_for_each_entry_rcu(tmp_neigh_node, &orig_neigh_node->neigh_list, list) { if (!batadv_compare_eth(tmp_neigh_node->addr, orig_neigh_node->orig)) continue; if (tmp_neigh_node->if_incoming != if_incoming) continue; if (!kref_get_unless_zero(&tmp_neigh_node->refcount)) continue; neigh_node = tmp_neigh_node; break; } rcu_read_unlock(); if (!neigh_node) neigh_node = batadv_iv_ogm_neigh_new(if_incoming, orig_neigh_node->orig, orig_neigh_node, orig_neigh_node); if (!neigh_node) goto out; /* if orig_node is direct neighbor update neigh_node last_seen */ if (orig_node == orig_neigh_node) neigh_node->last_seen = jiffies; orig_node->last_seen = jiffies; /* find packet count of corresponding one hop neighbor */ orig_eq_count = batadv_iv_orig_ifinfo_sum(orig_neigh_node, if_incoming); neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (neigh_ifinfo) { neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count; batadv_neigh_ifinfo_put(neigh_ifinfo); } else { neigh_rq_count = 0; } /* pay attention to not get a value bigger than 100 % */ if (orig_eq_count > neigh_rq_count) total_count = neigh_rq_count; else total_count = orig_eq_count; /* if we have too few packets (too less data) we set tq_own to zero * if we receive too few packets it is not considered bidirectional */ if (total_count < BATADV_TQ_LOCAL_BIDRECT_SEND_MINIMUM || neigh_rq_count < BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM) tq_own = 0; else /* neigh_node->real_packet_count is never zero as we * only purge old information when getting new * information */ tq_own = (BATADV_TQ_MAX_VALUE * total_count) / neigh_rq_count; /* 1 - ((1-x) ** 3), normalized to TQ_MAX_VALUE this does * affect the nearly-symmetric links only a little, but * punishes asymmetric links more. This will give a value * between 0 and TQ_MAX_VALUE */ neigh_rq_inv = BATADV_TQ_LOCAL_WINDOW_SIZE - neigh_rq_count; neigh_rq_inv_cube = neigh_rq_inv * neigh_rq_inv * neigh_rq_inv; neigh_rq_max_cube = BATADV_TQ_LOCAL_WINDOW_SIZE * BATADV_TQ_LOCAL_WINDOW_SIZE * BATADV_TQ_LOCAL_WINDOW_SIZE; inv_asym_penalty = BATADV_TQ_MAX_VALUE * neigh_rq_inv_cube; inv_asym_penalty /= neigh_rq_max_cube; tq_asym_penalty = BATADV_TQ_MAX_VALUE - inv_asym_penalty; tq_iface_hop_penalty -= atomic_read(&if_incoming->hop_penalty); /* penalize if the OGM is forwarded on the same interface. WiFi * interfaces and other half duplex devices suffer from throughput * drops as they can't send and receive at the same time. */ if (if_outgoing && if_incoming == if_outgoing && batadv_is_wifi_hardif(if_outgoing)) tq_iface_hop_penalty = batadv_hop_penalty(tq_iface_hop_penalty, bat_priv); combined_tq = batadv_ogm_packet->tq * tq_own * tq_asym_penalty * tq_iface_hop_penalty; combined_tq /= BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE; batadv_ogm_packet->tq = combined_tq; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_hop_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", orig_node->orig, orig_neigh_node->orig, total_count, neigh_rq_count, tq_own, tq_asym_penalty, tq_iface_hop_penalty, batadv_ogm_packet->tq, if_incoming->net_dev->name, if_outgoing ? if_outgoing->net_dev->name : "DEFAULT"); /* if link has the minimum required transmission quality * consider it bidirectional */ if (batadv_ogm_packet->tq >= BATADV_TQ_TOTAL_BIDRECT_LIMIT) ret = true; out: batadv_neigh_node_put(neigh_node); return ret; } /** * batadv_iv_ogm_update_seqnos() - process a batman packet for all interfaces, * adjust the sequence number and find out whether it is a duplicate * @ethhdr: ethernet header of the packet * @batadv_ogm_packet: OGM packet to be considered * @if_incoming: interface on which the OGM packet was received * @if_outgoing: interface for which the retransmission should be considered * * Return: duplicate status as enum batadv_dup_status */ static enum batadv_dup_status batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, const struct batadv_ogm_packet *batadv_ogm_packet, const struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo = NULL; struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; bool is_dup; s32 seq_diff; bool need_update = false; int set_mark; enum batadv_dup_status ret = BATADV_NO_DUP; u32 seqno = ntohl(batadv_ogm_packet->seqno); u8 *neigh_addr; u8 packet_count; unsigned long *bitmap; orig_node = batadv_iv_ogm_orig_get(bat_priv, batadv_ogm_packet->orig); if (!orig_node) return BATADV_NO_DUP; orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (WARN_ON(!orig_ifinfo)) { batadv_orig_node_put(orig_node); return 0; } spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); seq_diff = seqno - orig_ifinfo->last_real_seqno; /* signalize caller that the packet is to be dropped. */ if (!hlist_empty(&orig_node->neigh_list) && batadv_window_protected(bat_priv, seq_diff, BATADV_TQ_LOCAL_WINDOW_SIZE, &orig_ifinfo->batman_seqno_reset, NULL)) { ret = BATADV_PROTECTED; goto out; } rcu_read_lock(); hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (!neigh_ifinfo) continue; neigh_addr = neigh_node->addr; is_dup = batadv_test_bit(neigh_ifinfo->bat_iv.real_bits, orig_ifinfo->last_real_seqno, seqno); if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && neigh_node->if_incoming == if_incoming) { set_mark = 1; if (is_dup) ret = BATADV_NEIGH_DUP; } else { set_mark = 0; if (is_dup && ret != BATADV_NEIGH_DUP) ret = BATADV_ORIG_DUP; } /* if the window moved, set the update flag. */ bitmap = neigh_ifinfo->bat_iv.real_bits; need_update |= batadv_bit_get_packet(bat_priv, bitmap, seq_diff, set_mark); packet_count = bitmap_weight(bitmap, BATADV_TQ_LOCAL_WINDOW_SIZE); neigh_ifinfo->bat_iv.real_packet_count = packet_count; batadv_neigh_ifinfo_put(neigh_ifinfo); } rcu_read_unlock(); if (need_update) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s updating last_seqno: old %u, new %u\n", if_outgoing ? if_outgoing->net_dev->name : "DEFAULT", orig_ifinfo->last_real_seqno, seqno); orig_ifinfo->last_real_seqno = seqno; } out: spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_node_put(orig_node); batadv_orig_ifinfo_put(orig_ifinfo); return ret; } /** * batadv_iv_ogm_process_per_outif() - process a batman iv OGM for an outgoing * interface * @skb: the skb containing the OGM * @ogm_offset: offset from skb->data to start of ogm header * @orig_node: the (cached) orig node for the originator of this OGM * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered */ static void batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_hardif_neigh_node *hardif_neigh = NULL; struct batadv_neigh_node *router = NULL; struct batadv_neigh_node *router_router = NULL; struct batadv_orig_node *orig_neigh_node; struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *orig_neigh_router = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_ogm_packet *ogm_packet; enum batadv_dup_status dup_status; bool is_from_best_next_hop = false; bool is_single_hop_neigh = false; bool sameseq, similar_ttl; struct sk_buff *skb_priv; struct ethhdr *ethhdr; u8 *prev_sender; bool is_bidirect; /* create a private copy of the skb, as some functions change tq value * and/or flags. */ skb_priv = skb_copy(skb, GFP_ATOMIC); if (!skb_priv) return; ethhdr = eth_hdr(skb_priv); ogm_packet = (struct batadv_ogm_packet *)(skb_priv->data + ogm_offset); dup_status = batadv_iv_ogm_update_seqnos(ethhdr, ogm_packet, if_incoming, if_outgoing); if (batadv_compare_eth(ethhdr->h_source, ogm_packet->orig)) is_single_hop_neigh = true; if (dup_status == BATADV_PROTECTED) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: packet within seqno protection time (sender: %pM)\n", ethhdr->h_source); goto out; } if (ogm_packet->tq == 0) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet with tq equal 0\n"); goto out; } if (is_single_hop_neigh) { hardif_neigh = batadv_hardif_neigh_get(if_incoming, ethhdr->h_source); if (hardif_neigh) hardif_neigh->last_seen = jiffies; } router = batadv_orig_router_get(orig_node, if_outgoing); if (router) { router_router = batadv_orig_router_get(router->orig_node, if_outgoing); router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); } if ((router_ifinfo && router_ifinfo->bat_iv.tq_avg != 0) && (batadv_compare_eth(router->addr, ethhdr->h_source))) is_from_best_next_hop = true; prev_sender = ogm_packet->prev_sender; /* avoid temporary routing loops */ if (router && router_router && (batadv_compare_eth(router->addr, prev_sender)) && !(batadv_compare_eth(ogm_packet->orig, prev_sender)) && (batadv_compare_eth(router->addr, router_router->addr))) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: ignoring all rebroadcast packets that may make me loop (sender: %pM)\n", ethhdr->h_source); goto out; } if (if_outgoing == BATADV_IF_DEFAULT) batadv_tvlv_ogm_receive(bat_priv, ogm_packet, orig_node); /* if sender is a direct neighbor the sender mac equals * originator mac */ if (is_single_hop_neigh) orig_neigh_node = orig_node; else orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) goto out; /* Update nc_nodes of the originator */ batadv_nc_update_nc_node(bat_priv, orig_node, orig_neigh_node, ogm_packet, is_single_hop_neigh); orig_neigh_router = batadv_orig_router_get(orig_neigh_node, if_outgoing); /* drop packet if sender is not a direct neighbor and if we * don't route towards it */ if (!is_single_hop_neigh && !orig_neigh_router) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out_neigh; } is_bidirect = batadv_iv_ogm_calc_tq(orig_node, orig_neigh_node, ogm_packet, if_incoming, if_outgoing); /* update ranking if it is not a duplicate or has the same * seqno and similar ttl as the non-duplicate */ orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (!orig_ifinfo) goto out_neigh; sameseq = orig_ifinfo->last_real_seqno == ntohl(ogm_packet->seqno); similar_ttl = (orig_ifinfo->last_ttl - 3) <= ogm_packet->ttl; if (is_bidirect && (dup_status == BATADV_NO_DUP || (sameseq && similar_ttl))) { batadv_iv_ogm_orig_update(bat_priv, orig_node, orig_ifinfo, ethhdr, ogm_packet, if_incoming, if_outgoing, dup_status); } batadv_orig_ifinfo_put(orig_ifinfo); /* only forward for specific interface, not for the default one. */ if (if_outgoing == BATADV_IF_DEFAULT) goto out_neigh; /* is single hop (direct) neighbor */ if (is_single_hop_neigh) { /* OGMs from secondary interfaces should only scheduled once * per interface where it has been received, not multiple times */ if (ogm_packet->ttl <= 2 && if_incoming != if_outgoing) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM from secondary interface and wrong outgoing interface\n"); goto out_neigh; } /* mark direct link on incoming interface */ batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet, is_single_hop_neigh, is_from_best_next_hop, if_incoming, if_outgoing); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: rebroadcast neighbor packet with direct link flag\n"); goto out_neigh; } /* multihop originator */ if (!is_bidirect) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: not received via bidirectional link\n"); goto out_neigh; } if (dup_status == BATADV_NEIGH_DUP) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: duplicate packet received\n"); goto out_neigh; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: rebroadcast originator packet\n"); batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet, is_single_hop_neigh, is_from_best_next_hop, if_incoming, if_outgoing); out_neigh: if (orig_neigh_node && !is_single_hop_neigh) batadv_orig_node_put(orig_neigh_node); out: batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_node_put(router); batadv_neigh_node_put(router_router); batadv_neigh_node_put(orig_neigh_router); batadv_hardif_neigh_put(hardif_neigh); consume_skb(skb_priv); } /** * batadv_iv_ogm_process_reply() - Check OGM for direct reply and process it * @ogm_packet: rebroadcast OGM packet to process * @if_incoming: the interface where this packet was received * @orig_node: originator which reproadcasted the OGMs * @if_incoming_seqno: OGM sequence number when rebroadcast was received */ static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet, struct batadv_hard_iface *if_incoming, struct batadv_orig_node *orig_node, u32 if_incoming_seqno) { struct batadv_orig_ifinfo *orig_ifinfo; s32 bit_pos; u8 *weight; /* neighbor has to indicate direct link and it has to * come via the corresponding interface */ if (!(ogm_packet->flags & BATADV_DIRECTLINK)) return; if (!batadv_compare_eth(if_incoming->net_dev->dev_addr, ogm_packet->orig)) return; orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_incoming); if (!orig_ifinfo) return; /* save packet seqno for bidirectional check */ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); bit_pos = if_incoming_seqno - 2; bit_pos -= ntohl(ogm_packet->seqno); batadv_set_bit(orig_ifinfo->bat_iv.bcast_own, bit_pos); weight = &orig_ifinfo->bat_iv.bcast_own_sum; *weight = bitmap_weight(orig_ifinfo->bat_iv.bcast_own, BATADV_TQ_LOCAL_WINDOW_SIZE); spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_ifinfo_put(orig_ifinfo); } /** * batadv_iv_ogm_process() - process an incoming batman iv OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) * @if_incoming: the interface where this packet was received */ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_orig_node *orig_neigh_node, *orig_node; struct batadv_hard_iface *hard_iface; struct batadv_ogm_packet *ogm_packet; u32 if_incoming_seqno; bool has_directlink_flag; struct ethhdr *ethhdr; bool is_my_oldorig = false; bool is_my_addr = false; bool is_my_orig = false; ogm_packet = (struct batadv_ogm_packet *)(skb->data + ogm_offset); ethhdr = eth_hdr(skb); /* Silently drop when the batman packet is actually not a * correct packet. * * This might happen if a packet is padded (e.g. Ethernet has a * minimum frame length of 64 byte) and the aggregation interprets * it as an additional length. * * TODO: A more sane solution would be to have a bit in the * batadv_ogm_packet to detect whether the packet is the last * packet in an aggregation. Here we expect that the padding * is always zero (or not 0x01) */ if (ogm_packet->packet_type != BATADV_IV_OGM) return; /* could be changed by schedule_own_packet() */ if_incoming_seqno = atomic_read(&if_incoming->bat_iv.ogm_seqno); if (ogm_packet->flags & BATADV_DIRECTLINK) has_directlink_flag = true; else has_directlink_flag = false; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Received BATMAN packet via NB: %pM, IF: %s [%pM] (from OG: %pM, via prev OG: %pM, seqno %u, tq %d, TTL %d, V %d, IDF %d)\n", ethhdr->h_source, if_incoming->net_dev->name, if_incoming->net_dev->dev_addr, ogm_packet->orig, ogm_packet->prev_sender, ntohl(ogm_packet->seqno), ogm_packet->tq, ogm_packet->ttl, ogm_packet->version, has_directlink_flag); rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->soft_iface != if_incoming->soft_iface) continue; if (batadv_compare_eth(ethhdr->h_source, hard_iface->net_dev->dev_addr)) is_my_addr = true; if (batadv_compare_eth(ogm_packet->orig, hard_iface->net_dev->dev_addr)) is_my_orig = true; if (batadv_compare_eth(ogm_packet->prev_sender, hard_iface->net_dev->dev_addr)) is_my_oldorig = true; } rcu_read_unlock(); if (is_my_addr) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: received my own broadcast (sender: %pM)\n", ethhdr->h_source); return; } if (is_my_orig) { orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) return; batadv_iv_ogm_process_reply(ogm_packet, if_incoming, orig_neigh_node, if_incoming_seqno); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet from myself (via neighbor)\n"); batadv_orig_node_put(orig_neigh_node); return; } if (is_my_oldorig) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: ignoring all rebroadcast echos (sender: %pM)\n", ethhdr->h_source); return; } if (ogm_packet->flags & BATADV_NOT_BEST_NEXT_HOP) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: ignoring all packets not forwarded from the best next hop (sender: %pM)\n", ethhdr->h_source); return; } orig_node = batadv_iv_ogm_orig_get(bat_priv, ogm_packet->orig); if (!orig_node) return; batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node, if_incoming, BATADV_IF_DEFAULT); rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) continue; batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node, if_incoming, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); batadv_orig_node_put(orig_node); } static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_forw_packet *forw_packet; struct batadv_priv *bat_priv; bool dropped = false; delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) { dropped = true; goto out; } batadv_iv_ogm_emit(forw_packet); /* we have to have at least one packet in the queue to determine the * queues wake up time unless we are shutting down. * * only re-schedule if this is the "original" copy, e.g. the OGM of the * primary interface should only be rescheduled once per period, but * this function will be called for the forw_packet instances of the * other secondary interfaces as well. */ if (forw_packet->own && forw_packet->if_incoming == forw_packet->if_outgoing) batadv_iv_ogm_schedule(forw_packet->if_incoming); out: /* do we get something for free()? */ if (batadv_forw_packet_steal(forw_packet, &bat_priv->forw_bat_list_lock)) batadv_forw_packet_free(forw_packet, dropped); } static int batadv_iv_ogm_receive(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_ogm_packet *ogm_packet; u8 *packet_pos; int ogm_offset; bool res; int ret = NET_RX_DROP; res = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN); if (!res) goto free_skb; /* did we receive a B.A.T.M.A.N. IV OGM packet on an interface * that does not have B.A.T.M.A.N. IV enabled ? */ if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable) goto free_skb; batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, skb->len + ETH_HLEN); ogm_offset = 0; ogm_packet = (struct batadv_ogm_packet *)skb->data; /* unpack the aggregated packets and process them one by one */ while (batadv_iv_ogm_aggr_packet(ogm_offset, skb_headlen(skb), ogm_packet)) { batadv_iv_ogm_process(skb, ogm_offset, if_incoming); ogm_offset += BATADV_OGM_HLEN; ogm_offset += ntohs(ogm_packet->tvlv_len); packet_pos = skb->data + ogm_offset; ogm_packet = (struct batadv_ogm_packet *)packet_pos; } ret = NET_RX_SUCCESS; free_skb: if (ret == NET_RX_SUCCESS) consume_skb(skb); else kfree_skb(skb); return ret; } /** * batadv_iv_ogm_neigh_get_tq_avg() - Get the TQ average for a neighbour on a * given outgoing interface. * @neigh_node: Neighbour of interest * @if_outgoing: Outgoing interface of interest * @tq_avg: Pointer of where to store the TQ average * * Return: False if no average TQ available, otherwise true. */ static bool batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_outgoing, u8 *tq_avg) { struct batadv_neigh_ifinfo *n_ifinfo; n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); if (!n_ifinfo) return false; *tq_avg = n_ifinfo->bat_iv.tq_avg; batadv_neigh_ifinfo_put(n_ifinfo); return true; } /** * batadv_iv_ogm_orig_dump_subentry() - Dump an originator subentry into a * message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @neigh_node: Single hops neighbour * @best: Is the best originator * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, bool best) { void *hdr; u8 tq_avg; unsigned int last_seen_msecs; last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node, if_outgoing, &tq_avg)) return 0; if (if_outgoing != BATADV_IF_DEFAULT && if_outgoing != neigh_node->if_incoming) return 0; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_iv_ogm_orig_dump_entry() - Dump an originator entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @sub_s: Number of sub entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, int *sub_s) { struct batadv_neigh_node *neigh_node_best; struct batadv_neigh_node *neigh_node; int sub = 0; bool best; u8 tq_avg_best; neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); if (!neigh_node_best) goto out; if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node_best, if_outgoing, &tq_avg_best)) goto out; if (tq_avg_best == 0) goto out; hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { if (sub++ < *sub_s) continue; best = (neigh_node == neigh_node_best); if (batadv_iv_ogm_orig_dump_subentry(msg, portid, seq, bat_priv, if_outgoing, orig_node, neigh_node, best)) { batadv_neigh_node_put(neigh_node_best); *sub_s = sub - 1; return -EMSGSIZE; } } out: batadv_neigh_node_put(neigh_node_best); *sub_s = 0; return 0; } /** * batadv_iv_ogm_orig_dump_bucket() - Dump an originator bucket into a * message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @head: Bucket to be dumped * @idx_s: Number of entries to be skipped * @sub: Number of sub entries to be skipped * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct hlist_head *head, int *idx_s, int *sub) { struct batadv_orig_node *orig_node; int idx = 0; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { if (idx++ < *idx_s) continue; if (batadv_iv_ogm_orig_dump_entry(msg, portid, seq, bat_priv, if_outgoing, orig_node, sub)) { rcu_read_unlock(); *idx_s = idx - 1; return -EMSGSIZE; } } rcu_read_unlock(); *idx_s = 0; *sub = 0; return 0; } /** * batadv_iv_ogm_orig_dump() - Dump the originators into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface */ static void batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; int bucket = cb->args[0]; int idx = cb->args[1]; int sub = cb->args[2]; int portid = NETLINK_CB(cb->skb).portid; while (bucket < hash->size) { head = &hash->table[bucket]; if (batadv_iv_ogm_orig_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, bat_priv, if_outgoing, head, &idx, &sub)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; cb->args[2] = sub; } /** * batadv_iv_ogm_neigh_diff() - calculate tq difference of two neighbors * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * @diff: pointer to integer receiving the calculated difference * * The content of *@diff is only valid when this function returns true. * It is less, equal to or greater than 0 if the metric via neigh1 is lower, * the same as or higher than the metric via neigh2 * * Return: true when the difference could be calculated, false otherwise */ static bool batadv_iv_ogm_neigh_diff(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2, int *diff) { struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; u8 tq1, tq2; bool ret = true; neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!neigh1_ifinfo || !neigh2_ifinfo) { ret = false; goto out; } tq1 = neigh1_ifinfo->bat_iv.tq_avg; tq2 = neigh2_ifinfo->bat_iv.tq_avg; *diff = (int)tq1 - (int)tq2; out: batadv_neigh_ifinfo_put(neigh1_ifinfo); batadv_neigh_ifinfo_put(neigh2_ifinfo); return ret; } /** * batadv_iv_ogm_neigh_dump_neigh() - Dump a neighbour into a netlink message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @hardif_neigh: Neighbour to be dumped * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_hardif_neigh_node *hardif_neigh) { void *hdr; unsigned int last_seen_msecs; last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_iv_ogm_neigh_dump_hardif() - Dump the neighbours of a hard interface * into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @hard_iface: Hard interface to dump the neighbours for * @idx_s: Number of entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface, int *idx_s) { struct batadv_hardif_neigh_node *hardif_neigh; int idx = 0; hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) { if (idx++ < *idx_s) continue; if (batadv_iv_ogm_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) { *idx_s = idx - 1; return -EMSGSIZE; } } *idx_s = 0; return 0; } /** * batadv_iv_ogm_neigh_dump() - Dump the neighbours into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @single_hardif: Limit dump to this hard interface */ static void batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *single_hardif) { struct batadv_hard_iface *hard_iface; int i_hardif = 0; int i_hardif_s = cb->args[0]; int idx = cb->args[1]; int portid = NETLINK_CB(cb->skb).portid; rcu_read_lock(); if (single_hardif) { if (i_hardif_s == 0) { if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, single_hardif, &idx) == 0) i_hardif++; } } else { list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (i_hardif++ < i_hardif_s) continue; if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, hard_iface, &idx)) { i_hardif--; break; } } } rcu_read_unlock(); cb->args[0] = i_hardif; cb->args[1] = idx; } /** * batadv_iv_ogm_neigh_cmp() - compare the metrics of two neighbors * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * * Return: a value less, equal to or greater than 0 if the metric via neigh1 is * lower, the same as or higher than the metric via neigh2 */ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { bool ret; int diff; ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2, if_outgoing2, &diff); if (!ret) return 0; return diff; } /** * batadv_iv_ogm_neigh_is_sob() - check if neigh1 is similarly good or better * than neigh2 from the metric prospective * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * * Return: true if the metric via neigh1 is equally good or better than * the metric via neigh2, false otherwise. */ static bool batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { bool ret; int diff; ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2, if_outgoing2, &diff); if (!ret) return false; ret = diff > -BATADV_TQ_SIMILARITY_THRESHOLD; return ret; } static void batadv_iv_iface_enabled(struct batadv_hard_iface *hard_iface) { /* begin scheduling originator messages on that interface */ batadv_iv_ogm_schedule(hard_iface); } /** * batadv_iv_init_sel_class() - initialize GW selection class * @bat_priv: the bat priv with all the soft interface information */ static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv) { /* set default TQ difference threshold to 20 */ atomic_set(&bat_priv->gw.sel_class, 20); } static struct batadv_gw_node * batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) { struct batadv_neigh_node *router; struct batadv_neigh_ifinfo *router_ifinfo; struct batadv_gw_node *gw_node, *curr_gw = NULL; u64 max_gw_factor = 0; u64 tmp_gw_factor = 0; u8 max_tq = 0; u8 tq_avg; struct batadv_orig_node *orig_node; rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { orig_node = gw_node->orig_node; router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router) continue; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto next; if (!kref_get_unless_zero(&gw_node->refcount)) goto next; tq_avg = router_ifinfo->bat_iv.tq_avg; switch (atomic_read(&bat_priv->gw.sel_class)) { case 1: /* fast connection */ tmp_gw_factor = tq_avg * tq_avg; tmp_gw_factor *= gw_node->bandwidth_down; tmp_gw_factor *= 100 * 100; tmp_gw_factor >>= 18; if (tmp_gw_factor > max_gw_factor || (tmp_gw_factor == max_gw_factor && tq_avg > max_tq)) { batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } break; default: /* 2: stable connection (use best statistic) * 3: fast-switch (use best statistic but change as * soon as a better gateway appears) * XX: late-switch (use best statistic but change as * soon as a better gateway appears which has * $routing_class more tq points) */ if (tq_avg > max_tq) { batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } break; } if (tq_avg > max_tq) max_tq = tq_avg; if (tmp_gw_factor > max_gw_factor) max_gw_factor = tmp_gw_factor; batadv_gw_node_put(gw_node); next: batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(router_ifinfo); } rcu_read_unlock(); return curr_gw; } static bool batadv_iv_gw_is_eligible(struct batadv_priv *bat_priv, struct batadv_orig_node *curr_gw_orig, struct batadv_orig_node *orig_node) { struct batadv_neigh_ifinfo *router_orig_ifinfo = NULL; struct batadv_neigh_ifinfo *router_gw_ifinfo = NULL; struct batadv_neigh_node *router_gw = NULL; struct batadv_neigh_node *router_orig = NULL; u8 gw_tq_avg, orig_tq_avg; bool ret = false; /* dynamic re-election is performed only on fast or late switch */ if (atomic_read(&bat_priv->gw.sel_class) <= 2) return false; router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT); if (!router_gw) { ret = true; goto out; } router_gw_ifinfo = batadv_neigh_ifinfo_get(router_gw, BATADV_IF_DEFAULT); if (!router_gw_ifinfo) { ret = true; goto out; } router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router_orig) goto out; router_orig_ifinfo = batadv_neigh_ifinfo_get(router_orig, BATADV_IF_DEFAULT); if (!router_orig_ifinfo) goto out; gw_tq_avg = router_gw_ifinfo->bat_iv.tq_avg; orig_tq_avg = router_orig_ifinfo->bat_iv.tq_avg; /* the TQ value has to be better */ if (orig_tq_avg < gw_tq_avg) goto out; /* if the routing class is greater than 3 the value tells us how much * greater the TQ value of the new gateway must be */ if ((atomic_read(&bat_priv->gw.sel_class) > 3) && (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class))) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Restarting gateway selection: better gateway found (tq curr: %i, tq new: %i)\n", gw_tq_avg, orig_tq_avg); ret = true; out: batadv_neigh_ifinfo_put(router_gw_ifinfo); batadv_neigh_ifinfo_put(router_orig_ifinfo); batadv_neigh_node_put(router_gw); batadv_neigh_node_put(router_orig); return ret; } /** * batadv_iv_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; struct batadv_gw_node *curr_gw = NULL; int ret = 0; void *hdr; router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); if (!router) goto out; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto out; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); ret = -EMSGSIZE; if (curr_gw == gw_node) if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, gw_node->orig_node->orig) || nla_put_u8(msg, BATADV_ATTR_TQ, router_ifinfo->bat_iv.tq_avg) || nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, router->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, router->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, router->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, gw_node->bandwidth_up)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: batadv_gw_node_put(curr_gw); batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_node_put(router); return ret; } /** * batadv_iv_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information */ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv) { int portid = NETLINK_CB(cb->skb).portid; struct batadv_gw_node *gw_node; int idx_skip = cb->args[0]; int idx = 0; spin_lock_bh(&bat_priv->gw.list_lock); cb->seq = bat_priv->gw.generation << 1 | 1; hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; if (batadv_iv_gw_dump_entry(msg, portid, cb, bat_priv, gw_node)) { idx_skip = idx - 1; goto unlock; } } idx_skip = idx; unlock: spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .name = "BATMAN_IV", .iface = { .enable = batadv_iv_ogm_iface_enable, .enabled = batadv_iv_iface_enabled, .disable = batadv_iv_ogm_iface_disable, .update_mac = batadv_iv_ogm_iface_update_mac, .primary_set = batadv_iv_ogm_primary_iface_set, }, .neigh = { .cmp = batadv_iv_ogm_neigh_cmp, .is_similar_or_better = batadv_iv_ogm_neigh_is_sob, .dump = batadv_iv_ogm_neigh_dump, }, .orig = { .dump = batadv_iv_ogm_orig_dump, }, .gw = { .init_sel_class = batadv_iv_init_sel_class, .sel_class_max = BATADV_TQ_MAX_VALUE, .get_best_gw_node = batadv_iv_gw_get_best_gw_node, .is_eligible = batadv_iv_gw_is_eligible, .dump = batadv_iv_gw_dump, }, }; /** * batadv_iv_init() - B.A.T.M.A.N. IV initialization function * * Return: 0 on success or negative error number in case of failure */ int __init batadv_iv_init(void) { int ret; /* batman originator packet */ ret = batadv_recv_handler_register(BATADV_IV_OGM, batadv_iv_ogm_receive); if (ret < 0) goto out; ret = batadv_algo_register(&batadv_batman_iv); if (ret < 0) goto handler_unregister; goto out; handler_unregister: batadv_recv_handler_unregister(BATADV_IV_OGM); out: return ret; }
2 735 87 342 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM timestamp #if !defined(_TRACE_TIMESTAMP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TIMESTAMP_H #include <linux/tracepoint.h> #include <linux/fs.h> #define CTIME_QUERIED_FLAGS \ { I_CTIME_QUERIED, "Q" } DECLARE_EVENT_CLASS(ctime, TP_PROTO(struct inode *inode, struct timespec64 *ctime), TP_ARGS(inode, ctime), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(time64_t, ctime_s) __field(u32, ctime_ns) __field(u32, gen) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->gen = inode->i_generation; __entry->ctime_s = ctime->tv_sec; __entry->ctime_ns = ctime->tv_nsec; ), TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->ctime_s, __entry->ctime_ns ) ); DEFINE_EVENT(ctime, inode_set_ctime_to_ts, TP_PROTO(struct inode *inode, struct timespec64 *ctime), TP_ARGS(inode, ctime)); DEFINE_EVENT(ctime, ctime_xchg_skip, TP_PROTO(struct inode *inode, struct timespec64 *ctime), TP_ARGS(inode, ctime)); TRACE_EVENT(ctime_ns_xchg, TP_PROTO(struct inode *inode, u32 old, u32 new, u32 cur), TP_ARGS(inode, old, new, cur), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(u32, gen) __field(u32, old) __field(u32, new) __field(u32, cur) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->gen = inode->i_generation; __entry->old = old; __entry->new = new; __entry->cur = cur; ), TP_printk("ino=%d:%d:%ld:%u old=%u:%s new=%u cur=%u:%s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->old & ~I_CTIME_QUERIED, __print_flags(__entry->old & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS), __entry->new, __entry->cur & ~I_CTIME_QUERIED, __print_flags(__entry->cur & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS) ) ); TRACE_EVENT(fill_mg_cmtime, TP_PROTO(struct inode *inode, struct timespec64 *ctime, struct timespec64 *mtime), TP_ARGS(inode, ctime, mtime), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(time64_t, ctime_s) __field(time64_t, mtime_s) __field(u32, ctime_ns) __field(u32, mtime_ns) __field(u32, gen) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->gen = inode->i_generation; __entry->ctime_s = ctime->tv_sec; __entry->mtime_s = mtime->tv_sec; __entry->ctime_ns = ctime->tv_nsec; __entry->mtime_ns = mtime->tv_nsec; ), TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u mtime=%lld.%u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->ctime_s, __entry->ctime_ns, __entry->mtime_s, __entry->mtime_ns ) ); #endif /* _TRACE_TIMESTAMP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
7 628 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 // SPDX-License-Identifier: GPL-2.0 /* * trace_export.c - export basic ftrace utilities to user space * * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com> */ #include <linux/stringify.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/module.h> #include <linux/init.h> #include "trace_output.h" /* Stub function for events with triggers */ static int ftrace_event_register(struct trace_event_call *call, enum trace_reg type, void *data) { return 0; } #undef TRACE_SYSTEM #define TRACE_SYSTEM ftrace /* * The FTRACE_ENTRY_REG macro allows ftrace entry to define register * function and thus become accessible via perf. */ #undef FTRACE_ENTRY_REG #define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) /* not needed for this file */ #undef __field_struct #define __field_struct(type, item) #undef __field #define __field(type, item) type item; #undef __field_fn #define __field_fn(type, item) type item; #undef __field_desc #define __field_desc(type, container, item) type item; #undef __field_packed #define __field_packed(type, container, item) type item; #undef __array #define __array(type, item, size) type item[size]; #undef __stack_array #define __stack_array(type, item, size, field) __array(type, item, size) #undef __array_desc #define __array_desc(type, container, item, size) type item[size]; #undef __dynamic_array #define __dynamic_array(type, item) type item[]; #undef F_STRUCT #define F_STRUCT(args...) args #undef F_printk #define F_printk(fmt, args...) fmt, args #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ struct ____ftrace_##name { \ tstruct \ }; \ static void __always_unused ____ftrace_check_##name(void) \ { \ struct ____ftrace_##name *__entry = NULL; \ \ /* force compile-time check on F_printk() */ \ printk(print); \ } #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" #undef __field_ext #define __field_ext(_type, _item, _filter_type) { \ .type = #_type, .name = #_item, \ .size = sizeof(_type), .align = __alignof__(_type), \ is_signed_type(_type), .filter_type = _filter_type }, #undef __field_ext_packed #define __field_ext_packed(_type, _item, _filter_type) { \ .type = #_type, .name = #_item, \ .size = sizeof(_type), .align = 1, \ is_signed_type(_type), .filter_type = _filter_type }, #undef __field #define __field(_type, _item) __field_ext(_type, _item, FILTER_OTHER) #undef __field_fn #define __field_fn(_type, _item) __field_ext(_type, _item, FILTER_TRACE_FN) #undef __field_desc #define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER) #undef __field_packed #define __field_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER) #undef __array #define __array(_type, _item, _len) { \ .type = #_type"["__stringify(_len)"]", .name = #_item, \ .size = sizeof(_type[_len]), .align = __alignof__(_type), \ is_signed_type(_type), .filter_type = FILTER_OTHER, \ .len = _len }, #undef __stack_array #define __stack_array(_type, _item, _len, _field) __array(_type, _item, _len) #undef __array_desc #define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len) #undef __dynamic_array #define __dynamic_array(_type, _item) { \ .type = #_type "[]", .name = #_item, \ .size = 0, .align = __alignof__(_type), \ is_signed_type(_type), .filter_type = FILTER_OTHER }, #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ static struct trace_event_fields ftrace_event_fields_##name[] = { \ tstruct \ {} }; #include "trace_entries.h" #undef __entry #define __entry REC #undef __field #define __field(type, item) #undef __field_fn #define __field_fn(type, item) #undef __field_desc #define __field_desc(type, container, item) #undef __field_packed #define __field_packed(type, container, item) #undef __array #define __array(type, item, len) #undef __stack_array #define __stack_array(type, item, len, field) #undef __array_desc #define __array_desc(type, container, item, len) #undef __dynamic_array #define __dynamic_array(type, item) #undef F_printk #define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args) #undef FTRACE_ENTRY_REG #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn) \ static struct trace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .fields_array = ftrace_event_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ .reg = regfn, \ }; \ \ struct trace_event_call __used event_##call = { \ .class = &event_class_ftrace_##call, \ { \ .name = #call, \ }, \ .event.type = etype, \ .print_fmt = print, \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ static struct trace_event_call __used \ __section("_ftrace_events") *__event_##call = &event_##call; #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ FTRACE_ENTRY_REG(call, struct_name, etype, \ PARAMS(tstruct), PARAMS(print), NULL) bool ftrace_event_is_function(struct trace_event_call *call) { return call == &event_function; } #include "trace_entries.h"
567 566 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 // SPDX-License-Identifier: GPL-2.0 /* * property.c - Unified device property interface. * * Copyright (C) 2014, Intel Corporation * Authors: Rafael J. Wysocki <rafael.j.wysocki@intel.com> * Mika Westerberg <mika.westerberg@linux.intel.com> */ #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kconfig.h> #include <linux/of.h> #include <linux/property.h> #include <linux/phy.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> struct fwnode_handle *__dev_fwnode(struct device *dev) { return IS_ENABLED(CONFIG_OF) && dev->of_node ? of_fwnode_handle(dev->of_node) : dev->fwnode; } EXPORT_SYMBOL_GPL(__dev_fwnode); const struct fwnode_handle *__dev_fwnode_const(const struct device *dev) { return IS_ENABLED(CONFIG_OF) && dev->of_node ? of_fwnode_handle(dev->of_node) : dev->fwnode; } EXPORT_SYMBOL_GPL(__dev_fwnode_const); /** * device_property_present - check if a property of a device is present * @dev: Device whose property is being checked * @propname: Name of the property * * Check if property @propname is present in the device firmware description. * * Return: true if property @propname is present. Otherwise, returns false. */ bool device_property_present(const struct device *dev, const char *propname) { return fwnode_property_present(dev_fwnode(dev), propname); } EXPORT_SYMBOL_GPL(device_property_present); /** * fwnode_property_present - check if a property of a firmware node is present * @fwnode: Firmware node whose property to check * @propname: Name of the property * * Return: true if property @propname is present. Otherwise, returns false. */ bool fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname) { bool ret; if (IS_ERR_OR_NULL(fwnode)) return false; ret = fwnode_call_bool_op(fwnode, property_present, propname); if (ret) return ret; return fwnode_call_bool_op(fwnode->secondary, property_present, propname); } EXPORT_SYMBOL_GPL(fwnode_property_present); /** * device_property_read_u8_array - return a u8 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u8 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u8() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u8_array(const struct device *dev, const char *propname, u8 *val, size_t nval) { return fwnode_property_read_u8_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u8_array); /** * device_property_read_u16_array - return a u16 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u16 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u16() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u16_array(const struct device *dev, const char *propname, u16 *val, size_t nval) { return fwnode_property_read_u16_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u16_array); /** * device_property_read_u32_array - return a u32 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u32 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u32() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u32_array(const struct device *dev, const char *propname, u32 *val, size_t nval) { return fwnode_property_read_u32_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u32_array); /** * device_property_read_u64_array - return a u64 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u64 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u64() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u64_array(const struct device *dev, const char *propname, u64 *val, size_t nval) { return fwnode_property_read_u64_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u64_array); /** * device_property_read_string_array - return a string array property of device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of string properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_string_array_count() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values read on success if @val is non-NULL, * number of values available on success if @val is NULL, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not an array of strings, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_string_array(const struct device *dev, const char *propname, const char **val, size_t nval) { return fwnode_property_read_string_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_string_array); /** * device_property_read_string - return a string property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The value is stored here * * Function reads property @propname from the device firmware description and * stores the value into @val if found. The value is checked to be a string. * * Return: %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property type is not a string. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_string(const struct device *dev, const char *propname, const char **val) { return fwnode_property_read_string(dev_fwnode(dev), propname, val); } EXPORT_SYMBOL_GPL(device_property_read_string); /** * device_property_match_string - find a string in an array and return index * @dev: Device to get the property of * @propname: Name of the property holding the array * @string: String to look for * * Find a given string in a string array and if it is found return the * index back. * * Return: index, starting from %0, if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of strings, * %-ENXIO if no suitable firmware interface is present. */ int device_property_match_string(const struct device *dev, const char *propname, const char *string) { return fwnode_property_match_string(dev_fwnode(dev), propname, string); } EXPORT_SYMBOL_GPL(device_property_match_string); static int fwnode_property_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -EINVAL; ret = fwnode_call_int_op(fwnode, property_read_int_array, propname, elem_size, val, nval); if (ret != -EINVAL) return ret; return fwnode_call_int_op(fwnode->secondary, property_read_int_array, propname, elem_size, val, nval); } /** * fwnode_property_read_u8_array - return a u8 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u8 properties with @propname from @fwnode and stores them to * @val if found. * * It's recommended to call fwnode_property_count_u8() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u8_array(const struct fwnode_handle *fwnode, const char *propname, u8 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u8), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u8_array); /** * fwnode_property_read_u16_array - return a u16 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u16 properties with @propname from @fwnode and store them to * @val if found. * * It's recommended to call fwnode_property_count_u16() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u16_array(const struct fwnode_handle *fwnode, const char *propname, u16 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u16), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u16_array); /** * fwnode_property_read_u32_array - return a u32 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u32 properties with @propname from @fwnode store them to * @val if found. * * It's recommended to call fwnode_property_count_u32() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u32_array(const struct fwnode_handle *fwnode, const char *propname, u32 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u32), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u32_array); /** * fwnode_property_read_u64_array - return a u64 array property firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u64 properties with @propname from @fwnode and store them to * @val if found. * * It's recommended to call fwnode_property_count_u64() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u64_array(const struct fwnode_handle *fwnode, const char *propname, u64 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u64), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u64_array); /** * fwnode_property_read_string_array - return string array property of a node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an string list property @propname from the given firmware node and store * them to @val if found. * * It's recommended to call fwnode_property_string_array_count() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values read on success if @val is non-NULL, * number of values available on success if @val is NULL, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not an array of strings, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -EINVAL; ret = fwnode_call_int_op(fwnode, property_read_string_array, propname, val, nval); if (ret != -EINVAL) return ret; return fwnode_call_int_op(fwnode->secondary, property_read_string_array, propname, val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_string_array); /** * fwnode_property_read_string - return a string property of a firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The value is stored here * * Read property @propname from the given firmware node and store the value into * @val if found. The value is checked to be a string. * * Return: %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not a string, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_string(const struct fwnode_handle *fwnode, const char *propname, const char **val) { int ret = fwnode_property_read_string_array(fwnode, propname, val, 1); return ret < 0 ? ret : 0; } EXPORT_SYMBOL_GPL(fwnode_property_read_string); /** * fwnode_property_match_string - find a string in an array and return index * @fwnode: Firmware node to get the property of * @propname: Name of the property holding the array * @string: String to look for * * Find a given string in a string array and if it is found return the * index back. * * Return: index, starting from %0, if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of strings, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_match_string(const struct fwnode_handle *fwnode, const char *propname, const char *string) { const char **values; int nval, ret; nval = fwnode_property_string_array_count(fwnode, propname); if (nval < 0) return nval; if (nval == 0) return -ENODATA; values = kcalloc(nval, sizeof(*values), GFP_KERNEL); if (!values) return -ENOMEM; ret = fwnode_property_read_string_array(fwnode, propname, values, nval); if (ret < 0) goto out_free; ret = match_string(values, nval, string); if (ret < 0) ret = -ENODATA; out_free: kfree(values); return ret; } EXPORT_SYMBOL_GPL(fwnode_property_match_string); /** * fwnode_property_match_property_string - find a property string value in an array and return index * @fwnode: Firmware node to get the property of * @propname: Name of the property holding the string value * @array: String array to search in * @n: Size of the @array * * Find a property string value in a given @array and if it is found return * the index back. * * Return: index, starting from %0, if the string value was found in the @array (success), * %-ENOENT when the string value was not found in the @array, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not a string, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_match_property_string(const struct fwnode_handle *fwnode, const char *propname, const char * const *array, size_t n) { const char *string; int ret; ret = fwnode_property_read_string(fwnode, propname, &string); if (ret) return ret; ret = match_string(array, n, string); if (ret < 0) ret = -ENOENT; return ret; } EXPORT_SYMBOL_GPL(fwnode_property_match_property_string); /** * fwnode_property_get_reference_args() - Find a reference with arguments * @fwnode: Firmware node where to look for the reference * @prop: The name of the property * @nargs_prop: The name of the property telling the number of * arguments in the referred node. NULL if @nargs is known, * otherwise @nargs is ignored. Only relevant on OF. * @nargs: Number of arguments. Ignored if @nargs_prop is non-NULL. * @index: Index of the reference, from zero onwards. * @args: Result structure with reference and integer arguments. * May be NULL. * * Obtain a reference based on a named property in an fwnode, with * integer arguments. * * The caller is responsible for calling fwnode_handle_put() on the returned * @args->fwnode pointer. * * Return: %0 on success * %-ENOENT when the index is out of bounds, the index has an empty * reference or the property was not found * %-EINVAL on parse error */ int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode, const char *prop, const char *nargs_prop, unsigned int nargs, unsigned int index, struct fwnode_reference_args *args) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -ENOENT; ret = fwnode_call_int_op(fwnode, get_reference_args, prop, nargs_prop, nargs, index, args); if (ret == 0) return ret; if (IS_ERR_OR_NULL(fwnode->secondary)) return ret; return fwnode_call_int_op(fwnode->secondary, get_reference_args, prop, nargs_prop, nargs, index, args); } EXPORT_SYMBOL_GPL(fwnode_property_get_reference_args); /** * fwnode_find_reference - Find named reference to a fwnode_handle * @fwnode: Firmware node where to look for the reference * @name: The name of the reference * @index: Index of the reference * * @index can be used when the named reference holds a table of references. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: a pointer to the reference fwnode, when found. Otherwise, * returns an error pointer. */ struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode, const char *name, unsigned int index) { struct fwnode_reference_args args; int ret; ret = fwnode_property_get_reference_args(fwnode, name, NULL, 0, index, &args); return ret ? ERR_PTR(ret) : args.fwnode; } EXPORT_SYMBOL_GPL(fwnode_find_reference); /** * fwnode_get_name - Return the name of a node * @fwnode: The firmware node * * Return: a pointer to the node name, or %NULL. */ const char *fwnode_get_name(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_name); } EXPORT_SYMBOL_GPL(fwnode_get_name); /** * fwnode_get_name_prefix - Return the prefix of node for printing purposes * @fwnode: The firmware node * * Return: the prefix of a node, intended to be printed right before the node. * The prefix works also as a separator between the nodes. */ const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_name_prefix); } /** * fwnode_name_eq - Return true if node name is equal * @fwnode: The firmware node * @name: The name to which to compare the node name * * Compare the name provided as an argument to the name of the node, stopping * the comparison at either NUL or '@' character, whichever comes first. This * function is generally used for comparing node names while ignoring the * possible unit address of the node. * * Return: true if the node name matches with the name provided in the @name * argument, false otherwise. */ bool fwnode_name_eq(const struct fwnode_handle *fwnode, const char *name) { const char *node_name; ptrdiff_t len; node_name = fwnode_get_name(fwnode); if (!node_name) return false; len = strchrnul(node_name, '@') - node_name; return str_has_prefix(node_name, name) == len; } EXPORT_SYMBOL_GPL(fwnode_name_eq); /** * fwnode_get_parent - Return parent firwmare node * @fwnode: Firmware whose parent is retrieved * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: parent firmware node of the given node if possible or %NULL if no * parent was available. */ struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_parent); } EXPORT_SYMBOL_GPL(fwnode_get_parent); /** * fwnode_get_next_parent - Iterate to the node's parent * @fwnode: Firmware whose parent is retrieved * * This is like fwnode_get_parent() except that it drops the refcount * on the passed node, making it suitable for iterating through a * node's parents. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @fwnode * unconditionally. * * Return: parent firmware node of the given node if possible or %NULL if no * parent was available. */ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode) { struct fwnode_handle *parent = fwnode_get_parent(fwnode); fwnode_handle_put(fwnode); return parent; } EXPORT_SYMBOL_GPL(fwnode_get_next_parent); /** * fwnode_count_parents - Return the number of parents a node has * @fwnode: The node the parents of which are to be counted * * Return: the number of parents a node has. */ unsigned int fwnode_count_parents(const struct fwnode_handle *fwnode) { struct fwnode_handle *parent; unsigned int count = 0; fwnode_for_each_parent_node(fwnode, parent) count++; return count; } EXPORT_SYMBOL_GPL(fwnode_count_parents); /** * fwnode_get_nth_parent - Return an nth parent of a node * @fwnode: The node the parent of which is requested * @depth: Distance of the parent from the node * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the nth parent of a node. If there is no parent at the requested * @depth, %NULL is returned. If @depth is 0, the functionality is equivalent to * fwnode_handle_get(). For @depth == 1, it is fwnode_get_parent() and so on. */ struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwnode, unsigned int depth) { struct fwnode_handle *parent; if (depth == 0) return fwnode_handle_get(fwnode); fwnode_for_each_parent_node(fwnode, parent) { if (--depth == 0) return parent; } return NULL; } EXPORT_SYMBOL_GPL(fwnode_get_nth_parent); /** * fwnode_get_next_child_node - Return the next child node handle for a node * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the node's child nodes or a %NULL handle. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @child * unconditionally. */ struct fwnode_handle * fwnode_get_next_child_node(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { return fwnode_call_ptr_op(fwnode, get_next_child_node, child); } EXPORT_SYMBOL_GPL(fwnode_get_next_child_node); /** * fwnode_get_next_available_child_node - Return the next available child node handle for a node * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the node's child nodes or a %NULL handle. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @child * unconditionally. */ struct fwnode_handle * fwnode_get_next_available_child_node(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { struct fwnode_handle *next_child = child; if (IS_ERR_OR_NULL(fwnode)) return NULL; do { next_child = fwnode_get_next_child_node(fwnode, next_child); if (!next_child) return NULL; } while (!fwnode_device_is_available(next_child)); return next_child; } EXPORT_SYMBOL_GPL(fwnode_get_next_available_child_node); /** * device_get_next_child_node - Return the next child node handle for a device * @dev: Device to find the next child node for. * @child: Handle to one of the device's child nodes or a %NULL handle. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @child * unconditionally. */ struct fwnode_handle *device_get_next_child_node(const struct device *dev, struct fwnode_handle *child) { const struct fwnode_handle *fwnode = dev_fwnode(dev); struct fwnode_handle *next; if (IS_ERR_OR_NULL(fwnode)) return NULL; /* Try to find a child in primary fwnode */ next = fwnode_get_next_child_node(fwnode, child); if (next) return next; /* When no more children in primary, continue with secondary */ return fwnode_get_next_child_node(fwnode->secondary, child); } EXPORT_SYMBOL_GPL(device_get_next_child_node); /** * fwnode_get_named_child_node - Return first matching named child node handle * @fwnode: Firmware node to find the named child node for. * @childname: String to match child node name against. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { return fwnode_call_ptr_op(fwnode, get_named_child_node, childname); } EXPORT_SYMBOL_GPL(fwnode_get_named_child_node); /** * device_get_named_child_node - Return first matching named child node handle * @dev: Device to find the named child node for. * @childname: String to match child node name against. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle *device_get_named_child_node(const struct device *dev, const char *childname) { return fwnode_get_named_child_node(dev_fwnode(dev), childname); } EXPORT_SYMBOL_GPL(device_get_named_child_node); /** * fwnode_handle_get - Obtain a reference to a device node * @fwnode: Pointer to the device node to obtain the reference to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the fwnode handle. */ struct fwnode_handle *fwnode_handle_get(struct fwnode_handle *fwnode) { if (!fwnode_has_op(fwnode, get)) return fwnode; return fwnode_call_ptr_op(fwnode, get); } EXPORT_SYMBOL_GPL(fwnode_handle_get); /** * fwnode_device_is_available - check if a device is available for use * @fwnode: Pointer to the fwnode of the device. * * Return: true if device is available for use. Otherwise, returns false. * * For fwnode node types that don't implement the .device_is_available() * operation, this function returns true. */ bool fwnode_device_is_available(const struct fwnode_handle *fwnode) { if (IS_ERR_OR_NULL(fwnode)) return false; if (!fwnode_has_op(fwnode, device_is_available)) return true; return fwnode_call_bool_op(fwnode, device_is_available); } EXPORT_SYMBOL_GPL(fwnode_device_is_available); /** * device_get_child_node_count - return the number of child nodes for device * @dev: Device to count the child nodes for * * Return: the number of child nodes for a given device. */ unsigned int device_get_child_node_count(const struct device *dev) { struct fwnode_handle *child; unsigned int count = 0; device_for_each_child_node(dev, child) count++; return count; } EXPORT_SYMBOL_GPL(device_get_child_node_count); bool device_dma_supported(const struct device *dev) { return fwnode_call_bool_op(dev_fwnode(dev), device_dma_supported); } EXPORT_SYMBOL_GPL(device_dma_supported); enum dev_dma_attr device_get_dma_attr(const struct device *dev) { if (!fwnode_has_op(dev_fwnode(dev), device_get_dma_attr)) return DEV_DMA_NOT_SUPPORTED; return fwnode_call_int_op(dev_fwnode(dev), device_get_dma_attr); } EXPORT_SYMBOL_GPL(device_get_dma_attr); /** * fwnode_get_phy_mode - Get phy mode for given firmware node * @fwnode: Pointer to the given node * * The function gets phy interface string from property 'phy-mode' or * 'phy-connection-type', and return its index in phy_modes table, or errno in * error case. */ int fwnode_get_phy_mode(const struct fwnode_handle *fwnode) { const char *pm; int err, i; err = fwnode_property_read_string(fwnode, "phy-mode", &pm); if (err < 0) err = fwnode_property_read_string(fwnode, "phy-connection-type", &pm); if (err < 0) return err; for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++) if (!strcasecmp(pm, phy_modes(i))) return i; return -ENODEV; } EXPORT_SYMBOL_GPL(fwnode_get_phy_mode); /** * device_get_phy_mode - Get phy mode for given device * @dev: Pointer to the given device * * The function gets phy interface string from property 'phy-mode' or * 'phy-connection-type', and return its index in phy_modes table, or errno in * error case. */ int device_get_phy_mode(struct device *dev) { return fwnode_get_phy_mode(dev_fwnode(dev)); } EXPORT_SYMBOL_GPL(device_get_phy_mode); /** * fwnode_iomap - Maps the memory mapped IO for a given fwnode * @fwnode: Pointer to the firmware node * @index: Index of the IO range * * Return: a pointer to the mapped memory. */ void __iomem *fwnode_iomap(struct fwnode_handle *fwnode, int index) { return fwnode_call_ptr_op(fwnode, iomap, index); } EXPORT_SYMBOL(fwnode_iomap); /** * fwnode_irq_get - Get IRQ directly from a fwnode * @fwnode: Pointer to the firmware node * @index: Zero-based index of the IRQ * * Return: Linux IRQ number on success. Negative errno on failure. */ int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index) { int ret; ret = fwnode_call_int_op(fwnode, irq_get, index); /* We treat mapping errors as invalid case */ if (ret == 0) return -EINVAL; return ret; } EXPORT_SYMBOL(fwnode_irq_get); /** * fwnode_irq_get_byname - Get IRQ from a fwnode using its name * @fwnode: Pointer to the firmware node * @name: IRQ name * * Description: * Find a match to the string @name in the 'interrupt-names' string array * in _DSD for ACPI, or of_node for Device Tree. Then get the Linux IRQ * number of the IRQ resource corresponding to the index of the matched * string. * * Return: Linux IRQ number on success, or negative errno otherwise. */ int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name) { int index; if (!name) return -EINVAL; index = fwnode_property_match_string(fwnode, "interrupt-names", name); if (index < 0) return index; return fwnode_irq_get(fwnode, index); } EXPORT_SYMBOL(fwnode_irq_get_byname); /** * fwnode_graph_get_next_endpoint - Get next endpoint firmware node * @fwnode: Pointer to the parent firmware node * @prev: Previous endpoint node or %NULL to get the first * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @prev * unconditionally. * * Return: an endpoint firmware node pointer or %NULL if no more endpoints * are available. */ struct fwnode_handle * fwnode_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { struct fwnode_handle *ep, *port_parent = NULL; const struct fwnode_handle *parent; /* * If this function is in a loop and the previous iteration returned * an endpoint from fwnode->secondary, then we need to use the secondary * as parent rather than @fwnode. */ if (prev) { port_parent = fwnode_graph_get_port_parent(prev); parent = port_parent; } else { parent = fwnode; } if (IS_ERR_OR_NULL(parent)) return NULL; ep = fwnode_call_ptr_op(parent, graph_get_next_endpoint, prev); if (ep) goto out_put_port_parent; ep = fwnode_graph_get_next_endpoint(parent->secondary, NULL); out_put_port_parent: fwnode_handle_put(port_parent); return ep; } EXPORT_SYMBOL_GPL(fwnode_graph_get_next_endpoint); /** * fwnode_graph_get_port_parent - Return the device fwnode of a port endpoint * @endpoint: Endpoint firmware node of the port * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the firmware node of the device the @endpoint belongs to. */ struct fwnode_handle * fwnode_graph_get_port_parent(const struct fwnode_handle *endpoint) { struct fwnode_handle *port, *parent; port = fwnode_get_parent(endpoint); parent = fwnode_call_ptr_op(port, graph_get_port_parent); fwnode_handle_put(port); return parent; } EXPORT_SYMBOL_GPL(fwnode_graph_get_port_parent); /** * fwnode_graph_get_remote_port_parent - Return fwnode of a remote device * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote device the @fwnode points to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_graph_get_remote_port_parent(const struct fwnode_handle *fwnode) { struct fwnode_handle *endpoint, *parent; endpoint = fwnode_graph_get_remote_endpoint(fwnode); parent = fwnode_graph_get_port_parent(endpoint); fwnode_handle_put(endpoint); return parent; } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port_parent); /** * fwnode_graph_get_remote_port - Return fwnode of a remote port * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote port the @fwnode points to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_graph_get_remote_port(const struct fwnode_handle *fwnode) { return fwnode_get_next_parent(fwnode_graph_get_remote_endpoint(fwnode)); } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port); /** * fwnode_graph_get_remote_endpoint - Return fwnode of a remote endpoint * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote endpoint the @fwnode points to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_graph_get_remote_endpoint(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, graph_get_remote_endpoint); } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_endpoint); static bool fwnode_graph_remote_available(struct fwnode_handle *ep) { struct fwnode_handle *dev_node; bool available; dev_node = fwnode_graph_get_remote_port_parent(ep); available = fwnode_device_is_available(dev_node); fwnode_handle_put(dev_node); return available; } /** * fwnode_graph_get_endpoint_by_id - get endpoint by port and endpoint numbers * @fwnode: parent fwnode_handle containing the graph * @port: identifier of the port node * @endpoint: identifier of the endpoint node under the port node * @flags: fwnode lookup flags * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the fwnode handle of the local endpoint corresponding the port and * endpoint IDs or %NULL if not found. * * If FWNODE_GRAPH_ENDPOINT_NEXT is passed in @flags and the specified endpoint * has not been found, look for the closest endpoint ID greater than the * specified one and return the endpoint that corresponds to it, if present. * * Does not return endpoints that belong to disabled devices or endpoints that * are unconnected, unless FWNODE_GRAPH_DEVICE_DISABLED is passed in @flags. */ struct fwnode_handle * fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode, u32 port, u32 endpoint, unsigned long flags) { struct fwnode_handle *ep, *best_ep = NULL; unsigned int best_ep_id = 0; bool endpoint_next = flags & FWNODE_GRAPH_ENDPOINT_NEXT; bool enabled_only = !(flags & FWNODE_GRAPH_DEVICE_DISABLED); fwnode_graph_for_each_endpoint(fwnode, ep) { struct fwnode_endpoint fwnode_ep = { 0 }; int ret; if (enabled_only && !fwnode_graph_remote_available(ep)) continue; ret = fwnode_graph_parse_endpoint(ep, &fwnode_ep); if (ret < 0) continue; if (fwnode_ep.port != port) continue; if (fwnode_ep.id == endpoint) return ep; if (!endpoint_next) continue; /* * If the endpoint that has just been found is not the first * matching one and the ID of the one found previously is closer * to the requested endpoint ID, skip it. */ if (fwnode_ep.id < endpoint || (best_ep && best_ep_id < fwnode_ep.id)) continue; fwnode_handle_put(best_ep); best_ep = fwnode_handle_get(ep); best_ep_id = fwnode_ep.id; } return best_ep; } EXPORT_SYMBOL_GPL(fwnode_graph_get_endpoint_by_id); /** * fwnode_graph_get_endpoint_count - Count endpoints on a device node * @fwnode: The node related to a device * @flags: fwnode lookup flags * Count endpoints in a device node. * * If FWNODE_GRAPH_DEVICE_DISABLED flag is specified, also unconnected endpoints * and endpoints connected to disabled devices are counted. */ unsigned int fwnode_graph_get_endpoint_count(const struct fwnode_handle *fwnode, unsigned long flags) { struct fwnode_handle *ep; unsigned int count = 0; fwnode_graph_for_each_endpoint(fwnode, ep) { if (flags & FWNODE_GRAPH_DEVICE_DISABLED || fwnode_graph_remote_available(ep)) count++; } return count; } EXPORT_SYMBOL_GPL(fwnode_graph_get_endpoint_count); /** * fwnode_graph_parse_endpoint - parse common endpoint node properties * @fwnode: pointer to endpoint fwnode_handle * @endpoint: pointer to the fwnode endpoint data structure * * Parse @fwnode representing a graph endpoint node and store the * information in @endpoint. The caller must hold a reference to * @fwnode. */ int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { memset(endpoint, 0, sizeof(*endpoint)); return fwnode_call_int_op(fwnode, graph_parse_endpoint, endpoint); } EXPORT_SYMBOL(fwnode_graph_parse_endpoint); const void *device_get_match_data(const struct device *dev) { return fwnode_call_ptr_op(dev_fwnode(dev), device_get_match_data, dev); } EXPORT_SYMBOL_GPL(device_get_match_data); static unsigned int fwnode_graph_devcon_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len) { struct fwnode_handle *node; struct fwnode_handle *ep; unsigned int count = 0; void *ret; fwnode_graph_for_each_endpoint(fwnode, ep) { if (matches && count >= matches_len) { fwnode_handle_put(ep); break; } node = fwnode_graph_get_remote_port_parent(ep); if (!fwnode_device_is_available(node)) { fwnode_handle_put(node); continue; } ret = match(node, con_id, data); fwnode_handle_put(node); if (ret) { if (matches) matches[count] = ret; count++; } } return count; } static unsigned int fwnode_devcon_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len) { struct fwnode_handle *node; unsigned int count = 0; unsigned int i; void *ret; for (i = 0; ; i++) { if (matches && count >= matches_len) break; node = fwnode_find_reference(fwnode, con_id, i); if (IS_ERR(node)) break; ret = match(node, NULL, data); fwnode_handle_put(node); if (ret) { if (matches) matches[count] = ret; count++; } } return count; } /** * fwnode_connection_find_match - Find connection from a device node * @fwnode: Device node with the connection * @con_id: Identifier for the connection * @data: Data for the match function * @match: Function to check and convert the connection description * * Find a connection with unique identifier @con_id between @fwnode and another * device node. @match will be used to convert the connection description to * data the caller is expecting to be returned. */ void *fwnode_connection_find_match(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match) { unsigned int count; void *ret; if (!fwnode || !match) return NULL; count = fwnode_graph_devcon_matches(fwnode, con_id, data, match, &ret, 1); if (count) return ret; count = fwnode_devcon_matches(fwnode, con_id, data, match, &ret, 1); return count ? ret : NULL; } EXPORT_SYMBOL_GPL(fwnode_connection_find_match); /** * fwnode_connection_find_matches - Find connections from a device node * @fwnode: Device node with the connection * @con_id: Identifier for the connection * @data: Data for the match function * @match: Function to check and convert the connection description * @matches: (Optional) array of pointers to fill with matches * @matches_len: Length of @matches * * Find up to @matches_len connections with unique identifier @con_id between * @fwnode and other device nodes. @match will be used to convert the * connection description to data the caller is expecting to be returned * through the @matches array. * * If @matches is %NULL @matches_len is ignored and the total number of resolved * matches is returned. * * Return: Number of matches resolved, or negative errno. */ int fwnode_connection_find_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len) { unsigned int count_graph; unsigned int count_ref; if (!fwnode || !match) return -EINVAL; count_graph = fwnode_graph_devcon_matches(fwnode, con_id, data, match, matches, matches_len); if (matches) { matches += count_graph; matches_len -= count_graph; } count_ref = fwnode_devcon_matches(fwnode, con_id, data, match, matches, matches_len); return count_graph + count_ref; } EXPORT_SYMBOL_GPL(fwnode_connection_find_matches);
214 213 214 231 230 173 214 137 112 49 49 49 39 39 1 8 8 3 8 9 9 9 8 8 39 3 36 39 39 39 39 39 39 39 39 39 67 22 51 5 41 46 46 5 41 39 39 39 39 39 39 39 1 19 74 18 74 74 12 12 9 9 23 23 23 23 23 49 49 49 8 17 49 1 49 8 46 9 8 49 2 49 49 49 20 19 19 19 20 20 20 20 20 19 10 10 19 19 20 20 20 20 20 20 20 10 19 19 20 20 20 1 1 1 1 1 1 1 1 1 1 20 20 1 1 20 20 20 20 20 20 20 20 20 20 20 9 20 9 20 20 3 20 4 19 3 20 172 172 172 8 44 53 9 109 23 23 9 23 20 175 175 176 60 60 15 58 57 23 3 23 9 23 23 23 20 20 8 20 20 52 10 11 152 9 13 13 13 376 376 12 12 12 3 3 14 14 1 12 12 13 5 9 9 9 1 7 1 3 3 1 13 13 111 112 129 129 129 27 112 27 29 109 128 112 27 27 27 129 27 112 112 112 5 64 64 161 64 64 64 64 64 64 134 134 31 31 7 11 11 11 11 134 52 187 25 24 24 24 29 29 29 20 11 11 11 11 11 31 31 31 31 31 31 31 31 121 119 31 31 31 16 13 11 27 27 4 27 21 31 31 31 5 31 5 31 31 105 105 105 65 42 112 112 1 112 112 112 112 52 52 134 134 134 134 133 32 32 50 50 1 50 47 47 1 1 377 375 376 2 2 2 2 375 49 373 46 52 52 46 82 378 1 40 41 6 6 5 5 5 3 11 11 2 3 4 2 169 169 170 170 170 170 169 169 103 105 170 103 105 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/sched/signal.h> #include <linux/atomic.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> #include <net/protocol.h> #include <net/tcp_states.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/transp_v6.h> #endif #include <net/mptcp.h> #include <net/hotdata.h> #include <net/xfrm.h> #include <asm/ioctls.h> #include "protocol.h" #include "mib.h" #define CREATE_TRACE_POINTS #include <trace/events/mptcp.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct mptcp6_sock { struct mptcp_sock msk; struct ipv6_pinfo np; }; #endif enum { MPTCP_CMSG_TS = BIT(0), MPTCP_CMSG_INQ = BIT(1), }; static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp; static void __mptcp_destroy_sock(struct sock *sk); static void mptcp_check_send_data_fin(struct sock *sk); DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); static struct net_device mptcp_napi_dev; /* Returns end sequence number of the receiver's advertised window */ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) { return READ_ONCE(msk->wnd_end); } static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (sk->sk_prot == &tcpv6_prot) return &inet6_stream_ops; #endif WARN_ON_ONCE(sk->sk_prot != &tcp_prot); return &inet_stream_ops; } static int __mptcp_socket_create(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; struct socket *ssock; int err; err = mptcp_subflow_create_socket(sk, sk->sk_family, &ssock); if (err) return err; msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio; WRITE_ONCE(msk->first, ssock->sk); subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); subflow->request_mptcp = 1; subflow->subflow_id = msk->subflow_id++; /* This is the first subflow, always with id 0 */ WRITE_ONCE(subflow->local_id, 0); mptcp_sock_graft(msk->first, sk->sk_socket); iput(SOCK_INODE(ssock)); return 0; } /* If the MPC handshake is not started, returns the first subflow, * eventually allocating it. */ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; int ret; if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) return ERR_PTR(-EINVAL); if (!msk->first) { ret = __mptcp_socket_create(msk); if (ret) return ERR_PTR(ret); } return msk->first; } static void mptcp_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); __kfree_skb(skb); } static void mptcp_rmem_fwd_alloc_add(struct sock *sk, int size) { WRITE_ONCE(mptcp_sk(sk)->rmem_fwd_alloc, mptcp_sk(sk)->rmem_fwd_alloc + size); } static void mptcp_rmem_charge(struct sock *sk, int size) { mptcp_rmem_fwd_alloc_add(sk, -size); } static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from) { bool fragstolen; int delta; if (MPTCP_SKB_CB(from)->offset || !skb_try_coalesce(to, from, &fragstolen, &delta)) return false; pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n", MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, to->len, MPTCP_SKB_CB(from)->end_seq); MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; /* note the fwd memory can reach a negative value after accounting * for the delta, but the later skb free will restore a non * negative one */ atomic_add(delta, &sk->sk_rmem_alloc); mptcp_rmem_charge(sk, delta); kfree_skb_partial(from, fragstolen); return true; } static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, struct sk_buff *from) { if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq) return false; return mptcp_try_coalesce((struct sock *)msk, to, from); } static void __mptcp_rmem_reclaim(struct sock *sk, int amount) { amount >>= PAGE_SHIFT; mptcp_rmem_charge(sk, amount << PAGE_SHIFT); __sk_mem_reduce_allocated(sk, amount); } static void mptcp_rmem_uncharge(struct sock *sk, int size) { struct mptcp_sock *msk = mptcp_sk(sk); int reclaimable; mptcp_rmem_fwd_alloc_add(sk, size); reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); /* see sk_mem_uncharge() for the rationale behind the following schema */ if (unlikely(reclaimable >= PAGE_SIZE)) __mptcp_rmem_reclaim(sk, reclaimable); } static void mptcp_rfree(struct sk_buff *skb) { unsigned int len = skb->truesize; struct sock *sk = skb->sk; atomic_sub(len, &sk->sk_rmem_alloc); mptcp_rmem_uncharge(sk, len); } void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; skb->destructor = mptcp_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); mptcp_rmem_charge(sk, skb->truesize); } /* "inspired" by tcp_data_queue_ofo(), main differences: * - use mptcp seqs * - don't cope with sacks */ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) { struct sock *sk = (struct sock *)msk; struct rb_node **p, *parent; u64 seq, end_seq, max_seq; struct sk_buff *skb1; seq = MPTCP_SKB_CB(skb)->map_seq; end_seq = MPTCP_SKB_CB(skb)->end_seq; max_seq = atomic64_read(&msk->rcv_wnd_sent); pr_debug("msk=%p seq=%llx limit=%llx empty=%d\n", msk, seq, max_seq, RB_EMPTY_ROOT(&msk->out_of_order_queue)); if (after64(end_seq, max_seq)) { /* out of window */ mptcp_drop(sk, skb); pr_debug("oow by %lld, rcv_wnd_sent %llu\n", (unsigned long long)end_seq - (unsigned long)max_seq, (unsigned long long)atomic64_read(&msk->rcv_wnd_sent)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } p = &msk->out_of_order_queue.rb_node; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE); if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) { rb_link_node(&skb->rbnode, NULL, p); rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); msk->ooo_last_skb = skb; goto end; } /* with 2 subflows, adding at end of ooo queue is quite likely * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); return; } /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); parent = &msk->ooo_last_skb->rbnode; p = &parent->rb_right; goto insert; } /* Find place to insert this segment. Handle overlaps on the way. */ parent = NULL; while (*p) { parent = *p; skb1 = rb_to_skb(parent); if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { p = &parent->rb_left; continue; } if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) { if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ mptcp_drop(sk, skb); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); return; } if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { /* partial overlap: * | skb | * | skb1 | * continue traversing */ } else { /* skb's seq == skb1's seq and skb covers skb1. * Replace skb1 with skb. */ rb_replace_node(&skb1->rbnode, &skb->rbnode, &msk->out_of_order_queue); mptcp_drop(sk, skb1); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); goto merge_right; } } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); return; } p = &parent->rb_right; } insert: /* Insert segment into RB tree. */ rb_link_node(&skb->rbnode, parent, p); rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); merge_right: /* Remove other segments covered by skb. */ while ((skb1 = skb_rb_next(skb)) != NULL) { if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) break; rb_erase(&skb1->rbnode, &msk->out_of_order_queue); mptcp_drop(sk, skb1); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); } /* If there is no skb after us, we are the last_skb ! */ if (!skb1) msk->ooo_last_skb = skb; end: skb_condense(skb); mptcp_set_owner_r(skb, sk); } static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) { struct mptcp_sock *msk = mptcp_sk(sk); int amt, amount; if (size <= msk->rmem_fwd_alloc) return true; size -= msk->rmem_fwd_alloc; amt = sk_mem_pages(size); amount = amt << PAGE_SHIFT; if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) return false; mptcp_rmem_fwd_alloc_add(sk, amount); return true; } static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, struct sk_buff *skb, unsigned int offset, size_t copy_len) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; struct sk_buff *tail; bool has_rxtstamp; __skb_unlink(skb, &ssk->sk_receive_queue); skb_ext_reset(skb); skb_orphan(skb); /* try to fetch required memory from subflow */ if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); goto drop; } has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq * value */ MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; MPTCP_SKB_CB(skb)->offset = offset; MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ msk->bytes_received += copy_len; WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail && mptcp_try_coalesce(sk, tail, skb)) return true; mptcp_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); return true; } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { mptcp_data_queue_ofo(msk, skb); return false; } /* old data, keep it simple and drop the whole pkt, sender * will retransmit as needed, if needed. */ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); drop: mptcp_drop(sk, skb); return false; } static void mptcp_stop_rtx_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); sk_stop_timer(sk, &icsk->icsk_retransmit_timer); mptcp_sk(sk)->timer_ival = 0; } static void mptcp_close_wake_up(struct sock *sk) { if (sock_flag(sk, SOCK_DEAD)) return; sk->sk_state_change(sk); if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); else sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } /* called under the msk socket lock */ static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); return ((1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && msk->write_seq == READ_ONCE(msk->snd_una); } static void mptcp_check_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); /* Look for an acknowledged DATA_FIN */ if (mptcp_pending_data_fin_ack(sk)) { WRITE_ONCE(msk->snd_data_fin_enable, 0); switch (sk->sk_state) { case TCP_FIN_WAIT1: mptcp_set_state(sk, TCP_FIN_WAIT2); break; case TCP_CLOSING: case TCP_LAST_ACK: mptcp_set_state(sk, TCP_CLOSE); break; } mptcp_close_wake_up(sk); } } /* can be called with no lock acquired */ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) { struct mptcp_sock *msk = mptcp_sk(sk); if (READ_ONCE(msk->rcv_data_fin) && ((1 << inet_sk_state_load(sk)) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); if (READ_ONCE(msk->ack_seq) == rcv_data_fin_seq) { if (seq) *seq = rcv_data_fin_seq; return true; } } return false; } static void mptcp_set_datafin_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 retransmits; retransmits = min_t(u32, icsk->icsk_retransmits, ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; } static void __mptcp_set_timeout(struct sock *sk, long tout) { mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow) { const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? inet_csk(ssk)->icsk_timeout - jiffies : 0; } static void mptcp_set_timeout(struct sock *sk) { struct mptcp_subflow_context *subflow; long tout = 0; mptcp_for_each_subflow(mptcp_sk(sk), subflow) tout = max(tout, mptcp_timeout_from_subflow(subflow)); __mptcp_set_timeout(sk, tout); } static inline bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); } void __mptcp_subflow_send_ack(struct sock *ssk) { if (tcp_can_send_ack(ssk)) tcp_send_ack(ssk); } static void mptcp_subflow_send_ack(struct sock *ssk) { bool slow; slow = lock_sock_fast(ssk); __mptcp_subflow_send_ack(ssk); unlock_sock_fast(ssk, slow); } static void mptcp_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; mptcp_for_each_subflow(msk, subflow) mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); } static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) { bool slow; slow = lock_sock_fast(ssk); if (tcp_can_send_ack(ssk)) tcp_cleanup_rbuf(ssk, 1); unlock_sock_fast(ssk, slow); } static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) { const struct inet_connection_sock *icsk = inet_csk(ssk); u8 ack_pending = READ_ONCE(icsk->icsk_ack.pending); const struct tcp_sock *tp = tcp_sk(ssk); return (ack_pending & ICSK_ACK_SCHED) && ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) > READ_ONCE(icsk->icsk_ack.rcv_mss)) || (rx_empty && ack_pending & (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) { int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; int space = __mptcp_space(sk); bool cleanup, rx_empty; cleanup = (space > 0) && (space >= (old_space << 1)); rx_empty = !__mptcp_rmem(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) mptcp_subflow_cleanup_rbuf(ssk); } } static bool mptcp_check_data_fin(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); u64 rcv_data_fin_seq; bool ret = false; /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. * msk->rcv_data_fin was set when parsing the incoming options * at the subflow level and the msk lock was not held, so this * is the first opportunity to act on the DATA_FIN and change * the msk state. * * If we are caught up to the sequence number of the incoming * DATA_FIN, send the DATA_ACK now and do state transition. If * not caught up, do nothing and let the recv code send DATA_ACK * when catching up. */ if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); WRITE_ONCE(msk->rcv_data_fin, 0); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ switch (sk->sk_state) { case TCP_ESTABLISHED: mptcp_set_state(sk, TCP_CLOSE_WAIT); break; case TCP_FIN_WAIT1: mptcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: mptcp_set_state(sk, TCP_CLOSE); break; default: /* Other states not expected */ WARN_ON_ONCE(1); break; } ret = true; if (!__mptcp_check_fallback(msk)) mptcp_send_ack(msk); mptcp_close_wake_up(sk); } return ret; } static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) { if (READ_ONCE(msk->allow_infinite_fallback)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONFALLBACK); mptcp_do_fallback(ssk); } else { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET); mptcp_subflow_reset(ssk); } } static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sock *ssk, unsigned int *bytes) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; unsigned int moved = 0; bool more_data_avail; struct tcp_sock *tp; bool done = false; int sk_rbuf; sk_rbuf = READ_ONCE(sk->sk_rcvbuf); if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); if (unlikely(ssk_rbuf > sk_rbuf)) { WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); sk_rbuf = ssk_rbuf; } } pr_debug("msk=%p ssk=%p\n", msk, ssk); tp = tcp_sk(ssk); do { u32 map_remaining, offset; u32 seq = tp->copied_seq; struct sk_buff *skb; bool fin; /* try to move as much data as available */ map_remaining = subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); skb = skb_peek(&ssk->sk_receive_queue); if (!skb) { /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), * a different CPU can have already processed the pending * data, stop here or we can enter an infinite loop */ if (!moved) done = true; break; } if (__mptcp_check_fallback(msk)) { /* Under fallback skbs have no MPTCP extension and TCP could * collapse them between the dummy map creation and the * current dequeue. Be sure to adjust the map size. */ map_remaining = skb->len; subflow->map_data_len = skb->len; } offset = seq - TCP_SKB_CB(skb)->seq; fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (fin) { done = true; seq++; } if (offset < skb->len) { size_t len = skb->len - offset; if (tp->urg_data) done = true; if (__mptcp_move_skb(msk, ssk, skb, offset, len)) moved += len; seq += len; if (unlikely(map_remaining < len)) { DEBUG_NET_WARN_ON_ONCE(1); mptcp_dss_corruption(msk, ssk); } } else { if (unlikely(!fin)) { DEBUG_NET_WARN_ON_ONCE(1); mptcp_dss_corruption(msk, ssk); } sk_eat_skb(ssk, skb); done = true; } WRITE_ONCE(tp->copied_seq, seq); more_data_avail = mptcp_subflow_data_available(ssk); if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { done = true; break; } } while (more_data_avail); if (moved > 0) msk->last_data_recv = tcp_jiffies32; *bytes += moved; return done; } static bool __mptcp_ofo_queue(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; struct sk_buff *skb, *tail; bool moved = false; struct rb_node *p; u64 end_seq; p = rb_first(&msk->out_of_order_queue); pr_debug("msk=%p empty=%d\n", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue)); while (p) { skb = rb_to_skb(p); if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) break; p = rb_next(p); rb_erase(&skb->rbnode, &msk->out_of_order_queue); if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq, msk->ack_seq))) { mptcp_drop(sk, skb); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); continue; } end_seq = MPTCP_SKB_CB(skb)->end_seq; tail = skb_peek_tail(&sk->sk_receive_queue); if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) { int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; /* skip overlapping data, if any */ pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d\n", MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq, delta); MPTCP_SKB_CB(skb)->offset += delta; MPTCP_SKB_CB(skb)->map_seq += delta; __skb_queue_tail(&sk->sk_receive_queue, skb); } msk->bytes_received += end_seq - msk->ack_seq; WRITE_ONCE(msk->ack_seq, end_seq); moved = true; } return moved; } static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) { int err = sock_error(ssk); int ssk_state; if (!err) return false; /* only propagate errors on fallen-back sockets or * on MPC connect */ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) return false; /* We need to propagate only transition to CLOSE state. * Orphaned socket will see such state change via * subflow_sched_work_if_closed() and that path will properly * destroy the msk as needed. */ ssk_state = inet_sk_state_load(ssk); if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) mptcp_set_state(sk, ssk_state); WRITE_ONCE(sk->sk_err, -err); /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); sk_error_report(sk); return true; } void __mptcp_error_report(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); mptcp_for_each_subflow(msk, subflow) if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow))) break; } /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. */ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; __mptcp_move_skbs_from_subflow(msk, ssk, &moved); __mptcp_ofo_queue(msk); if (unlikely(ssk->sk_err)) { if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); else __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags); } /* If the moves have caught up with the DATA_FIN sequence number * it's time to ack the DATA_FIN and change socket state, but * this is not a good place to change state. Let the workqueue * do it. */ if (mptcp_pending_data_fin(sk, NULL)) mptcp_schedule_work(sk); return moved > 0; } void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); int sk_rbuf, ssk_rbuf; /* The peer can send data while we are shutting down this * subflow at msk destruction time, but we must avoid enqueuing * more data to the msk receive queue */ if (unlikely(subflow->disposable)) return; ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); sk_rbuf = READ_ONCE(sk->sk_rcvbuf); if (unlikely(ssk_rbuf > sk_rbuf)) sk_rbuf = ssk_rbuf; /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ if (__mptcp_rmem(sk) > sk_rbuf) return; /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) sk->sk_data_ready(sk); mptcp_data_unlock(sk); } static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk) { mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq); WRITE_ONCE(msk->allow_infinite_fallback, false); mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); } static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; if (sk->sk_state != TCP_ESTABLISHED) return false; /* attach to msk socket only after we are sure we will deal with it * at close time */ if (sk->sk_socket && !ssk->sk_socket) mptcp_sock_graft(ssk, sk->sk_socket); mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++; mptcp_sockopt_sync_locked(msk, ssk); mptcp_subflow_joined(msk, ssk); mptcp_stop_tout_timer(sk); __mptcp_propagate_sndbuf(sk, ssk); return true; } static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list) { struct mptcp_subflow_context *tmp, *subflow; struct mptcp_sock *msk = mptcp_sk(sk); list_for_each_entry_safe(subflow, tmp, join_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); list_move_tail(&subflow->node, &msk->conn_list); if (!__mptcp_finish_join(msk, ssk)) mptcp_subflow_reset(ssk); unlock_sock_fast(ssk, slow); } } static bool mptcp_rtx_timer_pending(struct sock *sk) { return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); } static void mptcp_reset_rtx_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned long tout; /* prevent rescheduling on close */ if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) return; tout = mptcp_sk(sk)->timer_ival; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); } bool mptcp_schedule_work(struct sock *sk) { if (inet_sk_state_load(sk) != TCP_CLOSE && schedule_work(&mptcp_sk(sk)->work)) { /* each subflow already holds a reference to the sk, and the * workqueue is invoked by a subflow, so sk can't go away here. */ sock_hold(sk); return true; } return false; } static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; msk_owned_by_me(msk); mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->data_avail)) return mptcp_subflow_tcp_sock(subflow); } return NULL; } static bool mptcp_skb_can_collapse_to(u64 write_seq, const struct sk_buff *skb, const struct mptcp_ext *mpext) { if (!tcp_skb_can_collapse_to(skb)) return false; /* can collapse only if MPTCP level sequence is in order and this * mapping has not been xmitted yet */ return mpext && mpext->data_seq + mpext->data_len == write_seq && !mpext->frozen; } /* we can append data to the given data frag if: * - there is space available in the backing page_frag * - the data frag tail matches the current page_frag free offset * - the data frag end sequence number matches the current write seq */ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, const struct page_frag *pfrag, const struct mptcp_data_frag *df) { return df && pfrag->page == df->page && pfrag->size - pfrag->offset > 0 && pfrag->offset == (df->offset + df->data_len) && df->data_seq + df->data_len == msk->write_seq; } static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); sk_wmem_queued_add(sk, -len); } static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) { int len = dfrag->data_len + dfrag->overhead; list_del(&dfrag->list); dfrag_uncharge(sk, len); put_page(dfrag->page); } /* called under both the msk socket lock and the data lock */ static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; u64 snd_una; snd_una = msk->snd_una; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) break; if (unlikely(dfrag == msk->first_pending)) { /* in recovery mode can see ack after the current snd head */ if (WARN_ON_ONCE(!msk->recovery)) break; WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } dfrag_clear(sk, dfrag); } dfrag = mptcp_rtx_head(sk); if (dfrag && after64(snd_una, dfrag->data_seq)) { u64 delta = snd_una - dfrag->data_seq; /* prevent wrap around in recovery mode */ if (unlikely(delta > dfrag->already_sent)) { if (WARN_ON_ONCE(!msk->recovery)) goto out; if (WARN_ON_ONCE(delta > dfrag->data_len)) goto out; dfrag->already_sent += delta - dfrag->already_sent; } dfrag->data_seq += delta; dfrag->offset += delta; dfrag->data_len -= delta; dfrag->already_sent -= delta; dfrag_uncharge(sk, delta); } /* all retransmitted data acked, recovery completed */ if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt)) msk->recovery = false; out: if (snd_una == msk->snd_nxt && snd_una == msk->write_seq) { if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) mptcp_stop_rtx_timer(sk); } else { mptcp_reset_rtx_timer(sk); } if (mptcp_pending_data_fin_ack(sk)) mptcp_schedule_work(sk); } static void __mptcp_clean_una_wakeup(struct sock *sk) { lockdep_assert_held_once(&sk->sk_lock.slock); __mptcp_clean_una(sk); mptcp_write_space(sk); } static void mptcp_clean_una_wakeup(struct sock *sk) { mptcp_data_lock(sk); __mptcp_clean_una_wakeup(sk); mptcp_data_unlock(sk); } static void mptcp_enter_memory_pressure(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool first = true; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (first) tcp_enter_memory_pressure(ssk); sk_stream_moderate_sndbuf(ssk); first = false; } __mptcp_sync_sndbuf(sk); } /* ensure we get enough memory for the frag hdr, beyond some minimal amount of * data */ static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), pfrag, sk->sk_allocation))) return true; mptcp_enter_memory_pressure(sk); return false; } static struct mptcp_data_frag * mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, int orig_offset) { int offset = ALIGN(orig_offset, sizeof(long)); struct mptcp_data_frag *dfrag; dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset); dfrag->data_len = 0; dfrag->data_seq = msk->write_seq; dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); dfrag->offset = offset + sizeof(struct mptcp_data_frag); dfrag->already_sent = 0; dfrag->page = pfrag->page; return dfrag; } struct mptcp_sendmsg_info { int mss_now; int size_goal; u16 limit; u16 sent; unsigned int flags; bool data_lock_held; }; static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, u64 data_seq, int avail_size) { u64 window_end = mptcp_wnd_end(msk); u64 mptcp_snd_wnd; if (__mptcp_check_fallback(msk)) return avail_size; mptcp_snd_wnd = window_end - data_seq; avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED); } return avail_size; } static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp) { struct skb_ext *mpext = __skb_ext_alloc(gfp); if (!mpext) return false; __skb_ext_set(skb, SKB_EXT_MPTCP, mpext); return true; } static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) { struct sk_buff *skb; skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); if (likely(skb)) { if (likely(__mptcp_add_ext(skb, gfp))) { skb_reserve(skb, MAX_TCP_HEADER); skb->ip_summed = CHECKSUM_PARTIAL; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); } else { mptcp_enter_memory_pressure(sk); } return NULL; } static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) { struct sk_buff *skb; skb = __mptcp_do_alloc_tx_skb(sk, gfp); if (!skb) return NULL; if (likely(sk_wmem_schedule(ssk, skb->truesize))) { tcp_skb_entail(ssk, skb); return skb; } tcp_skb_tsorted_anchor_cleanup(skb); kfree_skb(skb); return NULL; } static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) { gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; return __mptcp_alloc_tx_skb(sk, ssk, gfp); } /* note: this always recompute the csum on the whole skb, even * if we just appended a single frag. More status info needed */ static void mptcp_update_data_checksum(struct sk_buff *skb, int added) { struct mptcp_ext *mpext = mptcp_get_ext(skb); __wsum csum = ~csum_unfold(mpext->csum); int offset = skb->len - added; mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); } static void mptcp_update_infinite_map(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_ext *mpext) { if (!mpext) return; mpext->infinite_map = 1; mpext->data_len = 0; MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); mptcp_subflow_ctx(ssk)->send_infinite_map = 0; pr_fallback(msk); mptcp_do_fallback(ssk); } #define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1)) static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) { u64 data_seq = dfrag->data_seq + info->sent; int offset = dfrag->offset + info->sent; struct mptcp_sock *msk = mptcp_sk(sk); bool zero_window_probe = false; struct mptcp_ext *mpext = NULL; bool can_coalesce = false; bool reuse_skb = true; struct sk_buff *skb; size_t copy; int i; pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u\n", msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); if (WARN_ON_ONCE(info->sent > info->limit || info->limit > dfrag->data_len)) return 0; if (unlikely(!__tcp_can_send(ssk))) return -EAGAIN; /* compute send limit */ if (unlikely(ssk->sk_gso_max_size > MPTCP_MAX_GSO_SIZE)) ssk->sk_gso_max_size = MPTCP_MAX_GSO_SIZE; info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); copy = info->size_goal; skb = tcp_write_queue_tail(ssk); if (skb && copy > skb->len) { /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. * Explicitly tells TCP internals to avoid collapsing on later * queue management operation, to avoid breaking the ext <-> * SSN association set here */ mpext = mptcp_get_ext(skb); if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) { TCP_SKB_CB(skb)->eor = 1; tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } copy -= skb->len; } else { alloc_skb: skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held); if (!skb) return -ENOMEM; i = skb_shinfo(skb)->nr_frags; reuse_skb = false; mpext = mptcp_get_ext(skb); } /* Zero window and all data acked? Probe. */ copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy); if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) { tcp_remove_empty_skb(ssk); return 0; } zero_window_probe = true; data_seq = snd_una - 1; copy = 1; } copy = min_t(size_t, copy, info->limit - info->sent); if (!sk_wmem_schedule(ssk, copy)) { tcp_remove_empty_skb(ssk); return -ENOMEM; } if (can_coalesce) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { get_page(dfrag->page); skb_fill_page_desc(skb, i, dfrag->page, offset, copy); } skb->len += copy; skb->data_len += copy; skb->truesize += copy; sk_wmem_queued_add(ssk, copy); sk_mem_charge(ssk, copy); WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); /* on skb reuse we just need to update the DSS len */ if (reuse_skb) { TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; mpext->data_len += copy; goto out; } memset(mpext, 0, sizeof(*mpext)); mpext->data_seq = data_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->data_len = copy; mpext->use_map = 1; mpext->dsn64 = 1; pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d\n", mpext->data_seq, mpext->subflow_seq, mpext->data_len, mpext->dsn64); if (zero_window_probe) { mptcp_subflow_ctx(ssk)->rel_write_seq += copy; mpext->frozen = 1; if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); tcp_push_pending_frames(ssk); return 0; } out: if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); if (mptcp_subflow_ctx(ssk)->send_infinite_map) mptcp_update_infinite_map(msk, ssk, mpext); trace_mptcp_sendmsg_frag(mpext); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; return copy; } #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ sizeof(struct tcphdr) - \ MAX_TCP_OPTION_SPACE - \ sizeof(struct ipv6hdr) - \ sizeof(struct frag_hdr)) struct subflow_send_info { struct sock *ssk; u64 linger_time; }; void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) { if (!subflow->stale) return; subflow->stale = 0; MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER); } bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) { if (unlikely(subflow->stale)) { u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp); if (subflow->stale_rcv_tstamp == rcv_tstamp) return false; mptcp_subflow_set_active(subflow); } return __mptcp_subflow_active(subflow); } #define SSK_MODE_ACTIVE 0 #define SSK_MODE_BACKUP 1 #define SSK_MODE_MAX 2 /* implement the mptcp packet scheduler; * returns the subflow that will transmit the next DSS * additionally updates the rtx timeout */ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[SSK_MODE_MAX]; struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; u32 pace, burst, wmem; int i, nr_active = 0; struct sock *ssk; u64 linger_time; long tout = 0; /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; send_info[i].linger_time = -1; } mptcp_for_each_subflow(msk, subflow) { bool backup = subflow->backup || subflow->request_bkup; trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); if (!mptcp_subflow_active(subflow)) continue; tout = max(tout, mptcp_timeout_from_subflow(subflow)); nr_active += !backup; pace = subflow->avg_pacing_rate; if (unlikely(!pace)) { /* init pacing rate from socket */ subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate); pace = subflow->avg_pacing_rate; if (!pace) continue; } linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace); if (linger_time < send_info[backup].linger_time) { send_info[backup].ssk = ssk; send_info[backup].linger_time = linger_time; } } __mptcp_set_timeout(sk, tout); /* pick the best backup if no other subflow is active */ if (!nr_active) send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk; /* According to the blest algorithm, to avoid HoL blocking for the * faster flow, we need to: * - estimate the faster flow linger time * - use the above to estimate the amount of byte transferred * by the faster flow * - check that the amount of queued data is greter than the above, * otherwise do not use the picked, slower, subflow * We select the subflow with the shorter estimated time to flush * the queued mem, which basically ensure the above. We just need * to check that subflow has a non empty cwin. */ ssk = send_info[SSK_MODE_ACTIVE].ssk; if (!ssk || !sk_stream_memory_free(ssk)) return NULL; burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); if (!burst) return ssk; subflow = mptcp_subflow_ctx(ssk); subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + READ_ONCE(ssk->sk_pacing_rate) * burst, burst + wmem); msk->snd_burst = burst; return ssk; } static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info) { tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); release_sock(ssk); } static void mptcp_update_post_push(struct mptcp_sock *msk, struct mptcp_data_frag *dfrag, u32 sent) { u64 snd_nxt_new = dfrag->data_seq; dfrag->already_sent += sent; msk->snd_burst -= sent; snd_nxt_new += dfrag->already_sent; /* snd_nxt_new can be smaller than snd_nxt in case mptcp * is recovering after a failover. In that event, this re-sends * old segments. * * Thus compute snd_nxt_new candidate based on * the dfrag->data_seq that was sent and the data * that has been handed to the subflow for transmission * and skip update in case it was old dfrag. */ if (likely(after64(snd_nxt_new, msk->snd_nxt))) { msk->bytes_sent += snd_nxt_new - msk->snd_nxt; WRITE_ONCE(msk->snd_nxt, snd_nxt_new); } } void mptcp_check_and_set_pending(struct sock *sk) { if (mptcp_send_head(sk)) { mptcp_data_lock(sk); mptcp_sk(sk)->cb_flags |= BIT(MPTCP_PUSH_PENDING); mptcp_data_unlock(sk); } } static int __subflow_push_pending(struct sock *sk, struct sock *ssk, struct mptcp_sendmsg_info *info) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dfrag; int len, copied = 0, err = 0; while ((dfrag = mptcp_send_head(sk))) { info->sent = dfrag->already_sent; info->limit = dfrag->data_len; len = dfrag->data_len - dfrag->already_sent; while (len > 0) { int ret = 0; ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info); if (ret <= 0) { err = copied ? : ret; goto out; } info->sent += ret; copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); if (msk->snd_burst <= 0 || !sk_stream_memory_free(ssk) || !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) { err = copied; goto out; } mptcp_set_timeout(sk); } err = copied; out: if (err > 0) msk->last_data_sent = tcp_jiffies32; return err; } void __mptcp_push_pending(struct sock *sk, unsigned int flags) { struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info = { .flags = flags, }; bool do_check_data_fin = false; int push_count = 1; while (mptcp_send_head(sk) && (push_count > 0)) { struct mptcp_subflow_context *subflow; int ret = 0; if (mptcp_sched_get_send(msk)) break; push_count = 0; mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { mptcp_subflow_set_scheduled(subflow, false); prev_ssk = ssk; ssk = mptcp_subflow_tcp_sock(subflow); if (ssk != prev_ssk) { /* First check. If the ssk has changed since * the last round, release prev_ssk */ if (prev_ssk) mptcp_push_release(prev_ssk, &info); /* Need to lock the new subflow only if different * from the previous one, otherwise we are still * helding the relevant lock */ lock_sock(ssk); } push_count++; ret = __subflow_push_pending(sk, ssk, &info); if (ret <= 0) { if (ret != -EAGAIN || (1 << ssk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE)) push_count--; continue; } do_check_data_fin = true; } } } /* at this point we held the socket lock for the last subflow we used */ if (ssk) mptcp_push_release(ssk, &info); /* ensure the rtx timer is running */ if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); if (do_check_data_fin) mptcp_check_send_data_fin(sk); } static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info = { .data_lock_held = true, }; bool keep_pushing = true; struct sock *xmit_ssk; int copied = 0; info.flags = 0; while (mptcp_send_head(sk) && keep_pushing) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); int ret = 0; /* check for a different subflow usage only after * spooling the first chunk of data */ if (first) { mptcp_subflow_set_scheduled(subflow, false); ret = __subflow_push_pending(sk, ssk, &info); first = false; if (ret <= 0) break; copied += ret; continue; } if (mptcp_sched_get_send(msk)) goto out; if (READ_ONCE(subflow->scheduled)) { mptcp_subflow_set_scheduled(subflow, false); ret = __subflow_push_pending(sk, ssk, &info); if (ret <= 0) keep_pushing = false; copied += ret; } mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { xmit_ssk = mptcp_subflow_tcp_sock(subflow); if (xmit_ssk != ssk) { mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SEND); keep_pushing = false; } } } } out: /* __mptcp_alloc_tx_skb could have released some wmem and we are * not going to flush it via release_sock() */ if (copied) { tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); if (msk->snd_data_fin_enable && msk->snd_nxt + 1 == msk->write_seq) mptcp_schedule_work(sk); } } static int mptcp_disconnect(struct sock *sk, int flags); static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, size_t len, int *copied_syn) { unsigned int saved_flags = msg->msg_flags; struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; int ret; /* on flags based fastopen the mptcp is supposed to create the * first subflow right now. Otherwise we are in the defer_connect * path, and the first subflow must be already present. * Since the defer_connect flag is cleared after the first succsful * fastopen attempt, no need to check for additional subflow status. */ if (msg->msg_flags & MSG_FASTOPEN) { ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) return PTR_ERR(ssk); } if (!msk->first) return -EINVAL; ssk = msk->first; lock_sock(ssk); msg->msg_flags |= MSG_DONTWAIT; msk->fastopening = 1; ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); msk->fastopening = 0; msg->msg_flags = saved_flags; release_sock(ssk); /* do the blocking bits of inet_stream_connect outside the ssk socket lock */ if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) { ret = __inet_stream_connect(sk->sk_socket, msg->msg_name, msg->msg_namelen, msg->msg_flags, 1); /* Keep the same behaviour of plain TCP: zero the copied bytes in * case of any error, except timeout or signal */ if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) *copied_syn = 0; } else if (ret && ret != -EINPROGRESS) { /* The disconnect() op called by tcp_sendmsg_fastopen()/ * __inet_stream_connect() can fail, due to looking check, * see mptcp_disconnect(). * Attempt it again outside the problematic scope. */ if (!mptcp_disconnect(sk, 0)) sk->sk_socket->state = SS_UNCONNECTED; } inet_clear_bit(DEFER_CONNECT, sk); return ret; } static int do_copy_data_nocache(struct sock *sk, int copy, struct iov_iter *from, char *to) { if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { if (!copy_from_iter_full_nocache(to, copy, from)) return -EFAULT; } else if (!copy_from_iter_full(to, copy, from)) { return -EFAULT; } return 0; } /* open-code sk_stream_memory_free() plus sent limit computation to * avoid indirect calls in fast-path. * Called under the msk socket lock, so we can avoid a bunch of ONCE * annotations. */ static u32 mptcp_send_limit(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); u32 limit, not_sent; if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf)) return 0; limit = mptcp_notsent_lowat(sk); if (limit == UINT_MAX) return UINT_MAX; not_sent = msk->write_seq - msk->snd_nxt; if (not_sent >= limit) return 0; return limit - not_sent; } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; size_t copied = 0; int ret = 0; long timeo; /* silently ignore everything else */ msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN; lock_sock(sk); if (unlikely(inet_test_bit(DEFER_CONNECT, sk) || msg->msg_flags & MSG_FASTOPEN)) { int copied_syn = 0; ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn); copied += copied_syn; if (ret == -EINPROGRESS && copied_syn > 0) goto out; else if (ret) goto do_error; } timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { ret = sk_stream_wait_connect(sk, &timeo); if (ret) goto do_error; } ret = -EPIPE; if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))) goto do_error; pfrag = sk_page_frag(sk); while (msg_data_left(msg)) { int total_ts, frag_truesize = 0; struct mptcp_data_frag *dfrag; bool dfrag_collapsed; size_t psize, offset; u32 copy_limit; /* ensure fitting the notsent_lowat() constraint */ copy_limit = mptcp_send_limit(sk); if (!copy_limit) goto wait_for_memory; /* reuse tail pfrag, if possible, or carve a new one from the * page allocator */ dfrag = mptcp_pending_tail(sk); dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); if (!dfrag_collapsed) { if (!mptcp_page_frag_refill(sk, pfrag)) goto wait_for_memory; dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset); frag_truesize = dfrag->overhead; } /* we do not bound vs wspace, to allow a single packet. * memory accounting will prevent execessive memory usage * anyway */ offset = dfrag->offset + dfrag->data_len; psize = pfrag->size - offset; psize = min_t(size_t, psize, msg_data_left(msg)); psize = min_t(size_t, psize, copy_limit); total_ts = psize + frag_truesize; if (!sk_wmem_schedule(sk, total_ts)) goto wait_for_memory; ret = do_copy_data_nocache(sk, psize, &msg->msg_iter, page_address(dfrag->page) + offset); if (ret) goto do_error; /* data successfully copied into the write queue */ sk_forward_alloc_add(sk, -total_ts); copied += psize; dfrag->data_len += psize; frag_truesize += psize; pfrag->offset += frag_truesize; WRITE_ONCE(msk->write_seq, msk->write_seq + psize); /* charge data on mptcp pending queue to the msk socket * Note: we charge such data both to sk and ssk */ sk_wmem_queued_add(sk, frag_truesize); if (!dfrag_collapsed) { get_page(dfrag->page); list_add_tail(&dfrag->list, &msk->rtx_queue); if (!msk->first_pending) WRITE_ONCE(msk->first_pending, dfrag); } pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk, dfrag->data_seq, dfrag->data_len, dfrag->already_sent, !dfrag_collapsed); continue; wait_for_memory: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); __mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto do_error; } if (copied) __mptcp_push_pending(sk, msg->msg_flags); out: release_sock(sk); return copied; do_error: if (copied) goto out; copied = sk_stream_error(sk, msg->msg_flags, ret); goto out; } static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct sk_buff *skb, *tmp; int copied = 0; skb_queue_walk_safe(&msk->receive_queue, skb, tmp) { u32 offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count = min_t(size_t, len - copied, data_len); int err; if (!(flags & MSG_TRUNC)) { err = skb_copy_datagram_msg(skb, offset, msg, count); if (unlikely(err < 0)) { if (!copied) return err; break; } } if (MPTCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); *cmsg_flags |= MPTCP_CMSG_TS; } copied += count; if (count < data_len) { if (!(flags & MSG_PEEK)) { MPTCP_SKB_CB(skb)->offset += count; MPTCP_SKB_CB(skb)->map_seq += count; msk->bytes_consumed += count; } break; } if (!(flags & MSG_PEEK)) { /* we will bulk release the skb memory later */ skb->destructor = NULL; WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize); __skb_unlink(skb, &msk->receive_queue); __kfree_skb(skb); msk->bytes_consumed += count; } if (copied >= len) break; } return copied; } /* receive buffer autotuning. See tcp_rcv_space_adjust for more information. * * Only difference: Use highest rtt estimate of the subflows in use. */ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; u8 scaling_ratio = U8_MAX; u32 time, advmss = 1; u64 rtt_us, mstamp; msk_owned_by_me(msk); if (copied <= 0) return; if (!msk->rcvspace_init) mptcp_rcv_space_init(msk, msk->first); msk->rcvq_space.copied += copied; mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); rtt_us = msk->rcvq_space.rtt_us; if (rtt_us && time < (rtt_us >> 3)) return; rtt_us = 0; mptcp_for_each_subflow(msk, subflow) { const struct tcp_sock *tp; u64 sf_rtt_us; u32 sf_advmss; tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); sf_advmss = READ_ONCE(tp->advmss); rtt_us = max(sf_rtt_us, rtt_us); advmss = max(sf_advmss, advmss); scaling_ratio = min(tp->scaling_ratio, scaling_ratio); } msk->rcvq_space.rtt_us = rtt_us; msk->scaling_ratio = scaling_ratio; if (time < (rtt_us >> 3) || rtt_us == 0) return; if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { u64 rcvwin, grow; int rcvbuf; rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); do_div(grow, msk->rcvq_space.space); rcvwin += (grow << 1); rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; window_clamp = mptcp_win_from_space(sk, rcvbuf); WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make subflows follow along. If we do not do this, we * get drops at subflow level if skbs can't be moved to * the mptcp rx queue fast enough (announced rcv_win can * exceed ssk->sk_rcvbuf). */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk; bool slow; ssk = mptcp_subflow_tcp_sock(subflow); slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp); if (tcp_can_send_ack(ssk)) tcp_cleanup_rbuf(ssk, 1); unlock_sock_fast(ssk, slow); } } } msk->rcvq_space.space = msk->rcvq_space.copied; new_measure: msk->rcvq_space.copied = 0; msk->rcvq_space.time = mstamp; } static void __mptcp_update_rmem(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); if (!msk->rmem_released) return; atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); mptcp_rmem_uncharge(sk, msk->rmem_released); WRITE_ONCE(msk->rmem_released, 0); } static void __mptcp_splice_receive_queue(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); } static bool __mptcp_move_skbs(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; bool ret, done; do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); bool slowpath; /* we can have data pending in the subflows only if the msk * receive buffer was full at subflow_data_ready() time, * that is an unlikely slow path. */ if (likely(!ssk)) break; slowpath = lock_sock_fast(ssk); mptcp_data_lock(sk); __mptcp_update_rmem(sk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); mptcp_data_unlock(sk); if (unlikely(ssk->sk_err)) __mptcp_error_report(sk); unlock_sock_fast(ssk, slowpath); } while (!done); /* acquire the data lock only if some input data is pending */ ret = moved > 0; if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || !skb_queue_empty_lockless(&sk->sk_receive_queue)) { mptcp_data_lock(sk); __mptcp_update_rmem(sk); ret |= __mptcp_ofo_queue(msk); __mptcp_splice_receive_queue(sk); mptcp_data_unlock(sk); } if (ret) mptcp_check_data_fin((struct sock *)msk); return !skb_queue_empty(&msk->receive_queue); } static unsigned int mptcp_inq_hint(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); const struct sk_buff *skb; skb = skb_peek(&msk->receive_queue); if (skb) { u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq; if (hint_val >= INT_MAX) return INT_MAX; return (unsigned int)hint_val; } if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) return 1; return 0; } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); struct scm_timestamping_internal tss; int copied = 0, cmsg_flags = 0; int target; long timeo; /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */ if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); lock_sock(sk); if (unlikely(sk->sk_state == TCP_LISTEN)) { copied = -ENOTCONN; goto out_err; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); if (unlikely(msk->recvmsg_inq)) cmsg_flags = MPTCP_CMSG_INQ; while (copied < len) { int err, bytes_read; bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; goto out_err; } copied += bytes_read; /* be sure to advertise window change */ mptcp_cleanup_rbuf(msk); if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) continue; /* only the MPTCP socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() */ if (copied >= target) break; if (copied) { if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || signal_pending(current)) break; } else { if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) { /* race breaker: the shutdown could be after the * previous receive queue check */ if (__mptcp_move_skbs(msk)) continue; break; } if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; break; } if (!timeo) { copied = -EAGAIN; break; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); break; } } pr_debug("block timeout %ld\n", timeo); mptcp_rcv_space_adjust(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; goto out_err; } } mptcp_rcv_space_adjust(msk, copied); out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); if (cmsg_flags & MPTCP_CMSG_INQ) { unsigned int inq = mptcp_inq_hint(sk); put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); } } pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n", msk, skb_queue_empty_lockless(&sk->sk_receive_queue), skb_queue_empty(&msk->receive_queue), copied); release_sock(sk); return copied; } static void mptcp_retransmit_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; struct mptcp_sock *msk = mptcp_sk(sk); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* we need a process context to retransmit */ if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags)) mptcp_schedule_work(sk); } else { /* delegate our work to tcp_release_cb() */ __set_bit(MPTCP_RETRANSMIT, &msk->cb_flags); } bh_unlock_sock(sk); sock_put(sk); } static void mptcp_tout_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); mptcp_schedule_work(sk); sock_put(sk); } /* Find an idle subflow. Return NULL if there is unacked data at tcp * level. * * A backup subflow is returned only if that is the only kind available. */ struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) { struct sock *backup = NULL, *pick = NULL; struct mptcp_subflow_context *subflow; int min_stale_count = INT_MAX; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!__mptcp_subflow_active(subflow)) continue; /* still data outstanding at TCP level? skip this */ if (!tcp_rtx_and_write_queues_empty(ssk)) { mptcp_pm_subflow_chk_stale(msk, ssk); min_stale_count = min_t(int, min_stale_count, subflow->stale_count); continue; } if (subflow->backup || subflow->request_bkup) { if (!backup) backup = ssk; continue; } if (!pick) pick = ssk; } if (pick) return pick; /* use backup only if there are no progresses anywhere */ return min_stale_count > 1 ? backup : NULL; } bool __mptcp_retransmit_pending_data(struct sock *sk) { struct mptcp_data_frag *cur, *rtx_head; struct mptcp_sock *msk = mptcp_sk(sk); if (__mptcp_check_fallback(msk)) return false; /* the closing socket has some data untransmitted and/or unacked: * some data in the mptcp rtx queue has not really xmitted yet. * keep it simple and re-inject the whole mptcp level rtx queue */ mptcp_data_lock(sk); __mptcp_clean_una_wakeup(sk); rtx_head = mptcp_rtx_head(sk); if (!rtx_head) { mptcp_data_unlock(sk); return false; } msk->recovery_snd_nxt = msk->snd_nxt; msk->recovery = true; mptcp_data_unlock(sk); msk->first_pending = rtx_head; msk->snd_burst = 0; /* be sure to clear the "sent status" on all re-injected fragments */ list_for_each_entry(cur, &msk->rtx_queue, list) { if (!cur->already_sent) break; cur->already_sent = 0; } return true; } /* flags for __mptcp_close_ssk() */ #define MPTCP_CF_PUSH BIT(1) #define MPTCP_CF_FASTCLOSE BIT(2) /* be sure to send a reset only if the caller asked for it, also * clean completely the subflow status when the subflow reaches * TCP_CLOSE state */ static void __mptcp_subflow_disconnect(struct sock *ssk, struct mptcp_subflow_context *subflow, unsigned int flags) { if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || (flags & MPTCP_CF_FASTCLOSE)) { /* The MPTCP code never wait on the subflow sockets, TCP-level * disconnect should never fail */ WARN_ON_ONCE(tcp_disconnect(ssk, 0)); mptcp_subflow_ctx_reset(subflow); } else { tcp_shutdown(ssk, SEND_SHUTDOWN); } } /* subflow sockets can be either outgoing (connect) or incoming * (accept). * * Outgoing subflows use in-kernel sockets. * Incoming subflows do not have their own 'struct socket' allocated, * so we need to use tcp_close() after detaching them from the mptcp * parent socket. */ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow, unsigned int flags) { struct mptcp_sock *msk = mptcp_sk(sk); bool dispose_it, need_push = false; /* If the first subflow moved to a close state before accept, e.g. due * to an incoming reset or listener shutdown, the subflow socket is * already deleted by inet_child_forget() and the mptcp socket can't * survive too. */ if (msk->in_accept_queue && msk->first == ssk && (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) { /* ensure later check in mptcp_worker() will dispose the msk */ sock_set_flag(sk, SOCK_DEAD); mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1)); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); mptcp_subflow_drop_ctx(ssk); goto out_release; } dispose_it = msk->free_first || ssk != msk->first; if (dispose_it) list_del(&subflow->node); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) { /* be sure to force the tcp_close path * to generate the egress reset */ ssk->sk_lingertime = 0; sock_set_flag(ssk, SOCK_LINGER); subflow->send_fastclose = 1; } need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { __mptcp_subflow_disconnect(ssk, subflow, flags); release_sock(ssk); goto out; } subflow->disposable = 1; /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops * the ssk has been already destroyed, we just need to release the * reference owned by msk; */ if (!inet_csk(ssk)->icsk_ulp_ops) { WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD)); kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ __tcp_close(ssk, 0); /* close acquired an extra ref */ __sock_put(ssk); } out_release: __mptcp_subflow_error_report(sk, ssk); release_sock(ssk); sock_put(ssk); if (ssk == msk->first) WRITE_ONCE(msk->first, NULL); out: __mptcp_sync_sndbuf(sk); if (need_push) __mptcp_push_pending(sk, 0); /* Catch every 'all subflows closed' scenario, including peers silently * closing them, e.g. due to timeout. * For established sockets, allow an additional timeout before closing, * as the protocol can still create more subflows. */ if (list_is_singular(&msk->conn_list) && msk->first && inet_sk_state_load(msk->first) == TCP_CLOSE) { if (sk->sk_state != TCP_ESTABLISHED || msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) { mptcp_set_state(sk, TCP_CLOSE); mptcp_close_wake_up(sk); } else { mptcp_start_tout_timer(sk); } } } void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow) { /* The first subflow can already be closed and still in the list */ if (subflow->close_event_done) return; subflow->close_event_done = true; if (sk->sk_state == TCP_ESTABLISHED) mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); /* subflow aborted before reaching the fully_established status * attempt the creation of the next subflow */ mptcp_pm_subflow_check_next(mptcp_sk(sk), subflow); __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH); } static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) { return 0; } static void __mptcp_close_subflow(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); might_sleep(); mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int ssk_state = inet_sk_state_load(ssk); if (ssk_state != TCP_CLOSE && (ssk_state != TCP_CLOSE_WAIT || inet_sk_state_load(sk) != TCP_ESTABLISHED)) continue; /* 'subflow_data_ready' will re-sched once rx queue is empty */ if (!skb_queue_empty_lockless(&ssk->sk_receive_queue)) continue; mptcp_close_ssk(sk, ssk, subflow); } } static bool mptcp_close_tout_expired(const struct sock *sk) { if (!inet_csk(sk)->icsk_mtup.probe_timestamp || sk->sk_state == TCP_CLOSE) return false; return time_after32(tcp_jiffies32, inet_csk(sk)->icsk_mtup.probe_timestamp + mptcp_close_timeout(sk)); } static void mptcp_check_fastclose(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; if (likely(!READ_ONCE(msk->rcv_fastclose))) return; mptcp_token_destroy(msk); mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); bool slow; slow = lock_sock_fast(tcp_sk); if (tcp_sk->sk_state != TCP_CLOSE) { mptcp_send_active_reset_reason(tcp_sk); tcp_set_state(tcp_sk, TCP_CLOSE); } unlock_sock_fast(tcp_sk, slow); } /* Mirror the tcp_reset() error propagation */ switch (sk->sk_state) { case TCP_SYN_SENT: WRITE_ONCE(sk->sk_err, ECONNREFUSED); break; case TCP_CLOSE_WAIT: WRITE_ONCE(sk->sk_err, EPIPE); break; case TCP_CLOSE: return; default: WRITE_ONCE(sk->sk_err, ECONNRESET); } mptcp_set_state(sk, TCP_CLOSE); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); /* the calling mptcp_worker will properly destroy the socket */ if (sock_flag(sk, SOCK_DEAD)) return; sk->sk_state_change(sk); sk_error_report(sk); } static void __mptcp_retrans(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; struct sock *ssk; int ret, err; u16 len = 0; mptcp_clean_una_wakeup(sk); /* first check ssk: need to kick "stale" logic */ err = mptcp_sched_get_retrans(msk); dfrag = mptcp_rtx_head(sk); if (!dfrag) { if (mptcp_data_fin_enabled(msk)) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_retransmits++; mptcp_set_datafin_timeout(sk); mptcp_send_ack(msk); goto reset_timer; } if (!mptcp_send_head(sk)) return; goto reset_timer; } if (err) goto reset_timer; mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { u16 copied = 0; mptcp_subflow_set_scheduled(subflow, false); ssk = mptcp_subflow_tcp_sock(subflow); lock_sock(ssk); /* limit retransmission to the bytes already sent on some subflows */ info.sent = 0; info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; while (info.sent < info.limit) { ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) break; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); copied += ret; info.sent += ret; } if (copied) { len = max(copied, len); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); WRITE_ONCE(msk->allow_infinite_fallback, false); } release_sock(ssk); } } msk->bytes_retrans += len; dfrag->already_sent = max(dfrag->already_sent, len); reset_timer: mptcp_check_and_set_pending(sk); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); } /* schedule the timeout timer for the relevant event: either close timeout * or mp_fail timeout. The close timeout takes precedence on the mp_fail one */ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout) { struct sock *sk = (struct sock *)msk; unsigned long timeout, close_timeout; if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp) return; close_timeout = (unsigned long)inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + mptcp_close_timeout(sk); /* the close timeout takes precedence on the fail one, and here at least one of * them is active */ timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout; sk_reset_timer(sk, &sk->sk_timer, timeout); } static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) { struct sock *ssk = msk->first; bool slow; if (!ssk) return; pr_debug("MP_FAIL doesn't respond, reset the subflow\n"); slow = lock_sock_fast(ssk); mptcp_subflow_reset(ssk); WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0); unlock_sock_fast(ssk, slow); } static void mptcp_do_fastclose(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); mptcp_set_state(sk, TCP_CLOSE); mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, MPTCP_CF_FASTCLOSE); } static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *sk = (struct sock *)msk; unsigned long fail_tout; int state; lock_sock(sk); state = sk->sk_state; if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN))) goto unlock; mptcp_check_fastclose(msk); mptcp_pm_nl_work(msk); mptcp_check_send_data_fin(sk); mptcp_check_data_fin_ack(sk); mptcp_check_data_fin(sk); if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(sk); if (mptcp_close_tout_expired(sk)) { mptcp_do_fastclose(sk); mptcp_close_wake_up(sk); } if (sock_flag(sk, SOCK_DEAD) && sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); goto unlock; } if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0; if (fail_tout && time_after(jiffies, fail_tout)) mptcp_mp_fail_no_response(msk); unlock: release_sock(sk); sock_put(sk); } static void __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); INIT_WORK(&msk->work, mptcp_worker); __skb_queue_head_init(&msk->receive_queue); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; WRITE_ONCE(msk->rmem_fwd_alloc, 0); WRITE_ONCE(msk->rmem_released, 0); msk->timer_ival = TCP_RTO_MIN; msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; WRITE_ONCE(msk->first, NULL); inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); WRITE_ONCE(msk->allow_infinite_fallback, true); msk->recovery = false; msk->subflow_id = 1; msk->last_data_sent = tcp_jiffies32; msk->last_data_recv = tcp_jiffies32; msk->last_ack_recv = tcp_jiffies32; mptcp_pm_data_init(msk); /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); timer_setup(&sk->sk_timer, mptcp_tout_timer, 0); } static void mptcp_ca_reset(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_assign_congestion_control(sk); strscpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name, sizeof(mptcp_sk(sk)->ca_name)); /* no need to keep a reference to the ops, the name will suffice */ tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = NULL; } static int mptcp_init_sock(struct sock *sk) { struct net *net = sock_net(sk); int ret; __mptcp_init_sock(sk); if (!mptcp_is_enabled(net)) return -ENOPROTOOPT; if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) return -ENOMEM; rcu_read_lock(); ret = mptcp_init_sched(mptcp_sk(sk), mptcp_sched_find(mptcp_get_scheduler(net))); rcu_read_unlock(); if (ret) return ret; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will * propagate the correct value */ mptcp_ca_reset(sk); sk_sockets_allocated_inc(sk); sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]); sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]); return 0; } static void __mptcp_clear_xmit(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; WRITE_ONCE(msk->first_pending, NULL); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); } void mptcp_cancel_work(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); if (cancel_work_sync(&msk->work)) __sock_put(sk); } void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) { lock_sock(ssk); switch (ssk->sk_state) { case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; fallthrough; case TCP_SYN_SENT: WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK)); break; default: if (__mptcp_check_fallback(mptcp_sk(sk))) { pr_debug("Fallback\n"); ssk->sk_shutdown |= how; tcp_shutdown(ssk, how); /* simulate the data_fin ack reception to let the state * machine move forward */ WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt); mptcp_schedule_work(sk); } else { pr_debug("Sending DATA_FIN on subflow %p\n", ssk); tcp_send_ack(ssk); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); } break; } release_sock(ssk); } void mptcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); break; case TCP_CLOSE_WAIT: /* Unlike TCP, MPTCP sk would not have the TCP_SYN_RECV state: * MPTCP "accepted" sockets will be created later on. So no * transition from TCP_SYN_RECV to TCP_CLOSE_WAIT. */ break; default: if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT) MPTCP_DEC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); } inet_sk_state_store(sk, state); } static const unsigned char new_state[16] = { /* current state: new state: action: */ [0 /* (Invalid) */] = TCP_CLOSE, [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_SYN_SENT] = TCP_CLOSE, [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */ [TCP_CLOSE] = TCP_CLOSE, [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, [TCP_LAST_ACK] = TCP_LAST_ACK, [TCP_LISTEN] = TCP_CLOSE, [TCP_CLOSING] = TCP_CLOSING, [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int mptcp_close_state(struct sock *sk) { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; mptcp_set_state(sk, ns); return next & TCP_ACTION_FIN; } static void mptcp_check_send_data_fin(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu\n", msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), msk->snd_nxt, msk->write_seq); /* we still need to enqueue subflows or not really shutting down, * skip this */ if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq || mptcp_send_head(sk)) return; WRITE_ONCE(msk->snd_nxt, msk->write_seq); mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN); } } static void __mptcp_wr_shutdown(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d\n", msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, !!mptcp_send_head(sk)); /* will be ignored by fallback sockets */ WRITE_ONCE(msk->write_seq, msk->write_seq + 1); WRITE_ONCE(msk->snd_data_fin_enable, 1); mptcp_check_send_data_fin(sk); } static void __mptcp_destroy_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p\n", msk); might_sleep(); mptcp_stop_rtx_timer(sk); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; mptcp_release_sched(msk); sk->sk_prot->destroy(sk); WARN_ON_ONCE(READ_ONCE(msk->rmem_fwd_alloc)); WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); sock_put(sk); } void __mptcp_unaccepted_force_close(struct sock *sk) { sock_set_flag(sk, SOCK_DEAD); mptcp_do_fastclose(sk); __mptcp_destroy_sock(sk); } static __poll_t mptcp_check_readable(struct sock *sk) { return mptcp_epollin_ready(sk) ? EPOLLIN | EPOLLRDNORM : 0; } static void mptcp_check_listen_stop(struct sock *sk) { struct sock *ssk; if (inet_sk_state_load(sk) != TCP_LISTEN) return; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); ssk = mptcp_sk(sk)->first; if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN)) return; lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); tcp_set_state(ssk, TCP_CLOSE); mptcp_subflow_queue_clean(sk, ssk); inet_csk_listen_stop(ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); release_sock(ssk); } bool __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; int subflows_alive = 0; WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { mptcp_check_listen_stop(sk); mptcp_set_state(sk, TCP_CLOSE); goto cleanup; } if (mptcp_data_avail(msk) || timeout < 0) { /* If the msk has read data, or the caller explicitly ask it, * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose */ mptcp_do_fastclose(sk); timeout = 0; } else if (mptcp_close_state(sk)) { __mptcp_wr_shutdown(sk); } sk_stream_wait_close(sk, timeout); cleanup: /* orphan all the subflows */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); subflows_alive += ssk->sk_state != TCP_CLOSE; /* since the close timeout takes precedence on the fail one, * cancel the latter */ if (ssk == msk->first) subflow->fail_tout = 0; /* detach from the parent socket, but allow data_ready to * push incoming data into the mptcp stack, to properly ack it */ ssk->sk_socket = NULL; ssk->sk_wq = NULL; unlock_sock_fast(ssk, slow); } sock_orphan(sk); /* all the subflows are closed, only timeout can change the msk * state, let's not keep resources busy for no reasons */ if (subflows_alive == 0) mptcp_set_state(sk, TCP_CLOSE); sock_hold(sk); pr_debug("msk=%p state=%d\n", sk, sk->sk_state); mptcp_pm_connection_closed(msk); if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); do_cancel_work = true; } else { mptcp_start_tout_timer(sk); } return do_cancel_work; } static void mptcp_close(struct sock *sk, long timeout) { bool do_cancel_work; lock_sock(sk); do_cancel_work = __mptcp_close(sk, timeout); release_sock(sk); if (do_cancel_work) mptcp_cancel_work(sk); sock_put(sk); } static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); struct ipv6_pinfo *msk6 = inet6_sk(msk); msk->sk_v6_daddr = ssk->sk_v6_daddr; msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr; if (msk6 && ssk6) { msk6->saddr = ssk6->saddr; msk6->flow_label = ssk6->flow_label; } #endif inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num; inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport; inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport; inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr; inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr; inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; } static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); /* We are on the fastopen error path. We can't call straight into the * subflows cleanup code due to lock nesting (we are already under * msk->firstsocket lock). */ if (msk->fastopening) return -EBUSY; mptcp_check_listen_stop(sk); mptcp_set_state(sk, TCP_CLOSE); mptcp_stop_rtx_timer(sk); mptcp_stop_tout_timer(sk); mptcp_pm_connection_closed(msk); /* msk->subflow is still intact, the following will not free the first * subflow */ mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); WRITE_ONCE(msk->flags, 0); msk->cb_flags = 0; msk->recovery = false; WRITE_ONCE(msk->can_ack, false); WRITE_ONCE(msk->fully_established, false); WRITE_ONCE(msk->rcv_data_fin, false); WRITE_ONCE(msk->snd_data_fin_enable, false); WRITE_ONCE(msk->rcv_fastclose, false); WRITE_ONCE(msk->use_64bit_ack, false); WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); msk->bytes_consumed = 0; msk->bytes_acked = 0; msk->bytes_received = 0; msk->bytes_sent = 0; msk->bytes_retrans = 0; msk->rcvspace_init = 0; WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); return 0; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) { unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo); return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } static void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk) { const struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt; struct ipv6_pinfo *newnp; newnp = inet6_sk(newsk); rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); if (!opt) net_warn_ratelimited("%s: Failed to copy ip6 options\n", __func__); } RCU_INIT_POINTER(newnp->opt, opt); rcu_read_unlock(); } #endif static void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk) { struct ip_options_rcu *inet_opt, *newopt = NULL; const struct inet_sock *inet = inet_sk(sk); struct inet_sock *newinet; newinet = inet_sk(newsk); rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); if (newopt) memcpy(newopt, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); else net_warn_ratelimited("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); rcu_read_unlock(); } struct sock *mptcp_sk_clone_init(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct sock *ssk, struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; if (!nsk) return NULL; #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (nsk->sk_family == AF_INET6) inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); #endif __mptcp_init_sock(nsk); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (nsk->sk_family == AF_INET6) mptcp_copy_ip6_options(nsk, sk); else #endif mptcp_copy_ip_options(nsk, sk); msk = mptcp_sk(nsk); WRITE_ONCE(msk->local_key, subflow_req->local_key); WRITE_ONCE(msk->token, subflow_req->token); msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); WRITE_ONCE(msk->write_seq, subflow_req->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->snd_una, msk->write_seq); WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; mptcp_init_sched(msk, mptcp_sk(sk)->sched); /* passive msk is created after the first/MPC subflow */ msk->subflow_id = 2; sock_reset_flag(nsk, SOCK_RCU_FREE); security_inet_csk_clone(nsk, req); /* this can't race with mptcp_close(), as the msk is * not yet exposted to user-space */ mptcp_set_state(nsk, TCP_ESTABLISHED); /* The msk maintain a ref to each subflow in the connections list */ WRITE_ONCE(msk->first, ssk); subflow = mptcp_subflow_ctx(ssk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssk); /* new mpc subflow takes ownership of the newly * created mptcp socket */ mptcp_token_accept(subflow_req, msk); /* set msk addresses early to ensure mptcp_pm_get_local_id() * uses the correct data */ mptcp_copy_inaddrs(nsk, ssk); __mptcp_propagate_sndbuf(nsk, ssk); mptcp_rcv_space_init(msk, ssk); if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) __mptcp_subflow_fully_established(msk, subflow, mp_opt); bh_unlock_sock(nsk); /* note: the newly allocated socket refcount is 2 now */ return nsk; } void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) { const struct tcp_sock *tp = tcp_sk(ssk); msk->rcvspace_init = 1; msk->rcvq_space.copied = 0; msk->rcvq_space.rtt_us = 0; msk->rcvq_space.time = tp->tcp_mstamp; /* initial rcv_space offering made to peer */ msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss); if (msk->rcvq_space.space == 0) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; } void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; __mptcp_clear_xmit(sk); /* join list will be eventually flushed (with rst) at sock lock release time */ mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ mptcp_data_lock(sk); skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); mptcp_data_unlock(sk); /* move all the rx fwd alloc into the sk_mem_reclaim_final in * inet_sock_destruct() will dispose it */ sk_forward_alloc_add(sk, msk->rmem_fwd_alloc); WRITE_ONCE(msk->rmem_fwd_alloc, 0); mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); mptcp_free_local_addr_list(msk); } static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); /* allow the following to close even the initial subflow */ msk->free_first = 1; mptcp_destroy_common(msk, 0); sk_sockets_allocated_dec(sk); } void __mptcp_data_acked(struct sock *sk) { if (!sock_owned_by_user(sk)) __mptcp_clean_una(sk); else __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags); } void __mptcp_check_push(struct sock *sk, struct sock *ssk) { if (!mptcp_send_head(sk)) return; if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, false); else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); } #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ BIT(MPTCP_RETRANSMIT) | \ BIT(MPTCP_FLUSH_JOIN_LIST)) /* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) __must_hold(&sk->sk_lock.slock) { struct mptcp_sock *msk = mptcp_sk(sk); for (;;) { unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED); struct list_head join_list; if (!flags) break; INIT_LIST_HEAD(&join_list); list_splice_init(&msk->join_list, &join_list); /* the following actions acquire the subflow socket lock * * 1) can't be invoked in atomic scope * 2) must avoid ABBA deadlock with msk socket spinlock: the RX * datapath acquires the msk socket spinlock while helding * the subflow socket lock */ msk->cb_flags &= ~flags; spin_unlock_bh(&sk->sk_lock.slock); if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) __mptcp_flush_join_list(sk, &join_list); if (flags & BIT(MPTCP_PUSH_PENDING)) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) __mptcp_retrans(sk); cond_resched(); spin_lock_bh(&sk->sk_lock.slock); } if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); if (unlikely(msk->cb_flags)) { /* be sure to sync the msk state before taking actions * depending on sk_state (MPTCP_ERROR_REPORT) * On sk release avoid actions depending on the first subflow */ if (__test_and_clear_bit(MPTCP_SYNC_STATE, &msk->cb_flags) && msk->first) __mptcp_sync_state(sk, msk->pending_state); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags)) __mptcp_sync_sndbuf(sk); } __mptcp_update_rmem(sk); } /* MP_JOIN client subflow must wait for 4th ack before sending any data: * TCP can't schedule delack timer before the subflow is fully established. * MPTCP uses the delack timer to do 3rd ack retransmissions */ static void schedule_3rdack_retransmission(struct sock *ssk) { struct inet_connection_sock *icsk = inet_csk(ssk); struct tcp_sock *tp = tcp_sk(ssk); unsigned long timeout; if (READ_ONCE(mptcp_subflow_ctx(ssk)->fully_established)) return; /* reschedule with a timeout above RTT, as we must look only for drop */ if (tp->srtt_us) timeout = usecs_to_jiffies(tp->srtt_us >> (3 - 1)); else timeout = TCP_TIMEOUT_INIT; timeout += jiffies; WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); icsk->icsk_ack.timeout = timeout; sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); } void mptcp_subflow_process_delegated(struct sock *ssk, long status) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; if (status & BIT(MPTCP_DELEGATE_SEND)) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, true); else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } if (status & BIT(MPTCP_DELEGATE_SNDBUF)) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_sync_sndbuf(sk); else __set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } if (status & BIT(MPTCP_DELEGATE_ACK)) schedule_3rdack_retransmission(ssk); } static int mptcp_hash(struct sock *sk) { /* should never be called, * we hash the TCP subflows not the MPTCP socket */ WARN_ON_ONCE(1); return 0; } static void mptcp_unhash(struct sock *sk) { /* called from sk_common_release(), but nothing to do here */ } static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p, ssk=%p\n", msk, msk->first); if (WARN_ON_ONCE(!msk->first)) return -EINVAL; return inet_csk_get_port(msk->first, snum); } void mptcp_finish_connect(struct sock *ssk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; struct sock *sk; subflow = mptcp_subflow_ctx(ssk); sk = subflow->conn; msk = mptcp_sk(sk); pr_debug("msk=%p, token=%u\n", sk, subflow->token); subflow->map_seq = subflow->iasn; subflow->map_subflow_seq = 1; /* the socket is not connected yet, no msk/subflow ops can access/race * accessing the field below */ WRITE_ONCE(msk->local_key, subflow->local_key); mptcp_pm_new_connection(msk, ssk, 0); } void mptcp_sock_graft(struct sock *sk, struct socket *parent) { write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; write_unlock_bh(&sk->sk_callback_lock); } bool mptcp_finish_join(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct sock *parent = (void *)msk; bool ret = true; pr_debug("msk=%p, subflow=%p\n", msk, subflow); /* mptcp socket already closing? */ if (!mptcp_is_fully_established(parent)) { subflow->reset_reason = MPTCP_RST_EMPTCP; return false; } /* active subflow, already present inside the conn_list */ if (!list_empty(&subflow->node)) { mptcp_subflow_joined(msk, ssk); mptcp_propagate_sndbuf(parent, ssk); return true; } if (!mptcp_pm_allow_new_subflow(msk)) goto err_prohibited; /* If we can't acquire msk socket lock here, let the release callback * handle it */ mptcp_data_lock(parent); if (!sock_owned_by_user(parent)) { ret = __mptcp_finish_join(msk, ssk); if (ret) { sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); } } else { sock_hold(ssk); list_add_tail(&subflow->node, &msk->join_list); __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags); } mptcp_data_unlock(parent); if (!ret) { err_prohibited: subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; } return true; } static void mptcp_shutdown(struct sock *sk, int how) { pr_debug("sk=%p, how=%d\n", sk, how); if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) __mptcp_wr_shutdown(sk); } static int mptcp_forward_alloc_get(const struct sock *sk) { return READ_ONCE(sk->sk_forward_alloc) + READ_ONCE(mptcp_sk(sk)->rmem_fwd_alloc); } static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) { const struct sock *sk = (void *)msk; u64 delta; if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) return 0; delta = msk->write_seq - v; if (__mptcp_check_fallback(msk) && msk->first) { struct tcp_sock *tp = tcp_sk(msk->first); /* the first subflow is disconnected after close - see * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq * so ignore that status, too. */ if (!((1 << msk->first->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) delta += READ_ONCE(tp->write_seq) - tp->snd_una; } if (delta > INT_MAX) delta = INT_MAX; return (int)delta; } static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) { struct mptcp_sock *msk = mptcp_sk(sk); bool slow; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; lock_sock(sk); __mptcp_move_skbs(msk); *karg = mptcp_inq_hint(sk); release_sock(sk); break; case SIOCOUTQ: slow = lock_sock_fast(sk); *karg = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); unlock_sock_fast(sk, slow); break; case SIOCOUTQNSD: slow = lock_sock_fast(sk); *karg = mptcp_ioctl_outq(msk, msk->snd_nxt); unlock_sock_fast(sk, slow); break; default: return -ENOIOCTLCMD; } return 0; } static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); int err = -EINVAL; struct sock *ssk; ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) return PTR_ERR(ssk); mptcp_set_state(sk, TCP_SYN_SENT); subflow = mptcp_subflow_ctx(ssk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info)) mptcp_subflow_early_fallback(msk, subflow); #endif if (subflow->request_mptcp) { if (mptcp_active_should_disable(sk)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEACTIVEDISABLED); mptcp_subflow_early_fallback(msk, subflow); } else if (mptcp_token_new_connect(ssk) < 0) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT); mptcp_subflow_early_fallback(msk, subflow); } } WRITE_ONCE(msk->write_seq, subflow->idsn); WRITE_ONCE(msk->snd_nxt, subflow->idsn); WRITE_ONCE(msk->snd_una, subflow->idsn); if (likely(!__mptcp_check_fallback(msk))) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); /* if reaching here via the fastopen/sendmsg path, the caller already * acquired the subflow socket lock, too. */ if (!msk->fastopening) lock_sock(ssk); /* the following mirrors closely a very small chunk of code from * __inet_stream_connect() */ if (ssk->sk_state != TCP_CLOSE) goto out; if (BPF_CGROUP_PRE_CONNECT_ENABLED(ssk)) { err = ssk->sk_prot->pre_connect(ssk, uaddr, addr_len); if (err) goto out; } err = ssk->sk_prot->connect(ssk, uaddr, addr_len); if (err < 0) goto out; inet_assign_bit(DEFER_CONNECT, sk, inet_test_bit(DEFER_CONNECT, ssk)); out: if (!msk->fastopening) release_sock(ssk); /* on successful connect, the msk state will be moved to established by * subflow_finish_connect() */ if (unlikely(err)) { /* avoid leaving a dangling token in an unconnected socket */ mptcp_token_destroy(msk); mptcp_set_state(sk, TCP_CLOSE); return err; } mptcp_copy_inaddrs(sk, ssk); return 0; } static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, .init = mptcp_init_sock, .connect = mptcp_connect, .disconnect = mptcp_disconnect, .close = mptcp_close, .setsockopt = mptcp_setsockopt, .getsockopt = mptcp_getsockopt, .shutdown = mptcp_shutdown, .destroy = mptcp_destroy, .sendmsg = mptcp_sendmsg, .ioctl = mptcp_ioctl, .recvmsg = mptcp_recvmsg, .release_cb = mptcp_release_cb, .hash = mptcp_hash, .unhash = mptcp_unhash, .get_port = mptcp_get_port, .forward_alloc_get = mptcp_forward_alloc_get, .stream_memory_free = mptcp_stream_memory_free, .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, .no_autobind = true, }; static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *sk = sock->sk; int err = -EINVAL; lock_sock(sk); ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { err = PTR_ERR(ssk); goto unlock; } if (sk->sk_family == AF_INET) err = inet_bind_sk(ssk, uaddr, addr_len); #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (sk->sk_family == AF_INET6) err = inet6_bind_sk(ssk, uaddr, addr_len); #endif if (!err) mptcp_copy_inaddrs(sk, ssk); unlock: release_sock(sk); return err; } static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *sk = sock->sk; struct sock *ssk; int err; pr_debug("msk=%p\n", msk); lock_sock(sk); err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) goto unlock; ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { err = PTR_ERR(ssk); goto unlock; } mptcp_set_state(sk, TCP_LISTEN); sock_set_flag(sk, SOCK_RCU_FREE); lock_sock(ssk); err = __inet_listen_sk(ssk, backlog); release_sock(ssk); mptcp_set_state(sk, inet_sk_state_load(ssk)); if (!err) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); mptcp_copy_inaddrs(sk, ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); } unlock: release_sock(sk); return err; } static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *newsk; pr_debug("msk=%p\n", msk); /* Buggy applications can call accept on socket states other then LISTEN * but no need to allocate the first subflow just to error out. */ ssk = READ_ONCE(msk->first); if (!ssk) return -EINVAL; pr_debug("ssk=%p, listener=%p\n", ssk, mptcp_subflow_ctx(ssk)); newsk = inet_csk_accept(ssk, arg); if (!newsk) return arg->err; pr_debug("newsk=%p, subflow is mptcp=%d\n", newsk, sk_is_mptcp(newsk)); if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; subflow = mptcp_subflow_ctx(newsk); new_mptcp_sock = subflow->conn; /* is_mptcp should be false if subflow->conn is missing, see * subflow_syn_recv_sock() */ if (WARN_ON_ONCE(!new_mptcp_sock)) { tcp_sk(newsk)->is_mptcp = 0; goto tcpfallback; } newsk = new_mptcp_sock; MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); msk = mptcp_sk(newsk); msk->in_accept_queue = 0; /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } /* Do late cleanup for the first subflow as necessary. Also * deal with bad peers not doing a complete shutdown. */ if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { __mptcp_close_ssk(newsk, msk->first, mptcp_subflow_ctx(msk->first), 0); if (unlikely(list_is_singular(&msk->conn_list))) mptcp_set_state(newsk, TCP_CLOSE); } } else { tcpfallback: newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); /* we are being invoked after accepting a non-mp-capable * flow: sk is a tcp_sk, not an mptcp one. * * Hand the socket over to tcp so all further socket ops * bypass mptcp. */ WRITE_ONCE(newsock->sk->sk_socket->ops, mptcp_fallback_tcp_ops(newsock->sk)); } release_sock(newsk); return 0; } static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; if (__mptcp_stream_is_writeable(sk, 1)) return EPOLLOUT | EPOLLWRNORM; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */ if (__mptcp_stream_is_writeable(sk, 1)) return EPOLLOUT | EPOLLWRNORM; return 0; } static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { struct sock *sk = sock->sk; struct mptcp_sock *msk; __poll_t mask = 0; u8 shutdown; int state; msk = mptcp_sk(sk); sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx\n", msk, state, msk->flags); if (state == TCP_LISTEN) { struct sock *ssk = READ_ONCE(msk->first); if (WARN_ON_ONCE(!ssk)) return 0; return inet_csk_listen_poll(ssk); } shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(sk); if (shutdown & SEND_SHUTDOWN) mask |= EPOLLOUT | EPOLLWRNORM; else mask |= mptcp_check_writeable(msk); } else if (state == TCP_SYN_SENT && inet_test_bit(DEFER_CONNECT, sk)) { /* cf tcp_poll() note about TFO */ mask |= EPOLLOUT | EPOLLWRNORM; } /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; return mask; } static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = mptcp_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet_getname, .poll = mptcp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .set_rcvlowat = mptcp_set_rcvlowat, }; static struct inet_protosw mptcp_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_prot, .ops = &mptcp_stream_ops, .flags = INET_PROTOSW_ICSK, }; static int mptcp_napi_poll(struct napi_struct *napi, int budget) { struct mptcp_delegated_action *delegated; struct mptcp_subflow_context *subflow; int work_done = 0; delegated = container_of(napi, struct mptcp_delegated_action, napi); while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bh_lock_sock_nested(ssk); if (!sock_owned_by_user(ssk)) { mptcp_subflow_process_delegated(ssk, xchg(&subflow->delegated_status, 0)); } else { /* tcp_release_cb_override already processed * the action or will do at next release_sock(). * In both case must dequeue the subflow here - on the same * CPU that scheduled it. */ smp_wmb(); clear_bit(MPTCP_DELEGATE_SCHEDULED, &subflow->delegated_status); } bh_unlock_sock(ssk); sock_put(ssk); if (++work_done == budget) return budget; } /* always provide a 0 'work_done' argument, so that napi_complete_done * will not try accessing the NULL napi->dev ptr */ napi_complete_done(napi, 0); return work_done; } void __init mptcp_proto_init(void) { struct mptcp_delegated_action *delegated; int cpu; mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) panic("Failed to allocate MPTCP pcpu counter\n"); init_dummy_netdev(&mptcp_napi_dev); for_each_possible_cpu(cpu) { delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); INIT_LIST_HEAD(&delegated->head); netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll); napi_enable(&delegated->napi); } mptcp_subflow_init(); mptcp_pm_init(); mptcp_sched_init(); mptcp_token_init(); if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); inet_register_protosw(&mptcp_protosw); BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb)); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static const struct proto_ops mptcp_v6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = mptcp_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet6_getname, .poll = mptcp_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet6_sendmsg, .recvmsg = inet6_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif .set_rcvlowat = mptcp_set_rcvlowat, }; static struct proto mptcp_v6_prot; static struct inet_protosw mptcp_v6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_v6_prot, .ops = &mptcp_v6_stream_ops, .flags = INET_PROTOSW_ICSK, }; int __init mptcp_proto_v6_init(void) { int err; mptcp_v6_prot = mptcp_prot; strscpy(mptcp_v6_prot.name, "MPTCPv6", sizeof(mptcp_v6_prot.name)); mptcp_v6_prot.slab = NULL; mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np); err = proto_register(&mptcp_v6_prot, 1); if (err) return err; err = inet6_register_protosw(&mptcp_v6_protosw); if (err) proto_unregister(&mptcp_v6_prot); return err; } #endif
668 255 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM signal #if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SIGNAL_H #include <linux/signal.h> #include <linux/sched.h> #include <linux/tracepoint.h> #define TP_STORE_SIGINFO(__entry, info) \ do { \ if (info == SEND_SIG_NOINFO) { \ __entry->errno = 0; \ __entry->code = SI_USER; \ } else if (info == SEND_SIG_PRIV) { \ __entry->errno = 0; \ __entry->code = SI_KERNEL; \ } else { \ __entry->errno = info->si_errno; \ __entry->code = info->si_code; \ } \ } while (0) #ifndef TRACE_HEADER_MULTI_READ enum { TRACE_SIGNAL_DELIVERED, TRACE_SIGNAL_IGNORED, TRACE_SIGNAL_ALREADY_PENDING, TRACE_SIGNAL_OVERFLOW_FAIL, TRACE_SIGNAL_LOSE_INFO, }; #endif /** * signal_generate - called when a signal is generated * @sig: signal number * @info: pointer to struct siginfo * @task: pointer to struct task_struct * @group: shared or private * @result: TRACE_SIGNAL_* * * Current process sends a 'sig' signal to 'task' process with * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV, * 'info' is not a pointer and you can't access its field. Instead, * SEND_SIG_NOINFO means that si_code is SI_USER, and SEND_SIG_PRIV * means that si_code is SI_KERNEL. */ TRACE_EVENT(signal_generate, TP_PROTO(int sig, struct kernel_siginfo *info, struct task_struct *task, int group, int result), TP_ARGS(sig, info, task, group, result), TP_STRUCT__entry( __field( int, sig ) __field( int, errno ) __field( int, code ) __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, group ) __field( int, result ) ), TP_fast_assign( __entry->sig = sig; TP_STORE_SIGINFO(__entry, info); memcpy(__entry->comm, task->comm, TASK_COMM_LEN); __entry->pid = task->pid; __entry->group = group; __entry->result = result; ), TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d grp=%d res=%d", __entry->sig, __entry->errno, __entry->code, __entry->comm, __entry->pid, __entry->group, __entry->result) ); /** * signal_deliver - called when a signal is delivered * @sig: signal number * @info: pointer to struct siginfo * @ka: pointer to struct k_sigaction * * A 'sig' signal is delivered to current process with 'info' siginfo, * and it will be handled by 'ka'. ka->sa.sa_handler can be SIG_IGN or * SIG_DFL. * Note that some signals reported by signal_generate tracepoint can be * lost, ignored or modified (by debugger) before hitting this tracepoint. * This means, this can show which signals are actually delivered, but * matching generated signals and delivered signals may not be correct. */ TRACE_EVENT(signal_deliver, TP_PROTO(int sig, struct kernel_siginfo *info, struct k_sigaction *ka), TP_ARGS(sig, info, ka), TP_STRUCT__entry( __field( int, sig ) __field( int, errno ) __field( int, code ) __field( unsigned long, sa_handler ) __field( unsigned long, sa_flags ) ), TP_fast_assign( __entry->sig = sig; TP_STORE_SIGINFO(__entry, info); __entry->sa_handler = (unsigned long)ka->sa.sa_handler; __entry->sa_flags = ka->sa.sa_flags; ), TP_printk("sig=%d errno=%d code=%d sa_handler=%lx sa_flags=%lx", __entry->sig, __entry->errno, __entry->code, __entry->sa_handler, __entry->sa_flags) ); #endif /* _TRACE_SIGNAL_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
71 1 16 54 55 70 101 102 102 2 101 92 24 13 29 472 473 472 785 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 /* * Performance events x86 architecture code * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * Copyright (C) 2009 Google, Inc., Stephane Eranian * * For licencing details see kernel-base/COPYING */ #include <linux/perf_event.h> #include <linux/capability.h> #include <linux/notifier.h> #include <linux/hardirq.h> #include <linux/kprobes.h> #include <linux/export.h> #include <linux/init.h> #include <linux/kdebug.h> #include <linux/sched/mm.h> #include <linux/sched/clock.h> #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/bitops.h> #include <linux/device.h> #include <linux/nospec.h> #include <linux/static_call.h> #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/nmi.h> #include <asm/smp.h> #include <asm/alternative.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/desc.h> #include <asm/ldt.h> #include <asm/unwind.h> #include <asm/uprobes.h> #include <asm/ibt.h> #include "perf_event.h" struct x86_pmu x86_pmu __read_mostly; static struct pmu pmu; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, .pmu = &pmu, }; DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); DEFINE_STATIC_KEY_FALSE(perf_is_hybrid); /* * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined * from just a typename, as opposed to an actual function. */ DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq); DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable); DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable); DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign); DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period); DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task); DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. */ DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ u64 x86_perf_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.cntval_bits; u64 prev_raw_count, new_raw_count; u64 delta; if (unlikely(!hwc->event_base)) return 0; /* * Careful: an NMI might modify the previous event value. * * Our tactic to handle this is to first atomically read and * exchange a new raw count - then add that new-prev delta * count to the generic event atomically: */ prev_raw_count = local64_read(&hwc->prev_count); do { rdpmcl(hwc->event_base_rdpmc, new_raw_count); } while (!local64_try_cmpxchg(&hwc->prev_count, &prev_raw_count, new_raw_count)); /* * Now we have the new raw value and have updated the prev * timestamp already. We can now calculate the elapsed delta * (event-)time and add that to the generic event. * * Careful, not all hw sign-extends above the physical width * of the count. */ delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; local64_add(delta, &event->count); local64_sub(delta, &hwc->period_left); return new_raw_count; } /* * Find and validate any extra registers to set up. */ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) { struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs); struct hw_perf_event_extra *reg; struct extra_reg *er; reg = &event->hw.extra_reg; if (!extra_regs) return 0; for (er = extra_regs; er->msr; er++) { if (er->event != (config & er->config_mask)) continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; /* Check if the extra msrs can be safely accessed*/ if (!er->extra_msr_access) return -ENXIO; reg->idx = er->idx; reg->config = event->attr.config1; reg->reg = er->msr; break; } return 0; } static atomic_t active_events; static atomic_t pmc_refcount; static DEFINE_MUTEX(pmc_reserve_mutex); #ifdef CONFIG_X86_LOCAL_APIC static inline u64 get_possible_counter_mask(void) { u64 cntr_mask = x86_pmu.cntr_mask64; int i; if (!is_hybrid()) return cntr_mask; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) cntr_mask |= x86_pmu.hybrid_pmu[i].cntr_mask64; return cntr_mask; } static bool reserve_pmc_hardware(void) { u64 cntr_mask = get_possible_counter_mask(); int i, end; for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } return true; eventsel_fail: end = i; for_each_set_bit(i, (unsigned long *)&cntr_mask, end) release_evntsel_nmi(x86_pmu_config_addr(i)); i = X86_PMC_IDX_MAX; perfctr_fail: end = i; for_each_set_bit(i, (unsigned long *)&cntr_mask, end) release_perfctr_nmi(x86_pmu_event_addr(i)); return false; } static void release_pmc_hardware(void) { u64 cntr_mask = get_possible_counter_mask(); int i; for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { release_perfctr_nmi(x86_pmu_event_addr(i)); release_evntsel_nmi(x86_pmu_config_addr(i)); } } #else static bool reserve_pmc_hardware(void) { return true; } static void release_pmc_hardware(void) {} #endif bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask, unsigned long *fixed_cntr_mask) { u64 val, val_fail = -1, val_new= ~0; int i, reg, reg_fail = -1, ret = 0; int bios_fail = 0; int reg_safe = -1; /* * Check to see if the BIOS enabled any of the counters, if so * complain and bail. */ for_each_set_bit(i, cntr_mask, X86_PMC_IDX_MAX) { reg = x86_pmu_config_addr(i); ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { bios_fail = 1; val_fail = val; reg_fail = reg; } else { reg_safe = i; } } if (*(u64 *)fixed_cntr_mask) { reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; for_each_set_bit(i, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(i, pmu)) continue; if (val & (0x03ULL << i*4)) { bios_fail = 1; val_fail = val; reg_fail = reg; } } } /* * If all the counters are enabled, the below test will always * fail. The tools will also become useless in this scenario. * Just fail and disable the hardware counters. */ if (reg_safe == -1) { reg = reg_safe; goto msr_fail; } /* * Read the current value, change it and read it back to see if it * matches, this is needed to detect certain hardware emulators * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ reg = x86_pmu_event_addr(reg_safe); if (rdmsrl_safe(reg, &val)) goto msr_fail; val ^= 0xffffUL; ret = wrmsrl_safe(reg, val); ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) goto msr_fail; /* * We still allow the PMU driver to operate: */ if (bios_fail) { pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail); } return true; msr_fail: if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { pr_cont("PMU not available due to virtualization, using software events only.\n"); } else { pr_cont("Broken PMU hardware detected, using software events only.\n"); pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); } return false; } static void hw_perf_event_destroy(struct perf_event *event) { x86_release_hardware(); atomic_dec(&active_events); } void hw_perf_lbr_event_destroy(struct perf_event *event) { hw_perf_event_destroy(event); /* undo the lbr/bts event accounting */ x86_del_exclusive(x86_lbr_exclusive_lbr); } static inline int x86_pmu_initialized(void) { return x86_pmu.handle_irq != NULL; } static inline int set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) { struct perf_event_attr *attr = &event->attr; unsigned int cache_type, cache_op, cache_result; u64 config, val; config = attr->config; cache_type = (config >> 0) & 0xff; if (cache_type >= PERF_COUNT_HW_CACHE_MAX) return -EINVAL; cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX); cache_op = (config >> 8) & 0xff; if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) return -EINVAL; cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX); cache_result = (config >> 16) & 0xff; if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) return -EINVAL; cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX); val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result]; if (val == 0) return -ENOENT; if (val == -1) return -EINVAL; hwc->config |= val; attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result]; return x86_pmu_extra_regs(val, event); } int x86_reserve_hardware(void) { int err = 0; if (!atomic_inc_not_zero(&pmc_refcount)) { mutex_lock(&pmc_reserve_mutex); if (atomic_read(&pmc_refcount) == 0) { if (!reserve_pmc_hardware()) { err = -EBUSY; } else { reserve_ds_buffers(); reserve_lbr_buffers(); } } if (!err) atomic_inc(&pmc_refcount); mutex_unlock(&pmc_reserve_mutex); } return err; } void x86_release_hardware(void) { if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { release_pmc_hardware(); release_ds_buffers(); release_lbr_buffers(); mutex_unlock(&pmc_reserve_mutex); } } /* * Check if we can create event of a certain type (that no conflicting events * are present). */ int x86_add_exclusive(unsigned int what) { int i; /* * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS. * LBR and BTS are still mutually exclusive. */ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) goto out; if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { mutex_lock(&pmc_reserve_mutex); for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) goto fail_unlock; } atomic_inc(&x86_pmu.lbr_exclusive[what]); mutex_unlock(&pmc_reserve_mutex); } out: atomic_inc(&active_events); return 0; fail_unlock: mutex_unlock(&pmc_reserve_mutex); return -EBUSY; } void x86_del_exclusive(unsigned int what) { atomic_dec(&active_events); /* * See the comment in x86_add_exclusive(). */ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) return; atomic_dec(&x86_pmu.lbr_exclusive[what]); } int x86_setup_perfctr(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; u64 config; if (!is_sampling_event(event)) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); } if (attr->type == event->pmu->type) return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) return set_ext_hw_attr(hwc, event); if (attr->config >= x86_pmu.max_events) return -EINVAL; attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events); /* * The generic map: */ config = x86_pmu.event_map(attr->config); if (config == 0) return -ENOENT; if (config == -1LL) return -EINVAL; hwc->config |= config; return 0; } /* * check that branch_sample_type is compatible with * settings needed for precise_ip > 1 which implies * using the LBR to capture ALL taken branches at the * priv levels of the measurement */ static inline int precise_br_compat(struct perf_event *event) { u64 m = event->attr.branch_sample_type; u64 b = 0; /* must capture all branches */ if (!(m & PERF_SAMPLE_BRANCH_ANY)) return 0; m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_user) b |= PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_kernel) b |= PERF_SAMPLE_BRANCH_KERNEL; /* * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 */ return m == b; } int x86_pmu_max_precise(void) { int precise = 0; /* Support for constant skid */ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; /* Support for IP fixup */ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; if (x86_pmu.pebs_prec_dist) precise++; } return precise; } int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { int precise = x86_pmu_max_precise(); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; /* There's no sense in having PEBS for non sampling events: */ if (!is_sampling_event(event)) return -EINVAL; } /* * check that PEBS LBR correction does not conflict with * whatever the user is asking with attr->branch_sample_type */ if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { u64 *br_type = &event->attr.branch_sample_type; if (has_branch_stack(event)) { if (!precise_br_compat(event)) return -EOPNOTSUPP; /* branch_sample_type is compatible */ } else { /* * user did not specify branch_sample_type * * For PEBS fixups, we capture all * the branches at the priv level of the * event. */ *br_type = PERF_SAMPLE_BRANCH_ANY; if (!event->attr.exclude_user) *br_type |= PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_kernel) *br_type |= PERF_SAMPLE_BRANCH_KERNEL; } } if (branch_sample_call_stack(event)) event->attach_state |= PERF_ATTACH_TASK_DATA; /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ event->hw.config = ARCH_PERFMON_EVENTSEL_INT; /* * Count user and OS events unless requested not to */ if (!event->attr.exclude_user) event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; if (!event->attr.exclude_kernel) event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; if (event->attr.type == event->pmu->type) event->hw.config |= x86_pmu_get_event_config(event); if (event->attr.sample_period && x86_pmu.limit_period) { s64 left = event->attr.sample_period; x86_pmu.limit_period(event, &left); if (left > event->attr.sample_period) return -EINVAL; } /* sample_regs_user never support XMM registers */ if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK)) return -EINVAL; /* * Besides the general purpose registers, XMM registers may * be collected in PEBS on some platforms, e.g. Icelake */ if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) { if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS)) return -EINVAL; if (!event->attr.precise_ip) return -EINVAL; } return x86_setup_perfctr(event); } /* * Setup the hardware configuration for a given attr_type */ static int __x86_pmu_event_init(struct perf_event *event) { int err; if (!x86_pmu_initialized()) return -ENODEV; err = x86_reserve_hardware(); if (err) return err; atomic_inc(&active_events); event->destroy = hw_perf_event_destroy; event->hw.idx = -1; event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; event->hw.branch_reg.idx = EXTRA_REG_NONE; return x86_pmu.hw_config(event); } void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(x86_pmu_config_addr(idx), val); if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(x86_pmu_config_addr(idx), val); if (is_counter_pair(hwc)) wrmsrl(x86_pmu_config_addr(idx + 1), 0); } } struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data) { return static_call(x86_pmu_guest_get_msrs)(nr, data); } EXPORT_SYMBOL_GPL(perf_guest_get_msrs); /* * There may be PMI landing after enabled=0. The PMI hitting could be before or * after disable_all. * * If PMI hits before disable_all, the PMU will be disabled in the NMI handler. * It will not be re-enabled in the NMI handler again, because enabled=0. After * handling the NMI, disable_all will be called, which will not change the * state either. If PMI hits after disable_all, the PMU is already disabled * before entering NMI handler. The NMI handler will not change the state * either. * * So either situation is harmless. */ static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!x86_pmu_initialized()) return; if (!cpuc->enabled) return; cpuc->n_added = 0; cpuc->enabled = 0; barrier(); static_call(x86_pmu_disable_all)(); } void x86_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; if (!test_bit(idx, cpuc->active_mask)) continue; __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } } static inline int is_x86_event(struct perf_event *event) { int i; if (!is_hybrid()) return event->pmu == &pmu; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu) return true; } return false; } struct pmu *x86_get_pmu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); /* * All CPUs of the hybrid type have been offline. * The x86_get_pmu() should not be invoked. */ if (WARN_ON_ONCE(!cpuc->pmu)) return &pmu; return cpuc->pmu; } /* * Event scheduler state: * * Assign events iterating over all events and counters, beginning * with events with least weights first. Keep the current iterator * state in struct sched_state. */ struct sched_state { int weight; int event; /* event index */ int counter; /* counter index */ int unassigned; /* number of events to be assigned left */ int nr_gp; /* number of GP counters used */ u64 used; }; /* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ #define SCHED_STATES_MAX 2 struct perf_sched { int max_weight; int max_events; int max_gp; int saved_states; struct event_constraint **constraints; struct sched_state state; struct sched_state saved[SCHED_STATES_MAX]; }; /* * Initialize iterator that runs through all events and counters. */ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, int num, int wmin, int wmax, int gpmax) { int idx; memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; sched->max_gp = gpmax; sched->constraints = constraints; for (idx = 0; idx < num; idx++) { if (constraints[idx]->weight == wmin) break; } sched->state.event = idx; /* start with min weight */ sched->state.weight = wmin; sched->state.unassigned = num; } static void perf_sched_save_state(struct perf_sched *sched) { if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) return; sched->saved[sched->saved_states] = sched->state; sched->saved_states++; } static bool perf_sched_restore_state(struct perf_sched *sched) { if (!sched->saved_states) return false; sched->saved_states--; sched->state = sched->saved[sched->saved_states]; /* this assignment didn't work out */ /* XXX broken vs EVENT_PAIR */ sched->state.used &= ~BIT_ULL(sched->state.counter); /* try the next one */ sched->state.counter++; return true; } /* * Select a counter for the current event to schedule. Return true on * success. */ static bool __perf_sched_find_counter(struct perf_sched *sched) { struct event_constraint *c; int idx; if (!sched->state.unassigned) return false; if (sched->state.event >= sched->max_events) return false; c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { idx = INTEL_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { u64 mask = BIT_ULL(idx); if (sched->state.used & mask) continue; sched->state.used |= mask; goto done; } } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { u64 mask = BIT_ULL(idx); if (c->flags & PERF_X86_EVENT_PAIR) mask |= mask << 1; if (sched->state.used & mask) continue; if (sched->state.nr_gp++ >= sched->max_gp) return false; sched->state.used |= mask; goto done; } return false; done: sched->state.counter = idx; if (c->overlap) perf_sched_save_state(sched); return true; } static bool perf_sched_find_counter(struct perf_sched *sched) { while (!__perf_sched_find_counter(sched)) { if (!perf_sched_restore_state(sched)) return false; } return true; } /* * Go through all unassigned events and find the next one to schedule. * Take events with the least weight first. Return true on success. */ static bool perf_sched_next_event(struct perf_sched *sched) { struct event_constraint *c; if (!sched->state.unassigned || !--sched->state.unassigned) return false; do { /* next event */ sched->state.event++; if (sched->state.event >= sched->max_events) { /* next weight */ sched->state.event = 0; sched->state.weight++; if (sched->state.weight > sched->max_weight) return false; } c = sched->constraints[sched->state.event]; } while (c->weight != sched->state.weight); sched->state.counter = 0; /* start with first counter */ return true; } /* * Assign a counter for each event. */ int perf_assign_events(struct event_constraint **constraints, int n, int wmin, int wmax, int gpmax, int *assign) { struct perf_sched sched; perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); do { if (!perf_sched_find_counter(&sched)) break; /* failed */ if (assign) assign[sched.state.event] = sched.state.counter; } while (perf_sched_next_event(&sched)); return sched.state.unassigned; } EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { struct event_constraint *c; struct perf_event *e; int n0, i, wmin, wmax, unsched = 0; struct hw_perf_event *hwc; u64 used_mask = 0; /* * Compute the number of events already present; see x86_pmu_add(), * validate_group() and x86_pmu_commit_txn(). For the former two * cpuc->n_events hasn't been updated yet, while for the latter * cpuc->n_txn contains the number of events added in the current * transaction. */ n0 = cpuc->n_events; if (cpuc->txn_flags & PERF_PMU_TXN_ADD) n0 -= cpuc->n_txn; static_call_cond(x86_pmu_start_scheduling)(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { c = cpuc->event_constraint[i]; /* * Previously scheduled events should have a cached constraint, * while new events should not have one. */ WARN_ON_ONCE((c && i >= n0) || (!c && i < n0)); /* * Request constraints for new events; or for those events that * have a dynamic constraint -- for those the constraint can * change due to external factors (sibling state, allow_tfa). */ if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) { c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]); cpuc->event_constraint[i] = c; } wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); } /* * fastpath, try to reuse previous register */ for (i = 0; i < n; i++) { u64 mask; hwc = &cpuc->event_list[i]->hw; c = cpuc->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) break; /* constraint still honored */ if (!test_bit(hwc->idx, c->idxmsk)) break; mask = BIT_ULL(hwc->idx); if (is_counter_pair(hwc)) mask |= mask << 1; /* not already used */ if (used_mask & mask) break; used_mask |= mask; if (assign) assign[i] = hwc->idx; } /* slow path */ if (i != n) { int gpmax = x86_pmu_max_num_counters(cpuc->pmu); /* * Do not allow scheduling of more than half the available * generic counters. * * This helps avoid counter starvation of sibling thread by * ensuring at most half the counters cannot be in exclusive * mode. There is no designated counters for the limits. Any * N/2 counters can be used. This helps with events with * specific counter constraints. */ if (is_ht_workaround_enabled() && !cpuc->is_fake && READ_ONCE(cpuc->excl_cntrs->exclusive_present)) gpmax /= 2; /* * Reduce the amount of available counters to allow fitting * the extra Merge events needed by large increment events. */ if (x86_pmu.flags & PMU_FL_PAIR) { gpmax -= cpuc->n_pair; WARN_ON(gpmax <= 0); } unsched = perf_assign_events(cpuc->event_constraint, n, wmin, wmax, gpmax, assign); } /* * In case of success (unsched = 0), mark events as committed, * so we do not put_constraint() in case new events are added * and fail to be scheduled * * We invoke the lower level commit callback to lock the resource * * We do not need to do all of this in case we are called to * validate an event group (assign == NULL) */ if (!unsched && assign) { for (i = 0; i < n; i++) static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]); } else { for (i = n0; i < n; i++) { e = cpuc->event_list[i]; /* * release events that failed scheduling */ static_call_cond(x86_pmu_put_event_constraints)(cpuc, e); cpuc->event_constraint[i] = NULL; } } static_call_cond(x86_pmu_stop_scheduling)(cpuc); return unsched ? -EINVAL : 0; } static int add_nr_metric_event(struct cpu_hw_events *cpuc, struct perf_event *event) { if (is_metric_event(event)) { if (cpuc->n_metric == INTEL_TD_METRIC_NUM) return -EINVAL; cpuc->n_metric++; cpuc->n_txn_metric++; } return 0; } static void del_nr_metric_event(struct cpu_hw_events *cpuc, struct perf_event *event) { if (is_metric_event(event)) cpuc->n_metric--; } static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, int max_count, int n) { union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) return -EINVAL; if (n >= max_count + cpuc->n_metric) return -EINVAL; cpuc->event_list[n] = event; if (is_counter_pair(&event->hw)) { cpuc->n_pair++; cpuc->n_txn_pair++; } return 0; } /* * dogrp: true if must collect siblings events (group) * returns total number of events and error code */ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { struct perf_event *event; int n, max_count; max_count = x86_pmu_num_counters(cpuc->pmu) + x86_pmu_num_counters_fixed(cpuc->pmu); /* current number of events already accepted */ n = cpuc->n_events; if (!cpuc->n_events) cpuc->pebs_output = 0; if (!cpuc->is_fake && leader->attr.precise_ip) { /* * For PEBS->PT, if !aux_event, the group leader (PT) went * away, the group was broken down and this singleton event * can't schedule any more. */ if (is_pebs_pt(leader) && !leader->aux_event) return -EINVAL; /* * pebs_output: 0: no PEBS so far, 1: PT, 2: DS */ if (cpuc->pebs_output && cpuc->pebs_output != is_pebs_pt(leader) + 1) return -EINVAL; cpuc->pebs_output = is_pebs_pt(leader) + 1; } if (is_x86_event(leader)) { if (collect_event(cpuc, leader, max_count, n)) return -EINVAL; n++; } if (!dogrp) return n; for_each_sibling_event(event, leader) { if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF) continue; if (collect_event(cpuc, event, max_count, n)) return -EINVAL; n++; } return n; } static inline void x86_assign_hw_event(struct perf_event *event, struct cpu_hw_events *cpuc, int i) { struct hw_perf_event *hwc = &event->hw; int idx; idx = hwc->idx = cpuc->assign[i]; hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; static_call_cond(x86_pmu_assign)(event, idx); switch (hwc->idx) { case INTEL_PMC_IDX_FIXED_BTS: case INTEL_PMC_IDX_FIXED_VLBR: hwc->config_base = 0; hwc->event_base = 0; break; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: /* All the metric events are mapped onto the fixed counter 3. */ idx = INTEL_PMC_IDX_FIXED_SLOTS; fallthrough; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; hwc->event_base = x86_pmu_fixed_ctr_addr(idx - INTEL_PMC_IDX_FIXED); hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | INTEL_PMC_FIXED_RDPMC_BASE; break; default: hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); break; } } /** * x86_perf_rdpmc_index - Return PMC counter used for event * @event: the perf_event to which the PMC counter was assigned * * The counter assigned to this performance event may change if interrupts * are enabled. This counter should thus never be used while interrupts are * enabled. Before this function is used to obtain the assigned counter the * event should be checked for validity using, for example, * perf_event_read_local(), within the same interrupt disabled section in * which this counter is planned to be used. * * Return: The index of the performance monitoring counter assigned to * @perf_event. */ int x86_perf_rdpmc_index(struct perf_event *event) { lockdep_assert_irqs_disabled(); return event->hw.event_base_rdpmc; } static inline int match_prev_assignment(struct hw_perf_event *hwc, struct cpu_hw_events *cpuc, int i) { return hwc->idx == cpuc->assign[i] && hwc->last_cpu == smp_processor_id() && hwc->last_tag == cpuc->tags[i]; } static void x86_pmu_start(struct perf_event *event, int flags); static void x86_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *event; struct hw_perf_event *hwc; int i, added = cpuc->n_added; if (!x86_pmu_initialized()) return; if (cpuc->enabled) return; if (cpuc->n_added) { int n_running = cpuc->n_events - cpuc->n_added; /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() * * step1: save events moving to new counters */ for (i = 0; i < n_running; i++) { event = cpuc->event_list[i]; hwc = &event->hw; /* * we can avoid reprogramming counter if: * - assigned same counter as last time * - running on same CPU as last time * - no other event has used the counter since */ if (hwc->idx == -1 || match_prev_assignment(hwc, cpuc, i)) continue; /* * Ensure we don't accidentally enable a stopped * counter simply because we rescheduled. */ if (hwc->state & PERF_HES_STOPPED) hwc->state |= PERF_HES_ARCH; x86_pmu_stop(event, PERF_EF_UPDATE); } /* * step2: reprogram moved events into new counters */ for (i = 0; i < cpuc->n_events; i++) { event = cpuc->event_list[i]; hwc = &event->hw; if (!match_prev_assignment(hwc, cpuc, i)) x86_assign_hw_event(event, cpuc, i); else if (i < n_running) continue; if (hwc->state & PERF_HES_ARCH) continue; /* * if cpuc->enabled = 0, then no wrmsr as * per x86_pmu_enable_event() */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; perf_events_lapic_init(); } cpuc->enabled = 1; barrier(); static_call(x86_pmu_enable_all)(added); } DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. * To be called with the event disabled in hw: */ int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; if (unlikely(!hwc->event_base)) return 0; /* * If we are way outside a reasonable range then just skip forward: */ if (unlikely(left <= -period)) { left = period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } /* * Quirk: certain CPUs dont like it if just 1 hw_event is left: */ if (unlikely(left < 2)) left = 2; if (left > x86_pmu.max_period) left = x86_pmu.max_period; static_call_cond(x86_pmu_limit_period)(event, &left); this_cpu_write(pmc_prev_left[idx], left); /* * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ local64_set(&hwc->prev_count, (u64)-left); wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* * Sign extend the Merge event counter's upper 16 bits since * we currently declare a 48-bit counter width */ if (is_counter_pair(hwc)) wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); perf_event_update_userpage(event); return ret; } void x86_pmu_enable_event(struct perf_event *event) { if (__this_cpu_read(cpu_hw_events.enabled)) __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } /* * Add a single event to the PMU. * * The event is added to the group of enabled events * but only if it can be scheduled with existing events. */ static int x86_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc; int assign[X86_PMC_IDX_MAX]; int n, n0, ret; hwc = &event->hw; n0 = cpuc->n_events; ret = n = collect_events(cpuc, event, false); if (ret < 0) goto out; hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; if (!(flags & PERF_EF_START)) hwc->state |= PERF_HES_ARCH; /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole. * * If commit fails, we'll call ->del() on all events * for which ->add() was called. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) goto out; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); done_collect: /* * Commit the collect_events() state. See x86_pmu_del() and * x86_pmu_*_txn(). */ cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; /* * This is before x86_pmu_enable() will call x86_pmu_start(), * so we enable LBRs before an event needs them etc.. */ static_call_cond(x86_pmu_add)(event); ret = 0; out: return ret; } static void x86_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx = event->hw.idx; if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) return; if (WARN_ON_ONCE(idx == -1)) return; if (flags & PERF_EF_RELOAD) { WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); static_call(x86_pmu_set_period)(event); } event->hw.state = 0; cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); } void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; unsigned long *cntr_mask, *fixed_cntr_mask; struct event_constraint *pebs_constraints; struct cpu_hw_events *cpuc; u64 pebs, debugctl; int cpu, idx; guard(irqsave)(); cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_events, cpu); cntr_mask = hybrid(cpuc->pmu, cntr_mask); fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask); pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); if (!*(u64 *)cntr_mask) return; if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); pr_info("\n"); pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); if (pebs_constraints) { rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); } if (x86_pmu.lbr_nr) { rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); } } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for_each_set_bit(idx, cntr_mask, X86_PMC_IDX_MAX) { rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); rdmsrl(x86_pmu_event_addr(idx), pmc_count); prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); pr_info("CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; rdmsrl(x86_pmu_fixed_ctr_addr(idx), pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } } void x86_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; if (test_bit(hwc->idx, cpuc->active_mask)) { static_call(x86_pmu_disable)(event); __clear_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { /* * Drain the remaining delta count out of a event * that we are disabling: */ static_call(x86_pmu_update)(event); hwc->state |= PERF_HES_UPTODATE; } } static void x86_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); int i; /* * If we're called during a txn, we only need to undo x86_pmu.add. * The events never got scheduled and ->cancel_txn will truncate * the event_list. * * XXX assumes any ->del() called during a TXN will only be on * an event added during that same TXN. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto do_del; __set_bit(event->hw.idx, cpuc->dirty); /* * Not a TXN, therefore cleanup properly. */ x86_pmu_stop(event, PERF_EF_UPDATE); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) break; } if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ return; /* If we have a newly added event; make sure to decrease n_added. */ if (i >= cpuc->n_events - cpuc->n_added) --cpuc->n_added; static_call_cond(x86_pmu_put_event_constraints)(cpuc, event); /* Delete the array entry. */ while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; cpuc->assign[i-1] = cpuc->assign[i]; } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; if (intel_cap.perf_metrics) del_nr_metric_event(cpuc, event); perf_event_update_userpage(event); do_del: /* * This is after x86_pmu_stop(); so we disable LBRs after any * event can need them etc.. */ static_call_cond(x86_pmu_del)(event); } int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; struct perf_event *event; int idx, handled = 0; u64 val; cpuc = this_cpu_ptr(&cpu_hw_events); /* * Some chipsets need to unmask the LVTPC in a particular spot * inside the nmi handler. As a result, the unmasking was pushed * into all the nmi handlers. * * This generic handler doesn't seem to have any issues where the * unmasking occurs so it was left at the top. */ apic_write(APIC_LVTPC, APIC_DM_NMI); for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { if (!test_bit(idx, cpuc->active_mask)) continue; event = cpuc->events[idx]; val = static_call(x86_pmu_update)(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) continue; /* * event overflow */ handled++; if (!static_call(x86_pmu_set_period)(event)) continue; perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } if (handled) inc_irq_stat(apic_perf_irqs); return handled; } void perf_events_lapic_init(void) { if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); } static int perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { u64 start_clock; u64 finish_clock; int ret; /* * All PMUs/events that share this PMI handler should make sure to * increment active_events for their events. */ if (!atomic_read(&active_events)) return NMI_DONE; start_clock = sched_clock(); ret = static_call(x86_pmu_handle_irq)(regs); finish_clock = sched_clock(); perf_sample_event_took(finish_clock - start_clock); return ret; } NOKPROBE_SYMBOL(perf_event_nmi_handler); struct event_constraint emptyconstraint; struct event_constraint unconstrained; static int x86_pmu_prepare_cpu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int i; for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) cpuc->kfree_on_online[i] = NULL; if (x86_pmu.cpu_prepare) return x86_pmu.cpu_prepare(cpu); return 0; } static int x86_pmu_dead_cpu(unsigned int cpu) { if (x86_pmu.cpu_dead) x86_pmu.cpu_dead(cpu); return 0; } static int x86_pmu_online_cpu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int i; for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { kfree(cpuc->kfree_on_online[i]); cpuc->kfree_on_online[i] = NULL; } return 0; } static int x86_pmu_starting_cpu(unsigned int cpu) { if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); return 0; } static int x86_pmu_dying_cpu(unsigned int cpu) { if (x86_pmu.cpu_dying) x86_pmu.cpu_dying(cpu); return 0; } static void __init pmu_check_apic(void) { if (boot_cpu_has(X86_FEATURE_APIC)) return; x86_pmu.apic = 0; pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); pr_info("no hardware sampling interrupt available.\n"); /* * If we have a PMU initialized but no APIC * interrupts, we cannot sample hardware * events (user-space has to fall back and * sample via a hrtimer based software event): */ pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; } static struct attribute_group x86_pmu_format_group __ro_after_init = { .name = "format", .attrs = NULL, }; ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); u64 config = 0; if (pmu_attr->id < x86_pmu.max_events) config = x86_pmu.event_map(pmu_attr->id); /* string trumps id */ if (pmu_attr->event_str) return sprintf(page, "%s\n", pmu_attr->event_str); return x86_pmu.events_sysfs_show(page, config); } EXPORT_SYMBOL_GPL(events_sysfs_show); ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_ht_attr *pmu_attr = container_of(attr, struct perf_pmu_events_ht_attr, attr); /* * Report conditional events depending on Hyper-Threading. * * This is overly conservative as usually the HT special * handling is not needed if the other CPU thread is idle. * * Note this does not (and cannot) handle the case when thread * siblings are invisible, for example with virtualization * if they are owned by some other guest. The user tool * has to re-read when a thread sibling gets onlined later. */ return sprintf(page, "%s", topology_max_smt_threads() > 1 ? pmu_attr->event_str_ht : pmu_attr->event_str_noht); } ssize_t events_hybrid_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_hybrid_attr *pmu_attr = container_of(attr, struct perf_pmu_events_hybrid_attr, attr); struct x86_hybrid_pmu *pmu; const char *str, *next_str; int i; if (hweight64(pmu_attr->pmu_type) == 1) return sprintf(page, "%s", pmu_attr->event_str); /* * Hybrid PMUs may support the same event name, but with different * event encoding, e.g., the mem-loads event on an Atom PMU has * different event encoding from a Core PMU. * * The event_str includes all event encodings. Each event encoding * is divided by ";". The order of the event encodings must follow * the order of the hybrid PMU index. */ pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); str = pmu_attr->event_str; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type)) continue; if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) { next_str = strchr(str, ';'); if (next_str) return snprintf(page, next_str - str + 1, "%s", str); else return sprintf(page, "%s", str); } str = strchr(str, ';'); str++; } return 0; } EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show); EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); EVENT_ATTR(cache-references, CACHE_REFERENCES ); EVENT_ATTR(cache-misses, CACHE_MISSES ); EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); EVENT_ATTR(branch-misses, BRANCH_MISSES ); EVENT_ATTR(bus-cycles, BUS_CYCLES ); EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); static struct attribute *empty_attrs; static struct attribute *events_attr[] = { EVENT_PTR(CPU_CYCLES), EVENT_PTR(INSTRUCTIONS), EVENT_PTR(CACHE_REFERENCES), EVENT_PTR(CACHE_MISSES), EVENT_PTR(BRANCH_INSTRUCTIONS), EVENT_PTR(BRANCH_MISSES), EVENT_PTR(BUS_CYCLES), EVENT_PTR(STALLED_CYCLES_FRONTEND), EVENT_PTR(STALLED_CYCLES_BACKEND), EVENT_PTR(REF_CPU_CYCLES), NULL, }; /* * Remove all undefined events (x86_pmu.event_map(id) == 0) * out of events_attr attributes. */ static umode_t is_visible(struct kobject *kobj, struct attribute *attr, int idx) { struct perf_pmu_events_attr *pmu_attr; if (idx >= x86_pmu.max_events) return 0; pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr); /* str trumps id */ return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0; } static struct attribute_group x86_pmu_events_group __ro_after_init = { .name = "events", .attrs = events_attr, .is_visible = is_visible, }; ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) { u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); ssize_t ret; /* * We have whole page size to spend and just little data * to write, so we can safely use sprintf. */ ret = sprintf(page, "event=0x%02llx", event); if (umask) ret += sprintf(page + ret, ",umask=0x%02llx", umask); if (edge) ret += sprintf(page + ret, ",edge"); if (pc) ret += sprintf(page + ret, ",pc"); if (any) ret += sprintf(page + ret, ",any"); if (inv) ret += sprintf(page + ret, ",inv"); if (cmask) ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); ret += sprintf(page + ret, "\n"); return ret; } static struct attribute_group x86_pmu_attr_group; static struct attribute_group x86_pmu_caps_group; static void x86_pmu_static_call_update(void) { static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq); static_call_update(x86_pmu_disable_all, x86_pmu.disable_all); static_call_update(x86_pmu_enable_all, x86_pmu.enable_all); static_call_update(x86_pmu_enable, x86_pmu.enable); static_call_update(x86_pmu_disable, x86_pmu.disable); static_call_update(x86_pmu_assign, x86_pmu.assign); static_call_update(x86_pmu_add, x86_pmu.add); static_call_update(x86_pmu_del, x86_pmu.del); static_call_update(x86_pmu_read, x86_pmu.read); static_call_update(x86_pmu_set_period, x86_pmu.set_period); static_call_update(x86_pmu_update, x86_pmu.update); static_call_update(x86_pmu_limit_period, x86_pmu.limit_period); static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling); static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling); static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling); static_call_update(x86_pmu_sched_task, x86_pmu.sched_task); static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx); static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs); static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); static_call_update(x86_pmu_filter, x86_pmu.filter); } static void _x86_pmu_read(struct perf_event *event) { static_call(x86_pmu_update)(event); } void x86_pmu_show_pmu_cap(struct pmu *pmu) { pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu)); pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu)); pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl)); } static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; int err; pr_info("Performance Events: "); switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: err = intel_pmu_init(); break; case X86_VENDOR_AMD: err = amd_pmu_init(); break; case X86_VENDOR_HYGON: err = amd_pmu_init(); x86_pmu.name = "HYGON"; break; case X86_VENDOR_ZHAOXIN: case X86_VENDOR_CENTAUR: err = zhaoxin_pmu_init(); break; default: err = -ENOTSUPP; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); err = 0; goto out_bad_pmu; } pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ if (!check_hw_exists(&pmu, x86_pmu.cntr_mask, x86_pmu.fixed_cntr_mask)) goto out_bad_pmu; pr_cont("%s PMU driver.\n", x86_pmu.name); x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); if (!x86_pmu.intel_ctrl) x86_pmu.intel_ctrl = x86_pmu.cntr_mask64; if (!x86_pmu.config_mask) x86_pmu.config_mask = X86_RAW_EVENT_MASK; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, x86_pmu.cntr_mask64, 0, x86_pmu_num_counters(NULL), 0, 0); x86_pmu_format_group.attrs = x86_pmu.format_attrs; if (!x86_pmu.events_sysfs_show) x86_pmu_events_group.attrs = &empty_attrs; pmu.attr_update = x86_pmu.attr_update; if (!is_hybrid()) x86_pmu_show_pmu_cap(NULL); if (!x86_pmu.read) x86_pmu.read = _x86_pmu_read; if (!x86_pmu.guest_get_msrs) x86_pmu.guest_get_msrs = (void *)&__static_call_return0; if (!x86_pmu.set_period) x86_pmu.set_period = x86_perf_event_set_period; if (!x86_pmu.update) x86_pmu.update = x86_perf_event_update; x86_pmu_static_call_update(); /* * Install callbacks. Core will call them for each online * cpu. */ err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare", x86_pmu_prepare_cpu, x86_pmu_dead_cpu); if (err) return err; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING, "perf/x86:starting", x86_pmu_starting_cpu, x86_pmu_dying_cpu); if (err) goto out; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online", x86_pmu_online_cpu, NULL); if (err) goto out1; if (!is_hybrid()) { err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); if (err) goto out2; } else { struct x86_hybrid_pmu *hybrid_pmu; int i, j; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { hybrid_pmu = &x86_pmu.hybrid_pmu[i]; hybrid_pmu->pmu = pmu; hybrid_pmu->pmu.type = -1; hybrid_pmu->pmu.attr_update = x86_pmu.attr_update; hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, (hybrid_pmu->pmu_type == hybrid_big) ? PERF_TYPE_RAW : -1); if (err) break; } if (i < x86_pmu.num_hybrid_pmus) { for (j = 0; j < i; j++) perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu); pr_warn("Failed to register hybrid PMUs\n"); kfree(x86_pmu.hybrid_pmu); x86_pmu.hybrid_pmu = NULL; x86_pmu.num_hybrid_pmus = 0; goto out2; } } return 0; out2: cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE); out1: cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); out: cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); out_bad_pmu: memset(&x86_pmu, 0, sizeof(x86_pmu)); return err; } early_initcall(init_hw_perf_events); static void x86_pmu_read(struct perf_event *event) { static_call(x86_pmu_read)(event); } /* * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time * * We only support PERF_PMU_TXN_ADD transactions. Save the * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD * transactions. */ static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ cpuc->txn_flags = txn_flags; if (txn_flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_disable(pmu); __this_cpu_write(cpu_hw_events.n_txn, 0); __this_cpu_write(cpu_hw_events.n_txn_pair, 0); __this_cpu_write(cpu_hw_events.n_txn_metric, 0); } /* * Stop group events scheduling transaction * Clear the flag and pmu::enable() will perform the * schedulability test. */ static void x86_pmu_cancel_txn(struct pmu *pmu) { unsigned int txn_flags; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ txn_flags = cpuc->txn_flags; cpuc->txn_flags = 0; if (txn_flags & ~PERF_PMU_TXN_ADD) return; /* * Truncate collected array by the number of events added in this * transaction. See x86_pmu_add() and x86_pmu_*_txn(). */ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair)); __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric)); perf_pmu_enable(pmu); } /* * Commit group events scheduling transaction * Perform the group schedulability test as a whole * Return 0 if success * * Does not cancel the transaction on failure; expects the caller to do this. */ static int x86_pmu_commit_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int assign[X86_PMC_IDX_MAX]; int n, ret; WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { cpuc->txn_flags = 0; return 0; } n = cpuc->n_events; if (!x86_pmu_initialized()) return -EAGAIN; ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) return ret; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; } /* * a fake_cpuc is used to validate event groups. Due to * the extra reg logic, we need to also allocate a fake * per_core and per_cpu structure. Otherwise, group events * using extra reg may conflict without the kernel being * able to catch this when the last event gets added to * the group. */ static void free_fake_cpuc(struct cpu_hw_events *cpuc) { intel_cpuc_finish(cpuc); kfree(cpuc); } static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu) { struct cpu_hw_events *cpuc; int cpu; cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); if (!cpuc) return ERR_PTR(-ENOMEM); cpuc->is_fake = 1; if (is_hybrid()) { struct x86_hybrid_pmu *h_pmu; h_pmu = hybrid_pmu(event_pmu); if (cpumask_empty(&h_pmu->supported_cpus)) goto error; cpu = cpumask_first(&h_pmu->supported_cpus); } else cpu = raw_smp_processor_id(); cpuc->pmu = event_pmu; if (intel_cpuc_prepare(cpuc, cpu)) goto error; return cpuc; error: free_fake_cpuc(cpuc); return ERR_PTR(-ENOMEM); } /* * validate that we can schedule this event */ static int validate_event(struct perf_event *event) { struct cpu_hw_events *fake_cpuc; struct event_constraint *c; int ret = 0; fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); c = x86_pmu.get_event_constraints(fake_cpuc, 0, event); if (!c || !c->weight) ret = -EINVAL; if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(fake_cpuc, event); free_fake_cpuc(fake_cpuc); return ret; } /* * validate a single event group * * validation include: * - check events are compatible which each other * - events do not compete for the same counter * - number of events <= number of counters * * validation ensures the group can be loaded onto the * PMU if it was the only group available. */ static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct cpu_hw_events *fake_cpuc; int ret = -EINVAL, n; /* * Reject events from different hybrid PMUs. */ if (is_hybrid()) { struct perf_event *sibling; struct pmu *pmu = NULL; if (is_x86_event(leader)) pmu = leader->pmu; for_each_sibling_event(sibling, leader) { if (!is_x86_event(sibling)) continue; if (!pmu) pmu = sibling->pmu; else if (pmu != sibling->pmu) return ret; } } fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); /* * the event is not yet connected with its * siblings therefore we must first collect * existing siblings, then add the new event * before we can simulate the scheduling */ n = collect_events(fake_cpuc, leader, true); if (n < 0) goto out; fake_cpuc->n_events = n; n = collect_events(fake_cpuc, event, false); if (n < 0) goto out; fake_cpuc->n_events = 0; ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); out: free_fake_cpuc(fake_cpuc); return ret; } static int x86_pmu_event_init(struct perf_event *event) { struct x86_hybrid_pmu *pmu = NULL; int err; if ((event->attr.type != event->pmu->type) && (event->attr.type != PERF_TYPE_HARDWARE) && (event->attr.type != PERF_TYPE_HW_CACHE)) return -ENOENT; if (is_hybrid() && (event->cpu != -1)) { pmu = hybrid_pmu(event->pmu); if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus)) return -ENOENT; } err = __x86_pmu_event_init(event); if (!err) { if (event->group_leader != event) err = validate_group(event); else err = validate_event(event); } if (err) { if (event->destroy) event->destroy(event); event->destroy = NULL; } if (READ_ONCE(x86_pmu.attr_rdpmc) && !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; return err; } void perf_clear_dirty_counters(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; /* Don't need to clear the assigned counter. */ for (i = 0; i < cpuc->n_events; i++) __clear_bit(cpuc->assign[i], cpuc->dirty); if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX)) return; for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) { if (i >= INTEL_PMC_IDX_FIXED) { /* Metrics and fake events don't have corresponding HW counters. */ if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask))) continue; wrmsrl(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0); } else { wrmsrl(x86_pmu_event_addr(i), 0); } } bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX); } static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; /* * This function relies on not being called concurrently in two * tasks in the same mm. Otherwise one task could observe * perf_rdpmc_allowed > 1 and return all the way back to * userspace with CR4.PCE clear while another task is still * doing on_each_cpu_mask() to propagate CR4.PCE. * * For now, this can't happen because all callers hold mmap_lock * for write. If this changes, we'll need a different solution. */ mmap_assert_write_locked(mm); if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) return 0; if (is_metric_idx(hwc->idx)) return INTEL_PMC_FIXED_RDPMC_METRICS + 1; else return hwc->event_base_rdpmc + 1; } static ssize_t get_attr_rdpmc(struct device *cdev, struct device_attribute *attr, char *buf) { return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); } static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) { static DEFINE_MUTEX(rdpmc_mutex); unsigned long val; ssize_t ret; ret = kstrtoul(buf, 0, &val); if (ret) return ret; if (val > 2) return -EINVAL; if (x86_pmu.attr_rdpmc_broken) return -ENOTSUPP; guard(mutex)(&rdpmc_mutex); if (val != x86_pmu.attr_rdpmc) { /* * Changing into or out of never available or always available, * aka perf-event-bypassing mode. This path is extremely slow, * but only root can trigger it, so it's okay. */ if (val == 0) static_branch_inc(&rdpmc_never_available_key); else if (x86_pmu.attr_rdpmc == 0) static_branch_dec(&rdpmc_never_available_key); if (val == 2) static_branch_inc(&rdpmc_always_available_key); else if (x86_pmu.attr_rdpmc == 2) static_branch_dec(&rdpmc_always_available_key); on_each_cpu(cr4_update_pce, NULL, 1); x86_pmu.attr_rdpmc = val; } return count; } static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); static struct attribute *x86_pmu_attrs[] = { &dev_attr_rdpmc.attr, NULL, }; static struct attribute_group x86_pmu_attr_group __ro_after_init = { .attrs = x86_pmu_attrs, }; static ssize_t max_precise_show(struct device *cdev, struct device_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); } static DEVICE_ATTR_RO(max_precise); static struct attribute *x86_pmu_caps_attrs[] = { &dev_attr_max_precise.attr, NULL }; static struct attribute_group x86_pmu_caps_group __ro_after_init = { .name = "caps", .attrs = x86_pmu_caps_attrs, }; static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, &x86_pmu_events_group, &x86_pmu_caps_group, NULL, }; static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in); } static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc, struct perf_event_pmu_context *next_epc) { static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc); } void perf_check_microcode(void) { if (x86_pmu.check_microcode) x86_pmu.check_microcode(); } static int x86_pmu_check_period(struct perf_event *event, u64 value) { if (x86_pmu.check_period && x86_pmu.check_period(event, value)) return -EINVAL; if (value && x86_pmu.limit_period) { s64 left = value; x86_pmu.limit_period(event, &left); if (left > value) return -EINVAL; } return 0; } static int x86_pmu_aux_output_match(struct perf_event *event) { if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT)) return 0; if (x86_pmu.aux_output_match) return x86_pmu.aux_output_match(event); return 0; } static bool x86_pmu_filter(struct pmu *pmu, int cpu) { bool ret = false; static_call_cond(x86_pmu_filter)(pmu, cpu, &ret); return ret; } static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, .attr_groups = x86_pmu_attr_groups, .event_init = x86_pmu_event_init, .event_mapped = x86_pmu_event_mapped, .event_unmapped = x86_pmu_event_unmapped, .add = x86_pmu_add, .del = x86_pmu_del, .start = x86_pmu_start, .stop = x86_pmu_stop, .read = x86_pmu_read, .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, .swap_task_ctx = x86_pmu_swap_task_ctx, .check_period = x86_pmu_check_period, .aux_output_match = x86_pmu_aux_output_match, .filter = x86_pmu_filter, }; void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { struct cyc2ns_data data; u64 offset; userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT); userpg->pmc_width = x86_pmu.cntval_bits; if (!using_native_sched_clock() || !sched_clock_stable()) return; cyc2ns_read_begin(&data); offset = data.cyc2ns_offset + __sched_clock_offset; /* * Internal timekeeping for enabled/running/stopped times * is always in the local_clock domain. */ userpg->cap_user_time = 1; userpg->time_mult = data.cyc2ns_mul; userpg->time_shift = data.cyc2ns_shift; userpg->time_offset = offset - now; /* * cap_user_time_zero doesn't make sense when we're using a different * time base for the records. */ if (!event->attr.use_clockid) { userpg->cap_user_time_zero = 1; userpg->time_zero = offset; } cyc2ns_read_end(); } /* * Determine whether the regs were taken from an irq/exception handler rather * than from perf_arch_fetch_caller_regs(). */ static bool perf_hw_regs(struct pt_regs *regs) { return regs->flags & X86_EFLAGS_FIXED; } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct unwind_state state; unsigned long addr; if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } if (perf_callchain_store(entry, regs->ip)) return; if (perf_hw_regs(regs)) unwind_start(&state, current, regs, NULL); else unwind_start(&state, current, NULL, (void *)regs->sp); for (; !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr || perf_callchain_store(entry, addr)) return; } } static inline int valid_user_frame(const void __user *fp, unsigned long size) { return __access_ok(fp, size); } static unsigned long get_segment_base(unsigned int segment) { struct desc_struct *desc; unsigned int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; /* IRQs are off, so this synchronizes with smp_store_release */ ldt = READ_ONCE(current->active_mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; desc = &ldt->entries[idx]; #else return 0; #endif } else { if (idx >= GDT_ENTRIES) return 0; desc = raw_cpu_ptr(gdt_page.gdt) + idx; } return get_desc_base(desc); } #ifdef CONFIG_UPROBES /* * Heuristic-based check if uprobe is installed at the function entry. * * Under assumption of user code being compiled with frame pointers, * `push %rbp/%ebp` is a good indicator that we indeed are. * * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. * If we get this wrong, captured stack trace might have one extra bogus * entry, but the rest of stack trace will still be meaningful. */ static bool is_uprobe_at_func_entry(struct pt_regs *regs) { struct arch_uprobe *auprobe; if (!current->utask) return false; auprobe = current->utask->auprobe; if (!auprobe) return false; /* push %rbp/%ebp */ if (auprobe->insn[0] == 0x55) return true; /* endbr64 (64-bit only) */ if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn)) return true; return false; } #else static bool is_uprobe_at_func_entry(struct pt_regs *regs) { return false; } #endif /* CONFIG_UPROBES */ #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { /* 32-bit process in 64-bit kernel. */ unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; const struct stack_frame_ia32 __user *fp; u32 ret_addr; if (user_64bit_mode(regs)) return 0; cs_base = get_segment_base(regs->cs); ss_base = get_segment_base(regs->ss); fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); /* see perf_callchain_user() below for why we do this */ if (is_uprobe_at_func_entry(regs) && !get_user(ret_addr, (const u32 __user *)regs->sp)) perf_callchain_store(entry, ret_addr); while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; if (__get_user(frame.next_frame, &fp->next_frame)) break; if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, cs_base + frame.return_address); fp = compat_ptr(ss_base + frame.next_frame); } pagefault_enable(); return 1; } #else static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { return 0; } #endif void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; const struct stack_frame __user *fp; unsigned long ret_addr; if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } /* * We don't know what to do with VM86 stacks.. ignore them for now. */ if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) return; fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); if (!nmi_uaccess_okay()) return; if (perf_callchain_user32(regs, entry)) return; pagefault_disable(); /* * If we are called from uprobe handler, and we are indeed at the very * entry to user function (which is normally a `push %rbp` instruction, * under assumption of application being compiled with frame pointers), * we should read return address from *regs->sp before proceeding * to follow frame pointers, otherwise we'll skip immediate caller * as %rbp is not yet setup. */ if (is_uprobe_at_func_entry(regs) && !get_user(ret_addr, (const unsigned long __user *)regs->sp)) perf_callchain_store(entry, ret_addr); while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; if (__get_user(frame.next_frame, &fp->next_frame)) break; if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, frame.return_address); fp = (void __user *)frame.next_frame; } pagefault_enable(); } /* * Deal with code segment offsets for the various execution modes: * * VM86 - the good olde 16 bit days, where the linear address is * 20 bits and we use regs->ip + 0x10 * regs->cs. * * IA32 - Where we need to look at GDT/LDT segment descriptor tables * to figure out what the 32bit base address is. * * X32 - has TIF_X32 set, but is running in x86_64 * * X86_64 - CS,DS,SS,ES are all zero based. */ static unsigned long code_segment_base(struct pt_regs *regs) { /* * For IA32 we look at the GDT/LDT segment base to convert the * effective IP to a linear address. */ #ifdef CONFIG_X86_32 /* * If we are in VM86 mode, add the segment offset to convert to a * linear address. */ if (regs->flags & X86_VM_MASK) return 0x10 * regs->cs; if (user_mode(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else if (user_mode(regs) && !user_64bit_mode(regs) && regs->cs != __USER32_CS) return get_segment_base(regs->cs); #endif return 0; } unsigned long perf_arch_instruction_pointer(struct pt_regs *regs) { return regs->ip + code_segment_base(regs); } static unsigned long common_misc_flags(struct pt_regs *regs) { if (regs->flags & PERF_EFLAGS_EXACT) return PERF_RECORD_MISC_EXACT_IP; return 0; } static unsigned long guest_misc_flags(struct pt_regs *regs) { unsigned long guest_state = perf_guest_state(); if (!(guest_state & PERF_GUEST_ACTIVE)) return 0; if (guest_state & PERF_GUEST_USER) return PERF_RECORD_MISC_GUEST_USER; else return PERF_RECORD_MISC_GUEST_KERNEL; } static unsigned long host_misc_flags(struct pt_regs *regs) { if (user_mode(regs)) return PERF_RECORD_MISC_USER; else return PERF_RECORD_MISC_KERNEL; } unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) { unsigned long flags = common_misc_flags(regs); flags |= guest_misc_flags(regs); return flags; } unsigned long perf_arch_misc_flags(struct pt_regs *regs) { unsigned long flags = common_misc_flags(regs); flags |= host_misc_flags(regs); return flags; } void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { /* This API doesn't currently support enumerating hybrid PMUs. */ if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) || !x86_pmu_initialized()) { memset(cap, 0, sizeof(*cap)); return; } /* * Note, hybrid CPU models get tracked as having hybrid PMUs even when * all E-cores are disabled via BIOS. When E-cores are disabled, the * base PMU holds the correct number of counters for P-cores. */ cap->version = x86_pmu.version; cap->num_counters_gp = x86_pmu_num_counters(NULL); cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL); cap->bit_width_gp = x86_pmu.cntval_bits; cap->bit_width_fixed = x86_pmu.cntval_bits; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; } EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); u64 perf_get_hw_event_config(int hw_event) { int max = x86_pmu.max_events; if (hw_event < max) return x86_pmu.event_map(array_index_nospec(hw_event, max)); return 0; } EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
3 3 3 3 3 3 2 2 2 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" struct devlink_sb { struct list_head list; unsigned int index; u32 size; u16 ingress_pools_count; u16 egress_pools_count; u16 ingress_tc_count; u16 egress_tc_count; }; static u16 devlink_sb_pool_count(struct devlink_sb *devlink_sb) { return devlink_sb->ingress_pools_count + devlink_sb->egress_pools_count; } static struct devlink_sb *devlink_sb_get_by_index(struct devlink *devlink, unsigned int sb_index) { struct devlink_sb *devlink_sb; list_for_each_entry(devlink_sb, &devlink->sb_list, list) { if (devlink_sb->index == sb_index) return devlink_sb; } return NULL; } static bool devlink_sb_index_exists(struct devlink *devlink, unsigned int sb_index) { return devlink_sb_get_by_index(devlink, sb_index); } static struct devlink_sb *devlink_sb_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { if (attrs[DEVLINK_ATTR_SB_INDEX]) { u32 sb_index = nla_get_u32(attrs[DEVLINK_ATTR_SB_INDEX]); struct devlink_sb *devlink_sb; devlink_sb = devlink_sb_get_by_index(devlink, sb_index); if (!devlink_sb) return ERR_PTR(-ENODEV); return devlink_sb; } return ERR_PTR(-EINVAL); } static struct devlink_sb *devlink_sb_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_sb_get_from_attrs(devlink, info->attrs); } static int devlink_sb_pool_index_get_from_attrs(struct devlink_sb *devlink_sb, struct nlattr **attrs, u16 *p_pool_index) { u16 val; if (!attrs[DEVLINK_ATTR_SB_POOL_INDEX]) return -EINVAL; val = nla_get_u16(attrs[DEVLINK_ATTR_SB_POOL_INDEX]); if (val >= devlink_sb_pool_count(devlink_sb)) return -EINVAL; *p_pool_index = val; return 0; } static int devlink_sb_pool_index_get_from_info(struct devlink_sb *devlink_sb, struct genl_info *info, u16 *p_pool_index) { return devlink_sb_pool_index_get_from_attrs(devlink_sb, info->attrs, p_pool_index); } static int devlink_sb_pool_type_get_from_attrs(struct nlattr **attrs, enum devlink_sb_pool_type *p_pool_type) { u8 val; if (!attrs[DEVLINK_ATTR_SB_POOL_TYPE]) return -EINVAL; val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_TYPE]); if (val != DEVLINK_SB_POOL_TYPE_INGRESS && val != DEVLINK_SB_POOL_TYPE_EGRESS) return -EINVAL; *p_pool_type = val; return 0; } static int devlink_sb_pool_type_get_from_info(struct genl_info *info, enum devlink_sb_pool_type *p_pool_type) { return devlink_sb_pool_type_get_from_attrs(info->attrs, p_pool_type); } static int devlink_sb_th_type_get_from_attrs(struct nlattr **attrs, enum devlink_sb_threshold_type *p_th_type) { u8 val; if (!attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]) return -EINVAL; val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]); if (val != DEVLINK_SB_THRESHOLD_TYPE_STATIC && val != DEVLINK_SB_THRESHOLD_TYPE_DYNAMIC) return -EINVAL; *p_th_type = val; return 0; } static int devlink_sb_th_type_get_from_info(struct genl_info *info, enum devlink_sb_threshold_type *p_th_type) { return devlink_sb_th_type_get_from_attrs(info->attrs, p_th_type); } static int devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb, struct nlattr **attrs, enum devlink_sb_pool_type pool_type, u16 *p_tc_index) { u16 val; if (!attrs[DEVLINK_ATTR_SB_TC_INDEX]) return -EINVAL; val = nla_get_u16(attrs[DEVLINK_ATTR_SB_TC_INDEX]); if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS && val >= devlink_sb->ingress_tc_count) return -EINVAL; if (pool_type == DEVLINK_SB_POOL_TYPE_EGRESS && val >= devlink_sb->egress_tc_count) return -EINVAL; *p_tc_index = val; return 0; } static int devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, struct genl_info *info, enum devlink_sb_pool_type pool_type, u16 *p_tc_index) { return devlink_sb_tc_index_get_from_attrs(devlink_sb, info->attrs, pool_type, p_tc_index); } static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, u32 seq, int flags) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_SB_SIZE, devlink_sb->size)) goto nla_put_failure; if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_POOL_COUNT, devlink_sb->ingress_pools_count)) goto nla_put_failure; if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_POOL_COUNT, devlink_sb->egress_pools_count)) goto nla_put_failure; if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_TC_COUNT, devlink_sb->ingress_tc_count)) goto nla_put_failure; if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_TC_COUNT, devlink_sb->egress_tc_count)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_sb *devlink_sb; struct sk_buff *msg; int err; devlink_sb = devlink_sb_get_from_info(devlink, info); if (IS_ERR(devlink_sb)) return PTR_ERR(devlink_sb); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_sb_fill(msg, devlink, devlink_sb, DEVLINK_CMD_SB_NEW, info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; int idx = 0; int err = 0; list_for_each_entry(devlink_sb, &devlink->sb_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_sb_fill(msg, devlink, devlink_sb, DEVLINK_CMD_SB_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_sb_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_sb_get_dump_one); } static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, u16 pool_index, enum devlink_command cmd, u32 portid, u32 seq, int flags) { struct devlink_sb_pool_info pool_info; void *hdr; int err; err = devlink->ops->sb_pool_get(devlink, devlink_sb->index, pool_index, &pool_info); if (err) return err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) goto nla_put_failure; if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_info.pool_type)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_SIZE, pool_info.size)) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE, pool_info.threshold_type)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_CELL_SIZE, pool_info.cell_size)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_sb *devlink_sb; struct sk_buff *msg; u16 pool_index; int err; devlink_sb = devlink_sb_get_from_info(devlink, info); if (IS_ERR(devlink_sb)) return PTR_ERR(devlink_sb); err = devlink_sb_pool_index_get_from_info(devlink_sb, info, &pool_index); if (err) return err; if (!devlink->ops->sb_pool_get) return -EOPNOTSUPP; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index, DEVLINK_CMD_SB_POOL_NEW, info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, struct devlink *devlink, struct devlink_sb *devlink_sb, u32 portid, u32 seq, int flags) { u16 pool_count = devlink_sb_pool_count(devlink_sb); u16 pool_index; int err; for (pool_index = 0; pool_index < pool_count; pool_index++) { if (*p_idx < start) { (*p_idx)++; continue; } err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index, DEVLINK_CMD_SB_POOL_NEW, portid, seq, flags); if (err) return err; (*p_idx)++; } return 0; } static int devlink_nl_sb_pool_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; int err = 0; int idx = 0; if (!devlink->ops->sb_pool_get) return 0; list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_pool_get_dumpit(msg, state->idx, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { state->idx = idx; break; } } return err; } int devlink_nl_sb_pool_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_sb_pool_get_dump_one); } static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index, u16 pool_index, u32 size, enum devlink_sb_threshold_type threshold_type, struct netlink_ext_ack *extack) { const struct devlink_ops *ops = devlink->ops; if (ops->sb_pool_set) return ops->sb_pool_set(devlink, sb_index, pool_index, size, threshold_type, extack); return -EOPNOTSUPP; } int devlink_nl_sb_pool_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; enum devlink_sb_threshold_type threshold_type; struct devlink_sb *devlink_sb; u16 pool_index; u32 size; int err; devlink_sb = devlink_sb_get_from_info(devlink, info); if (IS_ERR(devlink_sb)) return PTR_ERR(devlink_sb); err = devlink_sb_pool_index_get_from_info(devlink_sb, info, &pool_index); if (err) return err; err = devlink_sb_th_type_get_from_info(info, &threshold_type); if (err) return err; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_POOL_SIZE)) return -EINVAL; size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]); return devlink_sb_pool_set(devlink, devlink_sb->index, pool_index, size, threshold_type, info->extack); } static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_port *devlink_port, struct devlink_sb *devlink_sb, u16 pool_index, enum devlink_command cmd, u32 portid, u32 seq, int flags) { const struct devlink_ops *ops = devlink->ops; u32 threshold; void *hdr; int err; err = ops->sb_port_pool_get(devlink_port, devlink_sb->index, pool_index, &threshold); if (err) return err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) goto nla_put_failure; if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold)) goto nla_put_failure; if (ops->sb_occ_port_pool_get) { u32 cur; u32 max; err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index, pool_index, &cur, &max); if (err && err != -EOPNOTSUPP) goto sb_occ_get_failure; if (!err) { if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max)) goto nla_put_failure; } } genlmsg_end(msg, hdr); return 0; nla_put_failure: err = -EMSGSIZE; sb_occ_get_failure: genlmsg_cancel(msg, hdr); return err; } int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; struct devlink_sb *devlink_sb; struct sk_buff *msg; u16 pool_index; int err; devlink_sb = devlink_sb_get_from_info(devlink, info); if (IS_ERR(devlink_sb)) return PTR_ERR(devlink_sb); err = devlink_sb_pool_index_get_from_info(devlink_sb, info, &pool_index); if (err) return err; if (!devlink->ops->sb_port_pool_get) return -EOPNOTSUPP; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_sb_port_pool_fill(msg, devlink, devlink_port, devlink_sb, pool_index, DEVLINK_CMD_SB_PORT_POOL_NEW, info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, struct devlink *devlink, struct devlink_sb *devlink_sb, u32 portid, u32 seq, int flags) { struct devlink_port *devlink_port; u16 pool_count = devlink_sb_pool_count(devlink_sb); unsigned long port_index; u16 pool_index; int err; xa_for_each(&devlink->ports, port_index, devlink_port) { for (pool_index = 0; pool_index < pool_count; pool_index++) { if (*p_idx < start) { (*p_idx)++; continue; } err = devlink_nl_sb_port_pool_fill(msg, devlink, devlink_port, devlink_sb, pool_index, DEVLINK_CMD_SB_PORT_POOL_NEW, portid, seq, flags); if (err) return err; (*p_idx)++; } } return 0; } static int devlink_nl_sb_port_pool_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; int idx = 0; int err = 0; if (!devlink->ops->sb_port_pool_get) return 0; list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_port_pool_get_dumpit(msg, state->idx, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { state->idx = idx; break; } } return err; } int devlink_nl_sb_port_pool_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_sb_port_pool_get_dump_one); } static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack) { const struct devlink_ops *ops = devlink_port->devlink->ops; if (ops->sb_port_pool_set) return ops->sb_port_pool_set(devlink_port, sb_index, pool_index, threshold, extack); return -EOPNOTSUPP; } int devlink_nl_sb_port_pool_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; struct devlink_sb *devlink_sb; u16 pool_index; u32 threshold; int err; devlink_sb = devlink_sb_get_from_info(devlink, info); if (IS_ERR(devlink_sb)) return PTR_ERR(devlink_sb); err = devlink_sb_pool_index_get_from_info(devlink_sb, info, &pool_index); if (err) return err; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD)) return -EINVAL; threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); return devlink_sb_port_pool_set(devlink_port, devlink_sb->index, pool_index, threshold, info->extack); } static int devlink_nl_sb_tc_pool_bind_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_port *devlink_port, struct devlink_sb *devlink_sb, u16 tc_index, enum devlink_sb_pool_type pool_type, enum devlink_command cmd, u32 portid, u32 seq, int flags) { const struct devlink_ops *ops = devlink->ops; u16 pool_index; u32 threshold; void *hdr; int err; err = ops->sb_tc_pool_bind_get(devlink_port, devlink_sb->index, tc_index, pool_type, &pool_index, &threshold); if (err) return err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) goto nla_put_failure; if (nla_put_u16(msg, DEVLINK_ATTR_SB_TC_INDEX, tc_index)) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_type)) goto nla_put_failure; if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold)) goto nla_put_failure; if (ops->sb_occ_tc_port_bind_get) { u32 cur; u32 max; err = ops->sb_occ_tc_port_bind_get(devlink_port, devlink_sb->index, tc_index, pool_type, &cur, &max); if (err && err != -EOPNOTSUPP) return err; if (!err) { if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max)) goto nla_put_failure; } } genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; struct devlink_sb *devlink_sb; struct sk_buff *msg; enum devlink_sb_pool_type pool_type; u16 tc_index; int err; devlink_sb = devlink_sb_get_from_info(devlink, info); if (IS_ERR(devlink_sb)) return PTR_ERR(devlink_sb); err = devlink_sb_pool_type_get_from_info(info, &pool_type); if (err) return err; err = devlink_sb_tc_index_get_from_info(devlink_sb, info, pool_type, &tc_index); if (err) return err; if (!devlink->ops->sb_tc_pool_bind_get) return -EOPNOTSUPP; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port, devlink_sb, tc_index, pool_type, DEVLINK_CMD_SB_TC_POOL_BIND_NEW, info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, int start, int *p_idx, struct devlink *devlink, struct devlink_sb *devlink_sb, u32 portid, u32 seq, int flags) { struct devlink_port *devlink_port; unsigned long port_index; u16 tc_index; int err; xa_for_each(&devlink->ports, port_index, devlink_port) { for (tc_index = 0; tc_index < devlink_sb->ingress_tc_count; tc_index++) { if (*p_idx < start) { (*p_idx)++; continue; } err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port, devlink_sb, tc_index, DEVLINK_SB_POOL_TYPE_INGRESS, DEVLINK_CMD_SB_TC_POOL_BIND_NEW, portid, seq, flags); if (err) return err; (*p_idx)++; } for (tc_index = 0; tc_index < devlink_sb->egress_tc_count; tc_index++) { if (*p_idx < start) { (*p_idx)++; continue; } err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port, devlink_sb, tc_index, DEVLINK_SB_POOL_TYPE_EGRESS, DEVLINK_CMD_SB_TC_POOL_BIND_NEW, portid, seq, flags); if (err) return err; (*p_idx)++; } } return 0; } static int devlink_nl_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; int idx = 0; int err = 0; if (!devlink->ops->sb_tc_pool_bind_get) return 0; list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_tc_pool_bind_get_dumpit(msg, state->idx, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { state->idx = idx; break; } } return err; } int devlink_nl_sb_tc_pool_bind_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_sb_tc_pool_bind_get_dump_one); } static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack) { const struct devlink_ops *ops = devlink_port->devlink->ops; if (ops->sb_tc_pool_bind_set) return ops->sb_tc_pool_bind_set(devlink_port, sb_index, tc_index, pool_type, pool_index, threshold, extack); return -EOPNOTSUPP; } int devlink_nl_sb_tc_pool_bind_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; enum devlink_sb_pool_type pool_type; struct devlink_sb *devlink_sb; u16 tc_index; u16 pool_index; u32 threshold; int err; devlink_sb = devlink_sb_get_from_info(devlink, info); if (IS_ERR(devlink_sb)) return PTR_ERR(devlink_sb); err = devlink_sb_pool_type_get_from_info(info, &pool_type); if (err) return err; err = devlink_sb_tc_index_get_from_info(devlink_sb, info, pool_type, &tc_index); if (err) return err; err = devlink_sb_pool_index_get_from_info(devlink_sb, info, &pool_index); if (err) return err; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD)) return -EINVAL; threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index, tc_index, pool_type, pool_index, threshold, info->extack); } int devlink_nl_sb_occ_snapshot_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; struct devlink_sb *devlink_sb; devlink_sb = devlink_sb_get_from_info(devlink, info); if (IS_ERR(devlink_sb)) return PTR_ERR(devlink_sb); if (ops->sb_occ_snapshot) return ops->sb_occ_snapshot(devlink, devlink_sb->index); return -EOPNOTSUPP; } int devlink_nl_sb_occ_max_clear_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; struct devlink_sb *devlink_sb; devlink_sb = devlink_sb_get_from_info(devlink, info); if (IS_ERR(devlink_sb)) return PTR_ERR(devlink_sb); if (ops->sb_occ_max_clear) return ops->sb_occ_max_clear(devlink, devlink_sb->index); return -EOPNOTSUPP; } int devl_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, u16 egress_tc_count) { struct devlink_sb *devlink_sb; lockdep_assert_held(&devlink->lock); if (devlink_sb_index_exists(devlink, sb_index)) return -EEXIST; devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL); if (!devlink_sb) return -ENOMEM; devlink_sb->index = sb_index; devlink_sb->size = size; devlink_sb->ingress_pools_count = ingress_pools_count; devlink_sb->egress_pools_count = egress_pools_count; devlink_sb->ingress_tc_count = ingress_tc_count; devlink_sb->egress_tc_count = egress_tc_count; list_add_tail(&devlink_sb->list, &devlink->sb_list); return 0; } EXPORT_SYMBOL_GPL(devl_sb_register); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, u16 egress_tc_count) { int err; devl_lock(devlink); err = devl_sb_register(devlink, sb_index, size, ingress_pools_count, egress_pools_count, ingress_tc_count, egress_tc_count); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_sb_register); void devl_sb_unregister(struct devlink *devlink, unsigned int sb_index) { struct devlink_sb *devlink_sb; lockdep_assert_held(&devlink->lock); devlink_sb = devlink_sb_get_by_index(devlink, sb_index); WARN_ON(!devlink_sb); list_del(&devlink_sb->list); kfree(devlink_sb); } EXPORT_SYMBOL_GPL(devl_sb_unregister); void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) { devl_lock(devlink); devl_sb_unregister(devlink, sb_index); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_sb_unregister);
19 39 442 11 748 747 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 /* SPDX-License-Identifier: GPL-2.0-only */ /* include/net/xdp.h * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ #include <linux/bitfield.h> #include <linux/filter.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* skb_shared_info */ /** * DOC: XDP RX-queue information * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring queues. It is information that is specific to how * the driver has configured a given RX-ring queue. * * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP * data-path read-access to RX-info for both kernel and bpf-side * (limited subset). * * For now, direct access is only safe while running in NAPI/softirq * context. Contents are read-mostly and must not be updated during * driver NAPI/softirq poll. * * The driver usage API is a register and unregister API. * * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver * naturally needs to stop the RX-ring before purging and reallocating * memory. In that process the driver MUST call unregister (which * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup. */ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; /* XDP flags for ndo_xdp_xmit */ #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ u32 id; }; struct page_pool; struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; struct xdp_mem_info mem; unsigned int napi_id; u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { struct net_device *dev; }; enum xdp_buff_flags { XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under * pressure */ }; struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_HAS_FRAGS; } static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) { xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { xdp->frame_sz = frame_sz; xdp->rxq = rxq; xdp->flags = 0; } static __always_inline void xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, int headroom, int data_len, const bool meta_valid) { unsigned char *data = hard_start + headroom; xdp->data_hard_start = hard_start; xdp->data = data; xdp->data_end = data + data_len; xdp->data_meta = meta_valid ? data : data + 1; } /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the * XDP/BPF data access to data_hard_end. Notice same area (and size) * is used for XDP_PASS, when constructing the SKB via build_skb(). */ #define xdp_data_hard_end(xdp) \ ((xdp)->data_hard_start + (xdp)->frame_sz - \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * xdp_get_shared_info_from_buff(struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) { unsigned int len = xdp->data_end - xdp->data; struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); len += sinfo->xdp_frags_size; out: return len; } struct xdp_frame { void *data; u16 len; u16 headroom; u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem info is valid on remote CPU. */ struct xdp_mem_info mem; struct net_device *dev_rx; /* used by cpumap */ u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; void *xa; void *q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) { /* bq->count will be zero'ed when bq->xa gets updated */ bq->xa = NULL; } static inline struct skb_shared_info * xdp_get_shared_info_from_frame(struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); return (struct skb_shared_info *)(data_hard_start + frame->frame_sz - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); } struct xdp_cpumap_stats { unsigned int redirect; unsigned int pass; unsigned int drop; }; /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { frame->data = NULL; frame->dev_rx = NULL; } static inline void xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, unsigned int size, unsigned int truesize, bool pfmemalloc) { skb_shinfo(skb)->nr_frags = nr_frags; skb->len += size; skb->data_len += size; skb->truesize += truesize; skb->pfmemalloc |= pfmemalloc; } /* Avoids inlining WARN macro in fast-path */ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; xdp->data_end = frame->data + frame->len; xdp->data_meta = frame->data - frame->metasize; xdp->frame_sz = frame->frame_sz; xdp->flags = frame->flags; } static inline int xdp_update_frame_from_buff(struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; /* Assure headroom is available for storing info */ headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; metasize = metasize > 0 ? metasize : 0; if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return -ENOSPC; /* Catch if driver didn't reserve tailroom for skb_shared_info */ if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { XDP_WARN("Driver BUG: missing reserved tailroom"); return -ENOSPC; } xdp_frame->data = xdp->data; xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; xdp_frame->frame_sz = xdp->frame_sz; xdp_frame->flags = xdp->flags; return 0; } /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) { struct xdp_frame *xdp_frame; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ xdp_frame->mem = xdp->rxq->mem; return xdp_frame; } void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; unsigned int len = xdpf->len; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); len += sinfo->xdp_frags_size; out: return len; } int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id, u32 frag_size); static inline int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id) { return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0); } void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator); void xdp_unreg_mem_model(struct xdp_mem_info *mem); /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. */ static __always_inline void xdp_set_data_meta_invalid(struct xdp_buff *xdp) { xdp->data_meta = xdp->data + 1; } static __always_inline bool xdp_data_meta_unsupported(const struct xdp_buff *xdp) { return unlikely(xdp->data_meta > xdp->data); } static inline bool xdp_metalen_invalid(unsigned long metalen) { unsigned long meta_max; meta_max = type_max(typeof_member(struct skb_shared_info, meta_len)); BUILD_BUG_ON(!__builtin_constant_p(meta_max)); return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max; } struct xdp_attachment_info { struct bpf_prog *prog; u32 flags; }; struct netdev_bpf; void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf); #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE /* Define the relationship between xdp-rx-metadata kfunc and * various other entities: * - xdp_rx_metadata enum * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml) * - kfunc name * - xdp_metadata_ops field */ #define XDP_METADATA_KFUNC_xxx \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ NETDEV_XDP_RX_METADATA_TIMESTAMP, \ bpf_xdp_metadata_rx_timestamp, \ xmo_rx_timestamp) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \ NETDEV_XDP_RX_METADATA_VLAN_TAG, \ bpf_xdp_metadata_rx_vlan_tag, \ xmo_rx_vlan_tag) \ enum xdp_rx_metadata { #define XDP_METADATA_KFUNC(name, _, __, ___) name, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC MAX_XDP_METADATA_KFUNC, }; enum xdp_rss_hash_type { /* First part: Individual bits for L3/L4 types */ XDP_RSS_L3_IPV4 = BIT(0), XDP_RSS_L3_IPV6 = BIT(1), /* The fixed (L3) IPv4 and IPv6 headers can both be followed by * variable/dynamic headers, IPv4 called Options and IPv6 called * Extension Headers. HW RSS type can contain this info. */ XDP_RSS_L3_DYNHDR = BIT(2), /* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in * addition to the protocol specific bit. This ease interaction with * SKBs and avoids reserving a fixed mask for future L4 protocol bits. */ XDP_RSS_L4 = BIT(3), /* L4 based hash, proto can be unknown */ XDP_RSS_L4_TCP = BIT(4), XDP_RSS_L4_UDP = BIT(5), XDP_RSS_L4_SCTP = BIT(6), XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ XDP_RSS_L4_ICMP = BIT(8), /* Second part: RSS hash type combinations used for driver HW mapping */ XDP_RSS_TYPE_NONE = 0, XDP_RSS_TYPE_L2 = XDP_RSS_TYPE_NONE, XDP_RSS_TYPE_L3_IPV4 = XDP_RSS_L3_IPV4, XDP_RSS_TYPE_L3_IPV6 = XDP_RSS_L3_IPV6, XDP_RSS_TYPE_L3_IPV4_OPT = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L3_IPV6_EX = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_ANY = XDP_RSS_L4, XDP_RSS_TYPE_L4_IPV4_TCP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV4_ICMP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV6_ICMP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR, }; struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type); int (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto, u16 *vlan_tci); }; #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } static inline void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) { } static inline void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) { } static inline void xdp_features_clear_redirect_target(struct net_device *dev) { } #endif static inline void xdp_clear_features_flag(struct net_device *dev) { xdp_set_features_flag(dev, 0); } static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus * under local_bh_disable(), which provides the needed RCU protection * for accessing map entries. */ u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) act = xdp_master_redirect(xdp); } return act; } #endif /* __LINUX_NET_XDP_H__ */
5 1 5 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2003-2004, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2014-2015, Qualcomm Atheros, Inc. * * Rewrite: Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/err.h> #include <linux/scatterlist.h> #include <crypto/aead.h> #include "aead_api.h" int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len, u8 *data, size_t data_len, u8 *mic) { size_t mic_len = crypto_aead_authsize(tfm); struct scatterlist sg[3]; struct aead_request *aead_req; int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); u8 *__aad; int ret; aead_req = kzalloc(reqsize + aad_len, GFP_ATOMIC); if (!aead_req) return -ENOMEM; __aad = (u8 *)aead_req + reqsize; memcpy(__aad, aad, aad_len); sg_init_table(sg, 3); sg_set_buf(&sg[0], __aad, aad_len); sg_set_buf(&sg[1], data, data_len); sg_set_buf(&sg[2], mic, mic_len); aead_request_set_tfm(aead_req, tfm); aead_request_set_crypt(aead_req, sg, sg, data_len, b_0); aead_request_set_ad(aead_req, sg[0].length); ret = crypto_aead_encrypt(aead_req); kfree_sensitive(aead_req); return ret; } int aead_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len, u8 *data, size_t data_len, u8 *mic) { size_t mic_len = crypto_aead_authsize(tfm); struct scatterlist sg[3]; struct aead_request *aead_req; int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); u8 *__aad; int err; if (data_len == 0) return -EINVAL; aead_req = kzalloc(reqsize + aad_len, GFP_ATOMIC); if (!aead_req) return -ENOMEM; __aad = (u8 *)aead_req + reqsize; memcpy(__aad, aad, aad_len); sg_init_table(sg, 3); sg_set_buf(&sg[0], __aad, aad_len); sg_set_buf(&sg[1], data, data_len); sg_set_buf(&sg[2], mic, mic_len); aead_request_set_tfm(aead_req, tfm); aead_request_set_crypt(aead_req, sg, sg, data_len + mic_len, b_0); aead_request_set_ad(aead_req, sg[0].length); err = crypto_aead_decrypt(aead_req); kfree_sensitive(aead_req); return err; } struct crypto_aead * aead_key_setup_encrypt(const char *alg, const u8 key[], size_t key_len, size_t mic_len) { struct crypto_aead *tfm; int err; tfm = crypto_alloc_aead(alg, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) return tfm; err = crypto_aead_setkey(tfm, key, key_len); if (err) goto free_aead; err = crypto_aead_setauthsize(tfm, mic_len); if (err) goto free_aead; return tfm; free_aead: crypto_free_aead(tfm); return ERR_PTR(err); } void aead_key_free(struct crypto_aead *tfm) { crypto_free_aead(tfm); }
14 618 11 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 /* SPDX-License-Identifier: GPL-2.0 */ /* linux/net/inet/arp.h */ #ifndef _ARP_H #define _ARP_H #include <linux/if_arp.h> #include <linux/hash.h> #include <net/neighbour.h> extern struct neigh_table arp_tbl; static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd) { u32 key = *(const u32 *)pkey; u32 val = key ^ hash32_ptr(dev); return val * hash_rnd[0]; } #ifdef CONFIG_INET static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) key = INADDR_ANY; return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } #else static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { return NULL; } #endif static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); if (n && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); neigh_confirm(n); rcu_read_unlock(); } void arp_init(void); int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg); void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *th); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); void arp_ifdown(struct net_device *dev); int arp_invalidate(struct net_device *dev, __be32 ip, bool force); struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw); void arp_xmit(struct sk_buff *skb); #endif /* _ARP_H */
29 29 29 29 29 29 29 28 29 29 29 29 29 29 29 18 18 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 // SPDX-License-Identifier: GPL-2.0-or-later /* incoming call handling * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/errqueue.h> #include <linux/udp.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/icmp.h> #include <linux/gfp.h> #include <linux/circ_buf.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <net/ip.h> #include "ar-internal.h" static void rxrpc_dummy_notify(struct sock *sk, struct rxrpc_call *call, unsigned long user_call_ID) { } /* * Preallocate a single service call, connection and peer and, if possible, * give them a user ID and attach the user's side of the ID to them. */ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, struct rxrpc_backlog *b, rxrpc_notify_rx_t notify_rx, rxrpc_user_attach_call_t user_attach_call, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { struct rxrpc_call *call, *xcall; struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); struct rb_node *parent, **pp; int max, tmp; unsigned int size = RXRPC_BACKLOG_MAX; unsigned int head, tail, call_head, call_tail; max = rx->sk.sk_max_ack_backlog; tmp = rx->sk.sk_ack_backlog; if (tmp >= max) { _leave(" = -ENOBUFS [full %u]", max); return -ENOBUFS; } max -= tmp; /* We don't need more conns and peers than we have calls, but on the * other hand, we shouldn't ever use more peers than conns or conns * than calls. */ call_head = b->call_backlog_head; call_tail = READ_ONCE(b->call_backlog_tail); tmp = CIRC_CNT(call_head, call_tail, size); if (tmp >= max) { _leave(" = -ENOBUFS [enough %u]", tmp); return -ENOBUFS; } max = tmp + 1; head = b->peer_backlog_head; tail = READ_ONCE(b->peer_backlog_tail); if (CIRC_CNT(head, tail, size) < max) { struct rxrpc_peer *peer; peer = rxrpc_alloc_peer(rx->local, gfp, rxrpc_peer_new_prealloc); if (!peer) return -ENOMEM; b->peer_backlog[head] = peer; smp_store_release(&b->peer_backlog_head, (head + 1) & (size - 1)); } head = b->conn_backlog_head; tail = READ_ONCE(b->conn_backlog_tail); if (CIRC_CNT(head, tail, size) < max) { struct rxrpc_connection *conn; conn = rxrpc_prealloc_service_connection(rxnet, gfp); if (!conn) return -ENOMEM; b->conn_backlog[head] = conn; smp_store_release(&b->conn_backlog_head, (head + 1) & (size - 1)); } /* Now it gets complicated, because calls get registered with the * socket here, with a user ID preassigned by the user. */ call = rxrpc_alloc_call(rx, gfp, debug_id); if (!call) return -ENOMEM; call->flags |= (1 << RXRPC_CALL_IS_SERVICE); rxrpc_set_call_state(call, RXRPC_CALL_SERVER_PREALLOC); __set_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), user_call_ID, rxrpc_call_new_prealloc_service); write_lock(&rx->call_lock); /* Check the user ID isn't already in use */ pp = &rx->calls.rb_node; parent = NULL; while (*pp) { parent = *pp; xcall = rb_entry(parent, struct rxrpc_call, sock_node); if (user_call_ID < xcall->user_call_ID) pp = &(*pp)->rb_left; else if (user_call_ID > xcall->user_call_ID) pp = &(*pp)->rb_right; else goto id_in_use; } call->user_call_ID = user_call_ID; call->notify_rx = notify_rx; if (user_attach_call) { rxrpc_get_call(call, rxrpc_call_get_kernel_service); user_attach_call(call, user_call_ID); } rxrpc_get_call(call, rxrpc_call_get_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); set_bit(RXRPC_CALL_HAS_USERID, &call->flags); list_add(&call->sock_link, &rx->sock_calls); write_unlock(&rx->call_lock); rxnet = call->rxnet; spin_lock(&rxnet->call_lock); list_add_tail_rcu(&call->link, &rxnet->calls); spin_unlock(&rxnet->call_lock); b->call_backlog[call_head] = call; smp_store_release(&b->call_backlog_head, (call_head + 1) & (size - 1)); _leave(" = 0 [%d -> %lx]", call->debug_id, user_call_ID); return 0; id_in_use: write_unlock(&rx->call_lock); rxrpc_cleanup_call(call); _leave(" = -EBADSLT"); return -EBADSLT; } /* * Allocate the preallocation buffers for incoming service calls. These must * be charged manually. */ int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp) { struct rxrpc_backlog *b = rx->backlog; if (!b) { b = kzalloc(sizeof(struct rxrpc_backlog), gfp); if (!b) return -ENOMEM; rx->backlog = b; } return 0; } /* * Discard the preallocation on a service. */ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) { struct rxrpc_backlog *b = rx->backlog; struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); unsigned int size = RXRPC_BACKLOG_MAX, head, tail; if (!b) return; rx->backlog = NULL; /* Make sure that there aren't any incoming calls in progress before we * clear the preallocation buffers. */ spin_lock(&rx->incoming_lock); spin_unlock(&rx->incoming_lock); head = b->peer_backlog_head; tail = b->peer_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_peer *peer = b->peer_backlog[tail]; rxrpc_put_local(peer->local, rxrpc_local_put_prealloc_peer); kfree(peer); tail = (tail + 1) & (size - 1); } head = b->conn_backlog_head; tail = b->conn_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_connection *conn = b->conn_backlog[tail]; write_lock(&rxnet->conn_lock); list_del(&conn->link); list_del(&conn->proc_link); write_unlock(&rxnet->conn_lock); kfree(conn); if (atomic_dec_and_test(&rxnet->nr_conns)) wake_up_var(&rxnet->nr_conns); tail = (tail + 1) & (size - 1); } head = b->call_backlog_head; tail = b->call_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_call *call = b->call_backlog[tail]; rcu_assign_pointer(call->socket, rx); if (rx->discard_new_call) { _debug("discard %lx", call->user_call_ID); rx->discard_new_call(call, call->user_call_ID); if (call->notify_rx) call->notify_rx = rxrpc_dummy_notify; rxrpc_put_call(call, rxrpc_call_put_kernel); } rxrpc_call_completed(call); rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put_discard_prealloc); tail = (tail + 1) & (size - 1); } kfree(b); } /* * Allocate a new incoming call from the prealloc pool, along with a connection * and a peer as necessary. */ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_local *local, struct rxrpc_peer *peer, struct rxrpc_connection *conn, const struct rxrpc_security *sec, struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb) { struct rxrpc_backlog *b = rx->backlog; struct rxrpc_call *call; unsigned short call_head, conn_head, peer_head; unsigned short call_tail, conn_tail, peer_tail; unsigned short call_count, conn_count; /* #calls >= #conns >= #peers must hold true. */ call_head = smp_load_acquire(&b->call_backlog_head); call_tail = b->call_backlog_tail; call_count = CIRC_CNT(call_head, call_tail, RXRPC_BACKLOG_MAX); conn_head = smp_load_acquire(&b->conn_backlog_head); conn_tail = b->conn_backlog_tail; conn_count = CIRC_CNT(conn_head, conn_tail, RXRPC_BACKLOG_MAX); ASSERTCMP(conn_count, >=, call_count); peer_head = smp_load_acquire(&b->peer_backlog_head); peer_tail = b->peer_backlog_tail; ASSERTCMP(CIRC_CNT(peer_head, peer_tail, RXRPC_BACKLOG_MAX), >=, conn_count); if (call_count == 0) return NULL; if (!conn) { if (peer && !rxrpc_get_peer_maybe(peer, rxrpc_peer_get_service_conn)) peer = NULL; if (!peer) { peer = b->peer_backlog[peer_tail]; peer->srx = *peer_srx; b->peer_backlog[peer_tail] = NULL; smp_store_release(&b->peer_backlog_tail, (peer_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); rxrpc_new_incoming_peer(local, peer); } /* Now allocate and set up the connection */ conn = b->conn_backlog[conn_tail]; b->conn_backlog[conn_tail] = NULL; smp_store_release(&b->conn_backlog_tail, (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); conn->local = rxrpc_get_local(local, rxrpc_local_get_prealloc_conn); conn->peer = peer; rxrpc_see_connection(conn, rxrpc_conn_see_new_service_conn); rxrpc_new_incoming_connection(rx, conn, sec, skb); } else { rxrpc_get_connection(conn, rxrpc_conn_get_service_conn); atomic_inc(&conn->active); } /* And now we can allocate and set up a new call */ call = b->call_backlog[call_tail]; b->call_backlog[call_tail] = NULL; smp_store_release(&b->call_backlog_tail, (call_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); rxrpc_see_call(call, rxrpc_call_see_accept); call->local = rxrpc_get_local(conn->local, rxrpc_local_get_call); call->conn = conn; call->security = conn->security; call->security_ix = conn->security_ix; call->peer = rxrpc_get_peer(conn->peer, rxrpc_peer_get_accept); call->dest_srx = peer->srx; call->cong_ssthresh = call->peer->cong_ssthresh; call->tx_last_sent = ktime_get_real(); return call; } /* * Set up a new incoming call. Called from the I/O thread. * * If this is for a kernel service, when we allocate the call, it will have * three refs on it: (1) the kernel service, (2) the user_call_ID tree, (3) the * retainer ref obtained from the backlog buffer. Prealloc calls for userspace * services only have the ref from the backlog buffer. * * If we want to report an error, we mark the skb with the packet type and * abort code and return false. */ bool rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_peer *peer, struct rxrpc_connection *conn, struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb) { const struct rxrpc_security *sec = NULL; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_call *call = NULL; struct rxrpc_sock *rx; _enter(""); /* Don't set up a call for anything other than a DATA packet. */ if (sp->hdr.type != RXRPC_PACKET_TYPE_DATA) return rxrpc_protocol_error(skb, rxrpc_eproto_no_service_call); read_lock(&local->services_lock); /* Weed out packets to services we're not offering. Packets that would * begin a call are explicitly rejected and the rest are just * discarded. */ rx = local->service; if (!rx || (sp->hdr.serviceId != rx->srx.srx_service && sp->hdr.serviceId != rx->second_service) ) { if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && sp->hdr.seq == 1) goto unsupported_service; goto discard; } if (!conn) { sec = rxrpc_get_incoming_security(rx, skb); if (!sec) goto unsupported_security; } spin_lock(&rx->incoming_lock); if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || rx->sk.sk_state == RXRPC_CLOSE) { rxrpc_direct_abort(skb, rxrpc_abort_shut_down, RX_INVALID_OPERATION, -ESHUTDOWN); goto no_call; } call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, peer_srx, skb); if (!call) { skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; goto no_call; } trace_rxrpc_receive(call, rxrpc_receive_incoming, sp->hdr.serial, sp->hdr.seq); /* Make the call live. */ rxrpc_incoming_call(rx, call, skb); conn = call->conn; if (rx->notify_new_call) rx->notify_new_call(&rx->sk, call, call->user_call_ID); spin_lock(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_UNSECURED) { conn->state = RXRPC_CONN_SERVICE_CHALLENGING; set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); rxrpc_queue_conn(call->conn, rxrpc_conn_queue_challenge); } spin_unlock(&conn->state_lock); spin_unlock(&rx->incoming_lock); read_unlock(&local->services_lock); if (hlist_unhashed(&call->error_link)) { spin_lock(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); spin_unlock(&call->peer->lock); } _leave(" = %p{%d}", call, call->debug_id); rxrpc_input_call_event(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); return true; unsupported_service: read_unlock(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EOPNOTSUPP); unsupported_security: read_unlock(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EKEYREJECTED); no_call: spin_unlock(&rx->incoming_lock); read_unlock(&local->services_lock); _leave(" = f [%u]", skb->mark); return false; discard: read_unlock(&local->services_lock); return true; } /* * Charge up socket with preallocated calls, attaching user call IDs. */ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) { struct rxrpc_backlog *b = rx->backlog; if (rx->sk.sk_state == RXRPC_CLOSE) return -ESHUTDOWN; return rxrpc_service_prealloc_one(rx, b, NULL, NULL, user_call_ID, GFP_KERNEL, atomic_inc_return(&rxrpc_debug_id)); } /* * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls * @sock: The socket on which to preallocate * @notify_rx: Event notification function for the call * @user_attach_call: Func to attach call to user_call_ID * @user_call_ID: The tag to attach to the preallocated call * @gfp: The allocation conditions. * @debug_id: The tracing debug ID. * * Charge up the socket with preallocated calls, each with a user ID. A * function should be provided to effect the attachment from the user's side. * The user is given a ref to hold on the call. * * Note that the call may be come connected before this function returns. */ int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, rxrpc_user_attach_call_t user_attach_call, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct rxrpc_backlog *b = rx->backlog; if (sock->sk->sk_state == RXRPC_CLOSE) return -ESHUTDOWN; return rxrpc_service_prealloc_one(rx, b, notify_rx, user_attach_call, user_call_ID, gfp, debug_id); } EXPORT_SYMBOL(rxrpc_kernel_charge_accept);
1 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FUTEX_H #define _FUTEX_H #include <linux/futex.h> #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> #include <linux/compat.h> #include <linux/uaccess.h> #ifdef CONFIG_PREEMPT_RT #include <linux/rcuwait.h> #endif #include <asm/futex.h> /* * Futex flags used to encode options to functions and preserve them across * restarts. */ #define FLAGS_SIZE_8 0x0000 #define FLAGS_SIZE_16 0x0001 #define FLAGS_SIZE_32 0x0002 #define FLAGS_SIZE_64 0x0003 #define FLAGS_SIZE_MASK 0x0003 #ifdef CONFIG_MMU # define FLAGS_SHARED 0x0010 #else /* * NOMMU does not have per process address space. Let the compiler optimize * code away. */ # define FLAGS_SHARED 0x0000 #endif #define FLAGS_CLOCKRT 0x0020 #define FLAGS_HAS_TIMEOUT 0x0040 #define FLAGS_NUMA 0x0080 #define FLAGS_STRICT 0x0100 /* FUTEX_ to FLAGS_ */ static inline unsigned int futex_to_flags(unsigned int op) { unsigned int flags = FLAGS_SIZE_32; if (!(op & FUTEX_PRIVATE_FLAG)) flags |= FLAGS_SHARED; if (op & FUTEX_CLOCK_REALTIME) flags |= FLAGS_CLOCKRT; return flags; } #define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE) /* FUTEX2_ to FLAGS_ */ static inline unsigned int futex2_to_flags(unsigned int flags2) { unsigned int flags = flags2 & FUTEX2_SIZE_MASK; if (!(flags2 & FUTEX2_PRIVATE)) flags |= FLAGS_SHARED; if (flags2 & FUTEX2_NUMA) flags |= FLAGS_NUMA; return flags; } static inline unsigned int futex_size(unsigned int flags) { return 1 << (flags & FLAGS_SIZE_MASK); } static inline bool futex_flags_valid(unsigned int flags) { /* Only 64bit futexes for 64bit code */ if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) { if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64) return false; } /* Only 32bit futexes are implemented -- for now */ if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) return false; return true; } static inline bool futex_validate_input(unsigned int flags, u64 val) { int bits = 8 * futex_size(flags); if (bits < 64 && (val >> bits)) return false; return true; } #ifdef CONFIG_FAIL_FUTEX extern bool should_fail_futex(bool fshared); #else static inline bool should_fail_futex(bool fshared) { return false; } #endif /* * Hash buckets are shared by all the futex_keys that hash to the same * location. Each key may have multiple futex_q structures, one for each task * waiting on a futex. */ struct futex_hash_bucket { atomic_t waiters; spinlock_t lock; struct plist_head chain; } ____cacheline_aligned_in_smp; /* * Priority Inheritance state: */ struct futex_pi_state { /* * list of 'owned' pi_state instances - these have to be * cleaned up in do_exit() if the task exits prematurely: */ struct list_head list; /* * The PI object: */ struct rt_mutex_base pi_mutex; struct task_struct *owner; refcount_t refcount; union futex_key key; } __randomize_layout; struct futex_q; typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q); /** * struct futex_q - The hashed futex queue entry, one per waiting task * @list: priority-sorted list of tasks waiting on this futex * @task: the task waiting on the futex * @lock_ptr: the hash bucket lock * @wake: the wake handler for this queue * @wake_data: data associated with the wake handler * @key: the key the futex is hashed on * @pi_state: optional priority inheritance state * @rt_waiter: rt_waiter storage for use with requeue_pi * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * @requeue_state: State field for futex_requeue_pi() * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) * * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. * The order of wakeup is always to make the first condition true, then * the second. * * PI futexes are typically woken before they are removed from the hash list via * the rt_mutex code. See futex_unqueue_pi(). */ struct futex_q { struct plist_node list; struct task_struct *task; spinlock_t *lock_ptr; futex_wake_fn *wake; void *wake_data; union futex_key key; struct futex_pi_state *pi_state; struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; atomic_t requeue_state; #ifdef CONFIG_PREEMPT_RT struct rcuwait requeue_wait; #endif } __randomize_layout; extern const struct futex_q futex_q_init; enum futex_access { FUTEX_READ, FUTEX_WRITE }; extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw); extern struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns); extern struct futex_hash_bucket *futex_hash(union futex_key *key); /** * futex_match - Check whether two futex keys are equal * @key1: Pointer to key1 * @key2: Pointer to key2 * * Return 1 if two futex_keys are equal, 0 otherwise. */ static inline int futex_match(union futex_key *key1, union futex_key *key2) { return (key1 && key2 && key1->both.word == key2->both.word && key1->both.ptr == key2->both.ptr && key1->both.offset == key2->both.offset); } extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb); extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout); extern bool __futex_wake_mark(struct futex_q *q); extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); extern int fault_in_user_writeable(u32 __user *uaddr); extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key); static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) { int ret; pagefault_disable(); ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); return ret; } /* * This does a plain atomic user space read, and the user pointer has * already been verified earlier by get_futex_key() to be both aligned * and actually in user space, just like futex_atomic_cmpxchg_inatomic(). * * We still want to avoid any speculation, and while __get_user() is * the traditional model for this, it's actually slower than doing * this manually these days. * * We could just have a per-architecture special function for it, * the same way we do futex_atomic_cmpxchg_inatomic(), but rather * than force everybody to do that, write it out long-hand using * the low-level user-access infrastructure. * * This looks a bit overkill, but generally just results in a couple * of instructions. */ static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) { u32 val; if (can_do_masked_user_access()) from = masked_user_access_begin(from); else if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(val, from, Efault); user_read_access_end(); *dest = val; return 0; Efault: user_read_access_end(); return -EFAULT; } static inline int futex_get_value_locked(u32 *dest, u32 __user *from) { int ret; pagefault_disable(); ret = futex_read_inatomic(dest, from); pagefault_enable(); return ret; } extern void __futex_unqueue(struct futex_q *q); extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb); extern int futex_unqueue(struct futex_q *q); /** * futex_queue() - Enqueue the futex_q on the futex_hash_bucket * @q: The futex_q to enqueue * @hb: The destination hash bucket * * The hb->lock must be held by the caller, and is released here. A call to * futex_queue() is typically paired with exactly one call to futex_unqueue(). The * exceptions involve the PI related operations, which may use futex_unqueue_pi() * or nothing if the unqueue is done as part of the wake process and the unqueue * state is implicit in the state of woken task (see futex_wait_requeue_pi() for * an example). */ static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) __releases(&hb->lock) { __futex_queue(q, hb); spin_unlock(&hb->lock); } extern void futex_unqueue_pi(struct futex_q *q); extern void wait_for_owner_exiting(int ret, struct task_struct *exiting); /* * Reflects a new waiter being added to the waitqueue. */ static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP atomic_inc(&hb->waiters); /* * Full barrier (A), see the ordering comment above. */ smp_mb__after_atomic(); #endif } /* * Reflects a waiter being removed from the waitqueue by wakeup * paths. */ static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP atomic_dec(&hb->waiters); #endif } static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP /* * Full barrier (B), see the ordering comment above. */ smp_mb(); return atomic_read(&hb->waiters); #else return 1; #endif } extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q); extern void futex_q_unlock(struct futex_hash_bucket *hb); extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps, struct task_struct *task, struct task_struct **exiting, int set_waiters); extern int refill_pi_state_cache(void); extern void get_pi_state(struct futex_pi_state *pi_state); extern void put_pi_state(struct futex_pi_state *pi_state); extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked); /* * Express the locking dependencies for lockdep: */ static inline void double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { if (hb1 > hb2) swap(hb1, hb2); spin_lock(&hb1->lock); if (hb1 != hb2) spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); } static inline void double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { spin_unlock(&hb1->lock); if (hb1 != hb2) spin_unlock(&hb2->lock); } /* syscalls */ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, u32 __user *uaddr2); extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1, u32 __user *uaddr2, unsigned int flags2, int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi); extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset); extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset); /** * struct futex_vector - Auxiliary struct for futex_waitv() * @w: Userspace provided data * @q: Kernel side data * * Struct used to build an array with all data need for futex_waitv() */ struct futex_vector { struct futex_waitv w; struct futex_q q; }; extern int futex_parse_waitv(struct futex_vector *futexv, struct futex_waitv __user *uwaitv, unsigned int nr_futexes, futex_wake_fn *wake, void *wake_data); extern int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken); extern int futex_unqueue_multiple(struct futex_vector *v, int count); extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to); extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset); extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op); extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags); extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock); #endif /* _FUTEX_H */
142 1119 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef MPLS_INTERNAL_H #define MPLS_INTERNAL_H #include <net/mpls.h> /* put a reasonable limit on the number of labels * we will accept from userspace */ #define MAX_NEW_LABELS 30 struct mpls_entry_decoded { u32 label; u8 ttl; u8 tc; u8 bos; }; struct mpls_pcpu_stats { struct mpls_link_stats stats; struct u64_stats_sync syncp; }; struct mpls_dev { int input_enabled; struct net_device *dev; struct mpls_pcpu_stats __percpu *stats; struct ctl_table_header *sysctl; struct rcu_head rcu; }; #if BITS_PER_LONG == 32 #define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \ do { \ __typeof__(*(mdev)->stats) *ptr = \ raw_cpu_ptr((mdev)->stats); \ local_bh_disable(); \ u64_stats_update_begin(&ptr->syncp); \ ptr->stats.pkts_field++; \ ptr->stats.bytes_field += (len); \ u64_stats_update_end(&ptr->syncp); \ local_bh_enable(); \ } while (0) #define MPLS_INC_STATS(mdev, field) \ do { \ __typeof__(*(mdev)->stats) *ptr = \ raw_cpu_ptr((mdev)->stats); \ local_bh_disable(); \ u64_stats_update_begin(&ptr->syncp); \ ptr->stats.field++; \ u64_stats_update_end(&ptr->syncp); \ local_bh_enable(); \ } while (0) #else #define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \ do { \ this_cpu_inc((mdev)->stats->stats.pkts_field); \ this_cpu_add((mdev)->stats->stats.bytes_field, (len)); \ } while (0) #define MPLS_INC_STATS(mdev, field) \ this_cpu_inc((mdev)->stats->stats.field) #endif struct sk_buff; #define LABEL_NOT_SPECIFIED (1 << 20) /* This maximum ha length copied from the definition of struct neighbour */ #define VIA_ALEN_ALIGN sizeof(unsigned long) #define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, VIA_ALEN_ALIGN)) enum mpls_payload_type { MPT_UNSPEC, /* IPv4 or IPv6 */ MPT_IPV4 = 4, MPT_IPV6 = 6, /* Other types not implemented: * - Pseudo-wire with or without control word (RFC4385) * - GAL (RFC5586) */ }; struct mpls_nh { /* next hop label forwarding entry */ struct net_device *nh_dev; /* nh_flags is accessed under RCU in the packet path; it is * modified handling netdev events with rtnl lock held */ unsigned int nh_flags; u8 nh_labels; u8 nh_via_alen; u8 nh_via_table; u8 nh_reserved1; u32 nh_label[]; }; /* offset of via from beginning of mpls_nh */ #define MPLS_NH_VIA_OFF(num_labels) \ ALIGN(sizeof(struct mpls_nh) + (num_labels) * sizeof(u32), \ VIA_ALEN_ALIGN) /* all nexthops within a route have the same size based on the * max number of labels and max via length across all nexthops */ #define MPLS_NH_SIZE(num_labels, max_via_alen) \ (MPLS_NH_VIA_OFF((num_labels)) + \ ALIGN((max_via_alen), VIA_ALEN_ALIGN)) enum mpls_ttl_propagation { MPLS_TTL_PROP_DEFAULT, MPLS_TTL_PROP_ENABLED, MPLS_TTL_PROP_DISABLED, }; /* The route, nexthops and vias are stored together in the same memory * block: * * +----------------------+ * | mpls_route | * +----------------------+ * | mpls_nh 0 | * +----------------------+ * | alignment padding | 4 bytes for odd number of labels * +----------------------+ * | via[rt_max_alen] 0 | * +----------------------+ * | alignment padding | via's aligned on sizeof(unsigned long) * +----------------------+ * | ... | * +----------------------+ * | mpls_nh n-1 | * +----------------------+ * | via[rt_max_alen] n-1 | * +----------------------+ */ struct mpls_route { /* next hop label forwarding entry */ struct rcu_head rt_rcu; u8 rt_protocol; u8 rt_payload_type; u8 rt_max_alen; u8 rt_ttl_propagate; u8 rt_nhn; /* rt_nhn_alive is accessed under RCU in the packet path; it * is modified handling netdev events with rtnl lock held */ u8 rt_nhn_alive; u8 rt_nh_size; u8 rt_via_offset; u8 rt_reserved1; struct mpls_nh rt_nh[]; }; #define for_nexthops(rt) { \ int nhsel; const struct mpls_nh *nh; \ for (nhsel = 0, nh = (rt)->rt_nh; \ nhsel < (rt)->rt_nhn; \ nh = (void *)nh + (rt)->rt_nh_size, nhsel++) #define change_nexthops(rt) { \ int nhsel; struct mpls_nh *nh; \ for (nhsel = 0, nh = (rt)->rt_nh; \ nhsel < (rt)->rt_nhn; \ nh = (void *)nh + (rt)->rt_nh_size, nhsel++) #define endfor_nexthops(rt) } static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *hdr) { struct mpls_entry_decoded result; unsigned entry = be32_to_cpu(hdr->label_stack_entry); result.label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; result.ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; result.tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; result.bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; return result; } static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) { return rcu_dereference_rtnl(dev->mpls_ptr); } int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels, u32 label[], struct netlink_ext_ack *extack); bool mpls_output_possible(const struct net_device *dev); unsigned int mpls_dev_mtu(const struct net_device *dev); bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); void mpls_stats_inc_outucastpkts(struct net_device *dev, const struct sk_buff *skb); #endif /* MPLS_INTERNAL_H */
6 1 14 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_GSO_H #define _NET_GSO_H #include <linux/skbuff.h> /* Keeps track of mac header offset relative to skb->head. * It is useful for TSO of Tunneling protocol. e.g. GRE. * For non-tunnel skb it points to skb_mac_header() and for * tunnel skb it points to outer mac header. * Keeps track of level of encapsulation of network headers. */ struct skb_gso_cb { union { int mac_offset; int data_offset; }; int encap_level; __wsum csum; __u16 csum_start; }; #define SKB_GSO_CB_OFFSET 32 #define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET)) static inline int skb_tnl_header_len(const struct sk_buff *inner_skb) { return (skb_mac_header(inner_skb) - inner_skb->head) - SKB_GSO_CB(inner_skb)->mac_offset; } static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) { int new_headroom, headroom; int ret; headroom = skb_headroom(skb); ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC); if (ret) return ret; new_headroom = skb_headroom(skb); SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom); return 0; } static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res) { /* Do not update partial checksums if remote checksum is enabled. */ if (skb->remcsum_offload) return; SKB_GSO_CB(skb)->csum = res; SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head; } /* Compute the checksum for a gso segment. First compute the checksum value * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and * then add in skb->csum (checksum from csum_start to end of packet). * skb->csum and csum_start are then updated to reflect the checksum of the * resultant packet starting from the transport header-- the resultant checksum * is in the res argument (i.e. normally zero or ~ of checksum of a pseudo * header. */ static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res) { unsigned char *csum_start = skb_transport_header(skb); int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start; __wsum partial = SKB_GSO_CB(skb)->csum; SKB_GSO_CB(skb)->csum = res; SKB_GSO_CB(skb)->csum_start = csum_start - skb->head; return csum_fold(csum_partial(csum_start, plen, partial)); } struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) { return __skb_gso_segment(skb, features, true); } struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, netdev_features_t features, __be16 type); struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features); bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) { skb->protocol = protocol; skb->encapsulation = 1; skb_push(skb, pulled_hlen); skb_reset_transport_header(skb); skb->mac_header = mac_offset; skb->network_header = skb->mac_header + mac_len; skb->mac_len = mac_len; } #endif /* _NET_GSO_H */
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2018 Oracle and/or its affiliates. All rights reserved. */ #include <crypto/aead.h> #include <linux/debugfs.h> #include <net/xfrm.h> #include "netdevsim.h" #define NSIM_IPSEC_AUTH_BITS 128 static ssize_t nsim_dbg_netdev_ops_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct netdevsim *ns = filp->private_data; struct nsim_ipsec *ipsec = &ns->ipsec; size_t bufsize; char *buf, *p; int len; int i; /* the buffer needed is * (num SAs * 3 lines each * ~60 bytes per line) + one more line */ bufsize = (ipsec->count * 4 * 60) + 60; buf = kzalloc(bufsize, GFP_KERNEL); if (!buf) return -ENOMEM; p = buf; p += scnprintf(p, bufsize - (p - buf), "SA count=%u tx=%u\n", ipsec->count, ipsec->tx); for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) { struct nsim_sa *sap = &ipsec->sa[i]; if (!sap->used) continue; if (sap->xs->props.family == AF_INET6) p += scnprintf(p, bufsize - (p - buf), "sa[%i] %cx ipaddr=%pI6c\n", i, (sap->rx ? 'r' : 't'), &sap->ipaddr); else p += scnprintf(p, bufsize - (p - buf), "sa[%i] %cx ipaddr=%pI4\n", i, (sap->rx ? 'r' : 't'), &sap->ipaddr[3]); p += scnprintf(p, bufsize - (p - buf), "sa[%i] spi=0x%08x proto=0x%x salt=0x%08x crypt=%d\n", i, be32_to_cpu(sap->xs->id.spi), sap->xs->id.proto, sap->salt, sap->crypt); p += scnprintf(p, bufsize - (p - buf), "sa[%i] key=0x%08x %08x %08x %08x\n", i, sap->key[0], sap->key[1], sap->key[2], sap->key[3]); } len = simple_read_from_buffer(buffer, count, ppos, buf, p - buf); kfree(buf); return len; } static const struct file_operations ipsec_dbg_fops = { .owner = THIS_MODULE, .open = simple_open, .read = nsim_dbg_netdev_ops_read, }; static int nsim_ipsec_find_empty_idx(struct nsim_ipsec *ipsec) { u32 i; if (ipsec->count == NSIM_IPSEC_MAX_SA_COUNT) return -ENOSPC; /* search sa table */ for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) { if (!ipsec->sa[i].used) return i; } return -ENOSPC; } static int nsim_ipsec_parse_proto_keys(struct xfrm_state *xs, u32 *mykey, u32 *mysalt) { const char aes_gcm_name[] = "rfc4106(gcm(aes))"; struct net_device *dev = xs->xso.real_dev; unsigned char *key_data; char *alg_name = NULL; int key_len; if (!xs->aead) { netdev_err(dev, "Unsupported IPsec algorithm\n"); return -EINVAL; } if (xs->aead->alg_icv_len != NSIM_IPSEC_AUTH_BITS) { netdev_err(dev, "IPsec offload requires %d bit authentication\n", NSIM_IPSEC_AUTH_BITS); return -EINVAL; } key_data = &xs->aead->alg_key[0]; key_len = xs->aead->alg_key_len; alg_name = xs->aead->alg_name; if (strcmp(alg_name, aes_gcm_name)) { netdev_err(dev, "Unsupported IPsec algorithm - please use %s\n", aes_gcm_name); return -EINVAL; } /* 160 accounts for 16 byte key and 4 byte salt */ if (key_len > NSIM_IPSEC_AUTH_BITS) { *mysalt = ((u32 *)key_data)[4]; } else if (key_len == NSIM_IPSEC_AUTH_BITS) { *mysalt = 0; } else { netdev_err(dev, "IPsec hw offload only supports 128 bit keys with optional 32 bit salt\n"); return -EINVAL; } memcpy(mykey, key_data, 16); return 0; } static int nsim_ipsec_add_sa(struct xfrm_state *xs, struct netlink_ext_ack *extack) { struct nsim_ipsec *ipsec; struct net_device *dev; struct netdevsim *ns; struct nsim_sa sa; u16 sa_idx; int ret; dev = xs->xso.real_dev; ns = netdev_priv(dev); ipsec = &ns->ipsec; if (xs->id.proto != IPPROTO_ESP && xs->id.proto != IPPROTO_AH) { NL_SET_ERR_MSG_MOD(extack, "Unsupported protocol for ipsec offload"); return -EINVAL; } if (xs->calg) { NL_SET_ERR_MSG_MOD(extack, "Compression offload not supported"); return -EINVAL; } if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) { NL_SET_ERR_MSG_MOD(extack, "Unsupported ipsec offload type"); return -EINVAL; } /* find the first unused index */ ret = nsim_ipsec_find_empty_idx(ipsec); if (ret < 0) { NL_SET_ERR_MSG_MOD(extack, "No space for SA in Rx table!"); return ret; } sa_idx = (u16)ret; memset(&sa, 0, sizeof(sa)); sa.used = true; sa.xs = xs; if (sa.xs->id.proto & IPPROTO_ESP) sa.crypt = xs->ealg || xs->aead; /* get the key and salt */ ret = nsim_ipsec_parse_proto_keys(xs, sa.key, &sa.salt); if (ret) { NL_SET_ERR_MSG_MOD(extack, "Failed to get key data for SA table"); return ret; } if (xs->xso.dir == XFRM_DEV_OFFLOAD_IN) sa.rx = true; if (xs->props.family == AF_INET6) memcpy(sa.ipaddr, &xs->id.daddr.a6, 16); else memcpy(&sa.ipaddr[3], &xs->id.daddr.a4, 4); /* the preparations worked, so save the info */ memcpy(&ipsec->sa[sa_idx], &sa, sizeof(sa)); /* the XFRM stack doesn't like offload_handle == 0, * so add a bitflag in case our array index is 0 */ xs->xso.offload_handle = sa_idx | NSIM_IPSEC_VALID; ipsec->count++; return 0; } static void nsim_ipsec_del_sa(struct xfrm_state *xs) { struct netdevsim *ns = netdev_priv(xs->xso.real_dev); struct nsim_ipsec *ipsec = &ns->ipsec; u16 sa_idx; sa_idx = xs->xso.offload_handle & ~NSIM_IPSEC_VALID; if (!ipsec->sa[sa_idx].used) { netdev_err(ns->netdev, "Invalid SA for delete sa_idx=%d\n", sa_idx); return; } memset(&ipsec->sa[sa_idx], 0, sizeof(struct nsim_sa)); ipsec->count--; } static bool nsim_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs) { struct netdevsim *ns = netdev_priv(xs->xso.real_dev); struct nsim_ipsec *ipsec = &ns->ipsec; ipsec->ok++; return true; } static const struct xfrmdev_ops nsim_xfrmdev_ops = { .xdo_dev_state_add = nsim_ipsec_add_sa, .xdo_dev_state_delete = nsim_ipsec_del_sa, .xdo_dev_offload_ok = nsim_ipsec_offload_ok, }; bool nsim_ipsec_tx(struct netdevsim *ns, struct sk_buff *skb) { struct sec_path *sp = skb_sec_path(skb); struct nsim_ipsec *ipsec = &ns->ipsec; struct xfrm_state *xs; struct nsim_sa *tsa; u32 sa_idx; /* do we even need to check this packet? */ if (!sp) return true; if (unlikely(!sp->len)) { netdev_err(ns->netdev, "no xfrm state len = %d\n", sp->len); return false; } xs = xfrm_input_state(skb); if (unlikely(!xs)) { netdev_err(ns->netdev, "no xfrm_input_state() xs = %p\n", xs); return false; } sa_idx = xs->xso.offload_handle & ~NSIM_IPSEC_VALID; if (unlikely(sa_idx >= NSIM_IPSEC_MAX_SA_COUNT)) { netdev_err(ns->netdev, "bad sa_idx=%d max=%d\n", sa_idx, NSIM_IPSEC_MAX_SA_COUNT); return false; } tsa = &ipsec->sa[sa_idx]; if (unlikely(!tsa->used)) { netdev_err(ns->netdev, "unused sa_idx=%d\n", sa_idx); return false; } if (xs->id.proto != IPPROTO_ESP && xs->id.proto != IPPROTO_AH) { netdev_err(ns->netdev, "unexpected proto=%d\n", xs->id.proto); return false; } ipsec->tx++; return true; } void nsim_ipsec_init(struct netdevsim *ns) { ns->netdev->xfrmdev_ops = &nsim_xfrmdev_ops; #define NSIM_ESP_FEATURES (NETIF_F_HW_ESP | \ NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) ns->netdev->features |= NSIM_ESP_FEATURES; ns->netdev->hw_enc_features |= NSIM_ESP_FEATURES; ns->ipsec.pfile = debugfs_create_file("ipsec", 0400, ns->nsim_dev_port->ddir, ns, &ipsec_dbg_fops); } void nsim_ipsec_teardown(struct netdevsim *ns) { struct nsim_ipsec *ipsec = &ns->ipsec; if (ipsec->count) netdev_err(ns->netdev, "tearing down IPsec offload with %d SAs left\n", ipsec->count); debugfs_remove_recursive(ipsec->pfile); }
8 8 8 10 10 10 21 21 21 21 21 21 21 10 10 10 10 21 21 41 29 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 // SPDX-License-Identifier: GPL-2.0 /* * cfg80211 MLME SAP interface * * Copyright (c) 2009, Jouni Malinen <j@w1.fi> * Copyright (c) 2015 Intel Deutschland GmbH * Copyright (C) 2019-2020, 2022-2024 Intel Corporation */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/nl80211.h> #include <linux/slab.h> #include <linux/wireless.h> #include <net/cfg80211.h> #include <net/iw_handler.h> #include "core.h" #include "nl80211.h" #include "rdev-ops.h" void cfg80211_rx_assoc_resp(struct net_device *dev, const struct cfg80211_rx_assoc_resp_data *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)data->buf; struct cfg80211_connect_resp_params cr = { .timeout_reason = NL80211_TIMEOUT_UNSPECIFIED, .req_ie = data->req_ies, .req_ie_len = data->req_ies_len, .resp_ie = mgmt->u.assoc_resp.variable, .resp_ie_len = data->len - offsetof(struct ieee80211_mgmt, u.assoc_resp.variable), .status = le16_to_cpu(mgmt->u.assoc_resp.status_code), .ap_mld_addr = data->ap_mld_addr, }; unsigned int link_id; for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) { cr.links[link_id].status = data->links[link_id].status; cr.links[link_id].bss = data->links[link_id].bss; WARN_ON_ONCE(cr.links[link_id].status != WLAN_STATUS_SUCCESS && (!cr.ap_mld_addr || !cr.links[link_id].bss)); if (!cr.links[link_id].bss) continue; cr.links[link_id].bssid = data->links[link_id].bss->bssid; cr.links[link_id].addr = data->links[link_id].addr; /* need to have local link addresses for MLO connections */ WARN_ON(cr.ap_mld_addr && !is_valid_ether_addr(cr.links[link_id].addr)); BUG_ON(!cr.links[link_id].bss->channel); if (cr.links[link_id].bss->channel->band == NL80211_BAND_S1GHZ) { WARN_ON(link_id); cr.resp_ie = (u8 *)&mgmt->u.s1g_assoc_resp.variable; cr.resp_ie_len = data->len - offsetof(struct ieee80211_mgmt, u.s1g_assoc_resp.variable); } if (cr.ap_mld_addr) cr.valid_links |= BIT(link_id); } trace_cfg80211_send_rx_assoc(dev, data); /* * This is a bit of a hack, we don't notify userspace of * a (re-)association reply if we tried to send a reassoc * and got a reject -- we only try again with an assoc * frame instead of reassoc. */ if (cfg80211_sme_rx_assoc_resp(wdev, cr.status)) { for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) { struct cfg80211_bss *bss = data->links[link_id].bss; if (!bss) continue; cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wiphy, bss); } return; } nl80211_send_rx_assoc(rdev, dev, data); /* update current_bss etc., consumes the bss reference */ __cfg80211_connect_result(dev, &cr, cr.status == WLAN_STATUS_SUCCESS); } EXPORT_SYMBOL(cfg80211_rx_assoc_resp); static void cfg80211_process_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); nl80211_send_rx_auth(rdev, wdev->netdev, buf, len, GFP_KERNEL); cfg80211_sme_rx_auth(wdev, buf, len); } static void cfg80211_process_deauth(struct wireless_dev *wdev, const u8 *buf, size_t len, bool reconnect) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; const u8 *bssid = mgmt->bssid; u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr); nl80211_send_deauth(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL); if (!wdev->connected || !ether_addr_equal(wdev->u.client.connected_addr, bssid)) return; __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap); cfg80211_sme_deauth(wdev); } static void cfg80211_process_disassoc(struct wireless_dev *wdev, const u8 *buf, size_t len, bool reconnect) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; const u8 *bssid = mgmt->bssid; u16 reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr); nl80211_send_disassoc(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL); if (WARN_ON(!wdev->connected || !ether_addr_equal(wdev->u.client.connected_addr, bssid))) return; __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap); cfg80211_sme_disassoc(wdev); } void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_mgmt *mgmt = (void *)buf; lockdep_assert_wiphy(wdev->wiphy); trace_cfg80211_rx_mlme_mgmt(dev, buf, len); if (WARN_ON(len < 2)) return; if (ieee80211_is_auth(mgmt->frame_control)) cfg80211_process_auth(wdev, buf, len); else if (ieee80211_is_deauth(mgmt->frame_control)) cfg80211_process_deauth(wdev, buf, len, false); else if (ieee80211_is_disassoc(mgmt->frame_control)) cfg80211_process_disassoc(wdev, buf, len, false); } EXPORT_SYMBOL(cfg80211_rx_mlme_mgmt); void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_send_auth_timeout(dev, addr); nl80211_send_auth_timeout(rdev, dev, addr, GFP_KERNEL); cfg80211_sme_auth_timeout(wdev); } EXPORT_SYMBOL(cfg80211_auth_timeout); void cfg80211_assoc_failure(struct net_device *dev, struct cfg80211_assoc_failure *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); const u8 *addr = data->ap_mld_addr ?: data->bss[0]->bssid; int i; trace_cfg80211_send_assoc_failure(dev, data); if (data->timeout) { nl80211_send_assoc_timeout(rdev, dev, addr, GFP_KERNEL); cfg80211_sme_assoc_timeout(wdev); } else { cfg80211_sme_abandon_assoc(wdev); } for (i = 0; i < ARRAY_SIZE(data->bss); i++) { struct cfg80211_bss *bss = data->bss[i]; if (!bss) continue; cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wiphy, bss); } } EXPORT_SYMBOL(cfg80211_assoc_failure); void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len, bool reconnect) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_mgmt *mgmt = (void *)buf; lockdep_assert_wiphy(wdev->wiphy); trace_cfg80211_tx_mlme_mgmt(dev, buf, len, reconnect); if (WARN_ON(len < 2)) return; if (ieee80211_is_deauth(mgmt->frame_control)) cfg80211_process_deauth(wdev, buf, len, reconnect); else cfg80211_process_disassoc(wdev, buf, len, reconnect); } EXPORT_SYMBOL(cfg80211_tx_mlme_mgmt); void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr, enum nl80211_key_type key_type, int key_id, const u8 *tsc, gfp_t gfp) { struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; char *buf = kmalloc(128, gfp); if (buf) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = sprintf(buf, "MLME-MICHAELMICFAILURE." "indication(keyid=%d %scast addr=%pM)", key_id, key_type == NL80211_KEYTYPE_GROUP ? "broad" : "uni", addr); wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf); kfree(buf); } #endif trace_cfg80211_michael_mic_failure(dev, addr, key_type, key_id, tsc); nl80211_michael_mic_failure(rdev, dev, addr, key_type, key_id, tsc, gfp); } EXPORT_SYMBOL(cfg80211_michael_mic_failure); /* some MLME handling for userspace SME */ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_auth_request *req) { struct wireless_dev *wdev = dev->ieee80211_ptr; lockdep_assert_wiphy(wdev->wiphy); if (!req->bss) return -ENOENT; if (req->link_id >= 0 && !(wdev->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO)) return -EINVAL; if (req->auth_type == NL80211_AUTHTYPE_SHARED_KEY) { if (!req->key || !req->key_len || req->key_idx < 0 || req->key_idx > 3) return -EINVAL; } if (wdev->connected && ether_addr_equal(req->bss->bssid, wdev->u.client.connected_addr)) return -EALREADY; if (ether_addr_equal(req->bss->bssid, dev->dev_addr) || (req->link_id >= 0 && ether_addr_equal(req->ap_mld_addr, dev->dev_addr))) return -EINVAL; return rdev_auth(rdev, dev, req); } /* Do a logical ht_capa &= ht_capa_mask. */ void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa, const struct ieee80211_ht_cap *ht_capa_mask) { int i; u8 *p1, *p2; if (!ht_capa_mask) { memset(ht_capa, 0, sizeof(*ht_capa)); return; } p1 = (u8*)(ht_capa); p2 = (u8*)(ht_capa_mask); for (i = 0; i < sizeof(*ht_capa); i++) p1[i] &= p2[i]; } /* Do a logical vht_capa &= vht_capa_mask. */ void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa, const struct ieee80211_vht_cap *vht_capa_mask) { int i; u8 *p1, *p2; if (!vht_capa_mask) { memset(vht_capa, 0, sizeof(*vht_capa)); return; } p1 = (u8*)(vht_capa); p2 = (u8*)(vht_capa_mask); for (i = 0; i < sizeof(*vht_capa); i++) p1[i] &= p2[i]; } static int cfg80211_mlme_check_mlo_compat(const struct ieee80211_multi_link_elem *mle_a, const struct ieee80211_multi_link_elem *mle_b, struct netlink_ext_ack *extack) { const struct ieee80211_mle_basic_common_info *common_a, *common_b; common_a = (const void *)mle_a->variable; common_b = (const void *)mle_b->variable; if (memcmp(common_a->mld_mac_addr, common_b->mld_mac_addr, ETH_ALEN)) { NL_SET_ERR_MSG(extack, "AP MLD address mismatch"); return -EINVAL; } if (ieee80211_mle_get_eml_cap((const u8 *)mle_a) != ieee80211_mle_get_eml_cap((const u8 *)mle_b)) { NL_SET_ERR_MSG(extack, "link EML capabilities mismatch"); return -EINVAL; } if (ieee80211_mle_get_mld_capa_op((const u8 *)mle_a) != ieee80211_mle_get_mld_capa_op((const u8 *)mle_b)) { NL_SET_ERR_MSG(extack, "link MLD capabilities/ops mismatch"); return -EINVAL; } return 0; } static int cfg80211_mlme_check_mlo(struct net_device *dev, struct cfg80211_assoc_request *req, struct netlink_ext_ack *extack) { const struct ieee80211_multi_link_elem *mles[ARRAY_SIZE(req->links)] = {}; int i; if (req->link_id < 0) return 0; if (!req->links[req->link_id].bss) { NL_SET_ERR_MSG(extack, "no BSS for assoc link"); return -EINVAL; } rcu_read_lock(); for (i = 0; i < ARRAY_SIZE(req->links); i++) { const struct cfg80211_bss_ies *ies; const struct element *ml; if (!req->links[i].bss) continue; if (ether_addr_equal(req->links[i].bss->bssid, dev->dev_addr)) { NL_SET_ERR_MSG(extack, "BSSID must not be our address"); req->links[i].error = -EINVAL; goto error; } ies = rcu_dereference(req->links[i].bss->ies); ml = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK, ies->data, ies->len); if (!ml) { NL_SET_ERR_MSG(extack, "MLO BSS w/o ML element"); req->links[i].error = -EINVAL; goto error; } if (!ieee80211_mle_type_ok(ml->data + 1, IEEE80211_ML_CONTROL_TYPE_BASIC, ml->datalen - 1)) { NL_SET_ERR_MSG(extack, "BSS with invalid ML element"); req->links[i].error = -EINVAL; goto error; } mles[i] = (const void *)(ml->data + 1); if (ieee80211_mle_get_link_id((const u8 *)mles[i]) != i) { NL_SET_ERR_MSG(extack, "link ID mismatch"); req->links[i].error = -EINVAL; goto error; } } if (WARN_ON(!mles[req->link_id])) goto error; for (i = 0; i < ARRAY_SIZE(req->links); i++) { if (i == req->link_id || !req->links[i].bss) continue; if (WARN_ON(!mles[i])) goto error; if (cfg80211_mlme_check_mlo_compat(mles[req->link_id], mles[i], extack)) { req->links[i].error = -EINVAL; goto error; } } rcu_read_unlock(); return 0; error: rcu_read_unlock(); return -EINVAL; } /* Note: caller must cfg80211_put_bss() regardless of result */ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_assoc_request *req, struct netlink_ext_ack *extack) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); err = cfg80211_mlme_check_mlo(dev, req, extack); if (err) return err; if (wdev->connected && (!req->prev_bssid || !ether_addr_equal(wdev->u.client.connected_addr, req->prev_bssid))) return -EALREADY; if ((req->bss && ether_addr_equal(req->bss->bssid, dev->dev_addr)) || (req->link_id >= 0 && ether_addr_equal(req->ap_mld_addr, dev->dev_addr))) return -EINVAL; cfg80211_oper_and_ht_capa(&req->ht_capa_mask, rdev->wiphy.ht_capa_mod_mask); cfg80211_oper_and_vht_capa(&req->vht_capa_mask, rdev->wiphy.vht_capa_mod_mask); err = rdev_assoc(rdev, dev, req); if (!err) { int link_id; if (req->bss) { cfg80211_ref_bss(&rdev->wiphy, req->bss); cfg80211_hold_bss(bss_from_pub(req->bss)); } for (link_id = 0; link_id < ARRAY_SIZE(req->links); link_id++) { if (!req->links[link_id].bss) continue; cfg80211_ref_bss(&rdev->wiphy, req->links[link_id].bss); cfg80211_hold_bss(bss_from_pub(req->links[link_id].bss)); } } return err; } int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *bssid, const u8 *ie, int ie_len, u16 reason, bool local_state_change) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_deauth_request req = { .bssid = bssid, .reason_code = reason, .ie = ie, .ie_len = ie_len, .local_state_change = local_state_change, }; lockdep_assert_wiphy(wdev->wiphy); if (local_state_change && (!wdev->connected || !ether_addr_equal(wdev->u.client.connected_addr, bssid))) return 0; if (ether_addr_equal(wdev->disconnect_bssid, bssid) || (wdev->connected && ether_addr_equal(wdev->u.client.connected_addr, bssid))) wdev->conn_owner_nlportid = 0; return rdev_deauth(rdev, dev, &req); } int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *ap_addr, const u8 *ie, int ie_len, u16 reason, bool local_state_change) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_disassoc_request req = { .reason_code = reason, .local_state_change = local_state_change, .ie = ie, .ie_len = ie_len, .ap_addr = ap_addr, }; int err; lockdep_assert_wiphy(wdev->wiphy); if (!wdev->connected) return -ENOTCONN; if (memcmp(wdev->u.client.connected_addr, ap_addr, ETH_ALEN)) return -ENOTCONN; err = rdev_disassoc(rdev, dev, &req); if (err) return err; /* driver should have reported the disassoc */ WARN_ON(wdev->connected); return 0; } void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; u8 bssid[ETH_ALEN]; lockdep_assert_wiphy(wdev->wiphy); if (!rdev->ops->deauth) return; if (!wdev->connected) return; memcpy(bssid, wdev->u.client.connected_addr, ETH_ALEN); cfg80211_mlme_deauth(rdev, dev, bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); } struct cfg80211_mgmt_registration { struct list_head list; struct wireless_dev *wdev; u32 nlportid; int match_len; __le16 frame_type; bool multicast_rx; u8 match[]; }; static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct wireless_dev *tmp; struct cfg80211_mgmt_registration *reg; struct mgmt_frame_regs upd = {}; lockdep_assert_held(&rdev->wiphy.mtx); spin_lock_bh(&rdev->mgmt_registrations_lock); if (!wdev->mgmt_registrations_need_update) { spin_unlock_bh(&rdev->mgmt_registrations_lock); return; } rcu_read_lock(); list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) { list_for_each_entry(reg, &tmp->mgmt_registrations, list) { u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4); u32 mcast_mask = 0; if (reg->multicast_rx) mcast_mask = mask; upd.global_stypes |= mask; upd.global_mcast_stypes |= mcast_mask; if (tmp == wdev) { upd.interface_stypes |= mask; upd.interface_mcast_stypes |= mcast_mask; } } } rcu_read_unlock(); wdev->mgmt_registrations_need_update = 0; spin_unlock_bh(&rdev->mgmt_registrations_lock); rdev_update_mgmt_frame_registrations(rdev, wdev, &upd); } void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; rdev = container_of(wk, struct cfg80211_registered_device, mgmt_registrations_update_wk); wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_mgmt_registrations_update(wdev); wiphy_unlock(&rdev->wiphy); } int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, int match_len, bool multicast_rx, struct netlink_ext_ack *extack) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; bool update_multicast = false; if (!wdev->wiphy->mgmt_stypes) return -EOPNOTSUPP; if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT) { NL_SET_ERR_MSG(extack, "frame type not management"); return -EINVAL; } if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) { NL_SET_ERR_MSG(extack, "Invalid frame type"); return -EINVAL; } mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4; if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type))) { NL_SET_ERR_MSG(extack, "Registration to specific type not supported"); return -EINVAL; } /* * To support Pre Association Security Negotiation (PASN), registration * for authentication frames should be supported. However, as some * versions of the user space daemons wrongly register to all types of * authentication frames (which might result in unexpected behavior) * allow such registration if the request is for a specific * authentication algorithm number. */ if (wdev->iftype == NL80211_IFTYPE_STATION && (frame_type & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_AUTH && !(match_data && match_len >= 2)) { NL_SET_ERR_MSG(extack, "Authentication algorithm number required"); return -EINVAL; } nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL); if (!nreg) return -ENOMEM; spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { int mlen = min(match_len, reg->match_len); if (frame_type != le16_to_cpu(reg->frame_type)) continue; if (memcmp(reg->match, match_data, mlen) == 0) { if (reg->multicast_rx != multicast_rx) { update_multicast = true; reg->multicast_rx = multicast_rx; break; } NL_SET_ERR_MSG(extack, "Match already configured"); err = -EALREADY; break; } } if (err) goto out; if (update_multicast) { kfree(nreg); } else { memcpy(nreg->match, match_data, match_len); nreg->match_len = match_len; nreg->nlportid = snd_portid; nreg->frame_type = cpu_to_le16(frame_type); nreg->wdev = wdev; nreg->multicast_rx = multicast_rx; list_add(&nreg->list, &wdev->mgmt_registrations); } wdev->mgmt_registrations_need_update = 1; spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); return 0; out: kfree(nreg); spin_unlock_bh(&rdev->mgmt_registrations_lock); return err; } void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { if (reg->nlportid != nlportid) continue; list_del(&reg->list); kfree(reg); wdev->mgmt_registrations_need_update = 1; schedule_work(&rdev->mgmt_registrations_update_wk); } spin_unlock_bh(&rdev->mgmt_registrations_lock); if (nlportid && rdev->crit_proto_nlportid == nlportid) { rdev->crit_proto_nlportid = 0; rdev_crit_proto_stop(rdev, wdev); } if (nlportid == wdev->ap_unexpected_nlportid) wdev->ap_unexpected_nlportid = 0; } void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { list_del(&reg->list); kfree(reg); } wdev->mgmt_registrations_need_update = 1; spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); } static bool cfg80211_allowed_address(struct wireless_dev *wdev, const u8 *addr) { int i; for_each_valid_link(wdev, i) { if (ether_addr_equal(addr, wdev->links[i].addr)) return true; } return ether_addr_equal(addr, wdev_address(wdev)); } static bool cfg80211_allowed_random_address(struct wireless_dev *wdev, const struct ieee80211_mgmt *mgmt) { if (ieee80211_is_auth(mgmt->frame_control) || ieee80211_is_deauth(mgmt->frame_control)) { /* Allow random TA to be used with authentication and * deauthentication frames if the driver has indicated support. */ if (wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA)) return true; } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) { /* Allow random TA to be used with Public Action frames if the * driver has indicated support. */ if (!wdev->connected && wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA)) return true; if (wdev->connected && wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED)) return true; } return false; } int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { const struct ieee80211_mgmt *mgmt; u16 stype; lockdep_assert_wiphy(&rdev->wiphy); if (!wdev->wiphy->mgmt_stypes) return -EOPNOTSUPP; if (!rdev->ops->mgmt_tx) return -EOPNOTSUPP; if (params->len < 24 + 1) return -EINVAL; mgmt = (const struct ieee80211_mgmt *)params->buf; if (!ieee80211_is_mgmt(mgmt->frame_control)) return -EINVAL; stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE; if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].tx & BIT(stype >> 4))) return -EINVAL; if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) { int err = 0; switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: /* * check for IBSS DA must be done by driver as * cfg80211 doesn't track the stations */ if (!wdev->u.ibss.current_bss || !ether_addr_equal(wdev->u.ibss.current_bss->pub.bssid, mgmt->bssid)) { err = -ENOTCONN; break; } break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: if (!wdev->connected) { err = -ENOTCONN; break; } /* FIXME: MLD may address this differently */ if (!ether_addr_equal(wdev->u.client.connected_addr, mgmt->bssid)) { err = -ENOTCONN; break; } /* for station, check that DA is the AP */ if (!ether_addr_equal(wdev->u.client.connected_addr, mgmt->da)) { err = -ENOTCONN; break; } break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP_VLAN: if (!ether_addr_equal(mgmt->bssid, wdev_address(wdev)) && (params->link_id < 0 || !ether_addr_equal(mgmt->bssid, wdev->links[params->link_id].addr))) err = -EINVAL; break; case NL80211_IFTYPE_MESH_POINT: if (!ether_addr_equal(mgmt->sa, mgmt->bssid)) { err = -EINVAL; break; } /* * check for mesh DA must be done by driver as * cfg80211 doesn't track the stations */ break; case NL80211_IFTYPE_P2P_DEVICE: /* * fall through, P2P device only supports * public action frames */ case NL80211_IFTYPE_NAN: default: err = -EOPNOTSUPP; break; } if (err) return err; } if (!cfg80211_allowed_address(wdev, mgmt->sa) && !cfg80211_allowed_random_address(wdev, mgmt)) return -EINVAL; /* Transmit the management frame as requested by user space */ return rdev_mgmt_tx(rdev, wdev, params, cookie); } bool cfg80211_rx_mgmt_ext(struct wireless_dev *wdev, struct cfg80211_rx_info *info) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg; const struct ieee80211_txrx_stypes *stypes = &wiphy->mgmt_stypes[wdev->iftype]; struct ieee80211_mgmt *mgmt = (void *)info->buf; const u8 *data; int data_len; bool result = false; __le16 ftype = mgmt->frame_control & cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE); u16 stype; trace_cfg80211_rx_mgmt(wdev, info); stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4; if (!(stypes->rx & BIT(stype))) { trace_cfg80211_return_bool(false); return false; } data = info->buf + ieee80211_hdrlen(mgmt->frame_control); data_len = info->len - ieee80211_hdrlen(mgmt->frame_control); spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { if (reg->frame_type != ftype) continue; if (reg->match_len > data_len) continue; if (memcmp(reg->match, data, reg->match_len)) continue; /* found match! */ /* Indicate the received Action frame to user space */ if (nl80211_send_mgmt(rdev, wdev, reg->nlportid, info, GFP_ATOMIC)) continue; result = true; break; } spin_unlock_bh(&rdev->mgmt_registrations_lock); trace_cfg80211_return_bool(result); return result; } EXPORT_SYMBOL(cfg80211_rx_mgmt_ext); void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev) { cancel_delayed_work(&rdev->dfs_update_channels_wk); queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, 0); } void cfg80211_dfs_channels_update_work(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); struct cfg80211_registered_device *rdev; struct cfg80211_chan_def chandef; struct ieee80211_supported_band *sband; struct ieee80211_channel *c; struct wiphy *wiphy; bool check_again = false; unsigned long timeout, next_time = 0; unsigned long time_dfs_update; enum nl80211_radar_event radar_event; int bandid, i; rdev = container_of(delayed_work, struct cfg80211_registered_device, dfs_update_channels_wk); wiphy = &rdev->wiphy; rtnl_lock(); for (bandid = 0; bandid < NUM_NL80211_BANDS; bandid++) { sband = wiphy->bands[bandid]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { c = &sband->channels[i]; if (!(c->flags & IEEE80211_CHAN_RADAR)) continue; if (c->dfs_state != NL80211_DFS_UNAVAILABLE && c->dfs_state != NL80211_DFS_AVAILABLE) continue; if (c->dfs_state == NL80211_DFS_UNAVAILABLE) { time_dfs_update = IEEE80211_DFS_MIN_NOP_TIME_MS; radar_event = NL80211_RADAR_NOP_FINISHED; } else { if (regulatory_pre_cac_allowed(wiphy) || cfg80211_any_wiphy_oper_chan(wiphy, c)) continue; time_dfs_update = REG_PRE_CAC_EXPIRY_GRACE_MS; radar_event = NL80211_RADAR_PRE_CAC_EXPIRED; } timeout = c->dfs_state_entered + msecs_to_jiffies(time_dfs_update); if (time_after_eq(jiffies, timeout)) { c->dfs_state = NL80211_DFS_USABLE; c->dfs_state_entered = jiffies; cfg80211_chandef_create(&chandef, c, NL80211_CHAN_NO_HT); nl80211_radar_notify(rdev, &chandef, radar_event, NULL, GFP_ATOMIC); regulatory_propagate_dfs_state(wiphy, &chandef, c->dfs_state, radar_event); continue; } if (!check_again) next_time = timeout - jiffies; else next_time = min(next_time, timeout - jiffies); check_again = true; } } rtnl_unlock(); /* reschedule if there are other channels waiting to be cleared again */ if (check_again) queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, next_time); } void __cfg80211_radar_event(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, bool offchan, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_radar_event(wiphy, chandef, offchan); /* only set the chandef supplied channel to unavailable, in * case the radar is detected on only one of multiple channels * spanned by the chandef. */ cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_UNAVAILABLE); if (offchan) queue_work(cfg80211_wq, &rdev->background_cac_abort_wk); cfg80211_sched_dfs_chan_update(rdev); nl80211_radar_notify(rdev, chandef, NL80211_RADAR_DETECTED, NULL, gfp); memcpy(&rdev->radar_chandef, chandef, sizeof(struct cfg80211_chan_def)); queue_work(cfg80211_wq, &rdev->propagate_radar_detect_wk); } EXPORT_SYMBOL(__cfg80211_radar_event); void cfg80211_cac_event(struct net_device *netdev, const struct cfg80211_chan_def *chandef, enum nl80211_radar_event event, gfp_t gfp, unsigned int link_id) { struct wireless_dev *wdev = netdev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long timeout; if (WARN_ON(wdev->valid_links && !(wdev->valid_links & BIT(link_id)))) return; trace_cfg80211_cac_event(netdev, event, link_id); if (WARN_ON(!wdev->links[link_id].cac_started && event != NL80211_RADAR_CAC_STARTED)) return; switch (event) { case NL80211_RADAR_CAC_FINISHED: timeout = wdev->links[link_id].cac_start_time + msecs_to_jiffies(wdev->links[link_id].cac_time_ms); WARN_ON(!time_after_eq(jiffies, timeout)); cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE); memcpy(&rdev->cac_done_chandef, chandef, sizeof(struct cfg80211_chan_def)); queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk); cfg80211_sched_dfs_chan_update(rdev); fallthrough; case NL80211_RADAR_CAC_ABORTED: wdev->links[link_id].cac_started = false; break; case NL80211_RADAR_CAC_STARTED: wdev->links[link_id].cac_started = true; break; default: WARN_ON(1); return; } nl80211_radar_notify(rdev, chandef, event, netdev, gfp); } EXPORT_SYMBOL(cfg80211_cac_event); static void __cfg80211_background_cac_event(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, const struct cfg80211_chan_def *chandef, enum nl80211_radar_event event) { struct wiphy *wiphy = &rdev->wiphy; struct net_device *netdev; lockdep_assert_wiphy(&rdev->wiphy); if (!cfg80211_chandef_valid(chandef)) return; if (!rdev->background_radar_wdev) return; switch (event) { case NL80211_RADAR_CAC_FINISHED: cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE); memcpy(&rdev->cac_done_chandef, chandef, sizeof(*chandef)); queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk); cfg80211_sched_dfs_chan_update(rdev); wdev = rdev->background_radar_wdev; break; case NL80211_RADAR_CAC_ABORTED: if (!cancel_delayed_work(&rdev->background_cac_done_wk)) return; wdev = rdev->background_radar_wdev; break; case NL80211_RADAR_CAC_STARTED: break; default: return; } netdev = wdev ? wdev->netdev : NULL; nl80211_radar_notify(rdev, chandef, event, netdev, GFP_KERNEL); } static void cfg80211_background_cac_event(struct cfg80211_registered_device *rdev, const struct cfg80211_chan_def *chandef, enum nl80211_radar_event event) { wiphy_lock(&rdev->wiphy); __cfg80211_background_cac_event(rdev, rdev->background_radar_wdev, chandef, event); wiphy_unlock(&rdev->wiphy); } void cfg80211_background_cac_done_wk(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); struct cfg80211_registered_device *rdev; rdev = container_of(delayed_work, struct cfg80211_registered_device, background_cac_done_wk); cfg80211_background_cac_event(rdev, &rdev->background_radar_chandef, NL80211_RADAR_CAC_FINISHED); } void cfg80211_background_cac_abort_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, background_cac_abort_wk); cfg80211_background_cac_event(rdev, &rdev->background_radar_chandef, NL80211_RADAR_CAC_ABORTED); } void cfg80211_background_cac_abort(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); queue_work(cfg80211_wq, &rdev->background_cac_abort_wk); } EXPORT_SYMBOL(cfg80211_background_cac_abort); int cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef) { unsigned int cac_time_ms; int err; lockdep_assert_wiphy(&rdev->wiphy); if (!wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_RADAR_BACKGROUND)) return -EOPNOTSUPP; /* Offchannel chain already locked by another wdev */ if (rdev->background_radar_wdev && rdev->background_radar_wdev != wdev)