Total coverage: 70239 (4%)of 1834410
667 134 53 53 53 53 53 59 181 176 176 524 685 684 2 198 2 604 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the IP router. * * Version: @(#)route.h 1.0.4 05/27/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Fixes: * Alan Cox : Reformatted. Added ip_rt_local() * Alan Cox : Support for TCP parameters. * Alexey Kuznetsov: Major changes for new routing code. * Mike McLagan : Routing by source * Robert Olsson : Added rt_cache statistics */ #ifndef _ROUTE_H #define _ROUTE_H #include <net/dst.h> #include <net/inetpeer.h> #include <net/flow.h> #include <net/inet_sock.h> #include <net/ip_fib.h> #include <net/arp.h> #include <net/ndisc.h> #include <net/inet_dscp.h> #include <net/sock.h> #include <linux/in_route.h> #include <linux/rtnetlink.h> #include <linux/rcupdate.h> #include <linux/route.h> #include <linux/ip.h> #include <linux/cache.h> #include <linux/security.h> static inline __u8 ip_sock_rt_scope(const struct sock *sk) { if (sock_flag(sk, SOCK_LOCALROUTE)) return RT_SCOPE_LINK; return RT_SCOPE_UNIVERSE; } static inline __u8 ip_sock_rt_tos(const struct sock *sk) { return READ_ONCE(inet_sk(sk)->tos) & INET_DSCP_MASK; } struct ip_tunnel_info; struct fib_nh; struct fib_info; struct uncached_list; struct rtable { struct dst_entry dst; int rt_genid; unsigned int rt_flags; __u16 rt_type; __u8 rt_is_input; __u8 rt_uses_gateway; int rt_iif; u8 rt_gw_family; /* Info on neighbour */ union { __be32 rt_gw4; struct in6_addr rt_gw6; }; /* Miscellaneous cached information */ u32 rt_mtu_locked:1, rt_pmtu:31; }; #define dst_rtable(_ptr) container_of_const(_ptr, struct rtable, dst) /** * skb_rtable - Returns the skb &rtable * @skb: buffer */ static inline struct rtable *skb_rtable(const struct sk_buff *skb) { return dst_rtable(skb_dst(skb)); } static inline bool rt_is_input_route(const struct rtable *rt) { return rt->rt_is_input != 0; } static inline bool rt_is_output_route(const struct rtable *rt) { return rt->rt_is_input == 0; } static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr) { if (rt->rt_gw_family == AF_INET) return rt->rt_gw4; return daddr; } struct ip_rt_acct { __u32 o_bytes; __u32 o_packets; __u32 i_bytes; __u32 i_packets; }; struct rt_cache_stat { unsigned int in_slow_tot; unsigned int in_slow_mc; unsigned int in_no_route; unsigned int in_brd; unsigned int in_martian_dst; unsigned int in_martian_src; unsigned int out_slow_tot; unsigned int out_slow_mc; }; extern struct ip_rt_acct __percpu *ip_rt_acct; struct in_device; int ip_rt_init(void); void rt_cache_flush(struct net *net); void rt_flush_dev(struct net_device *dev); static inline void inet_sk_init_flowi4(const struct inet_sock *inet, struct flowi4 *fl4) { const struct ip_options_rcu *ip4_opt; const struct sock *sk; __be32 daddr; rcu_read_lock(); ip4_opt = rcu_dereference(inet->inet_opt); /* Source routing option overrides the socket destination address */ if (ip4_opt && ip4_opt->opt.srr) daddr = ip4_opt->opt.faddr; else daddr = inet->inet_daddr; rcu_read_unlock(); sk = &inet->sk; flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_uid); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); } struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *flp, const struct sk_buff *skb); struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *flp, struct fib_result *res, const struct sk_buff *skb); static inline struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp) { return ip_route_output_key_hash(net, flp, NULL); } struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, const struct sock *sk); struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig); static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 *flp) { return ip_route_output_flow(net, flp, NULL); } /* Simplistic IPv4 route lookup function. * This is only suitable for some particular use cases: since the flowi4 * structure is only partially set, it may bypass some fib-rules. */ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, __be32 saddr, dscp_t dscp, int oif, __u8 scope) { struct flowi4 fl4 = { .flowi4_oif = oif, .flowi4_tos = inet_dscp_to_dsfield(dscp), .flowi4_scope = scope, .daddr = daddr, .saddr = saddr, }; return ip_route_output_key(net, &fl4); } static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi4 *fl4, const struct sock *sk, __be32 daddr, __be32 saddr, __be16 dport, __be16 sport, __u8 proto, __u8 tos, int oif) { flowi4_init_output(fl4, oif, sk ? READ_ONCE(sk->sk_mark) : 0, tos, sk ? ip_sock_rt_scope(sk) : RT_SCOPE_UNIVERSE, proto, sk ? inet_sk_flowi_flags(sk) : 0, daddr, saddr, dport, sport, sock_net_uid(net, sk)); if (sk) security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } enum skb_drop_reason ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag); enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); enum skb_drop_reason ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint); static inline enum skb_drop_reason ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, struct net_device *devin) { enum skb_drop_reason reason; rcu_read_lock(); reason = ip_route_input_noref(skb, dst, src, dscp, devin); if (!reason) { skb_dst_force(skb); if (!skb_dst(skb)) reason = SKB_DROP_REASON_NOT_SPECIFIED; } rcu_read_unlock(); return reason; } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u8 protocol); void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk); void ip_rt_send_redirect(struct sk_buff *skb); unsigned int inet_addr_type(struct net *net, __be32 addr); unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr); unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr); void ip_rt_multicast_event(struct in_device *); int ip_rt_ioctl(struct net *, unsigned int cmd, struct rtentry *rt); void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt); struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, bool noxfrm); struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt); struct in_ifaddr; void fib_add_ifaddr(struct in_ifaddr *); void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *); void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric); void rt_add_uncached_list(struct rtable *rt); void rt_del_uncached_list(struct rtable *rt); int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fib_info *fi, int *fa_index, int fa_start, unsigned int flags); static inline void ip_rt_put(struct rtable *rt) { /* dst_release() accepts a NULL parameter. * We rely on dst being first structure in struct rtable */ BUILD_BUG_ON(offsetof(struct rtable, dst) != 0); dst_release(&rt->dst); } extern const __u8 ip_tos2prio[16]; static inline char rt_tos2priority(u8 tos) { return ip_tos2prio[IPTOS_TOS(tos)>>1]; } /* ip_route_connect() and ip_route_newports() work in tandem whilst * binding a socket for a new outgoing connection. * * In order to use IPSEC properly, we must, in the end, have a * route that was looked up using all available keys including source * and destination ports. * * However, if a source port needs to be allocated (the user specified * a wildcard source port) we need to obtain addressing information * in order to perform that allocation. * * So ip_route_connect() looks up a route using wildcarded source and * destination ports in the key, simply so that we can get a pair of * addresses to use for port allocation. * * Later, once the ports are allocated, ip_route_newports() will make * another route lookup if needed to make sure we catch any IPSEC * rules keyed on the port information. * * The callers allocate the flow key on their stack, and must pass in * the same flowi4 object to both the ip_route_connect() and the * ip_route_newports() calls. */ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 src, int oif, u8 protocol, __be16 sport, __be16 dport, const struct sock *sk) { __u8 flow_flags = 0; if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, src, dport, sport, sk->sk_uid); } static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, __be32 src, int oif, u8 protocol, __be16 sport, __be16 dport, const struct sock *sk) { struct net *net = sock_net(sk); struct rtable *rt; ip_route_connect_init(fl4, dst, src, oif, protocol, sport, dport, sk); if (!dst || !src) { rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) return rt; ip_rt_put(rt); flowi4_update_output(fl4, oif, fl4->daddr, fl4->saddr); } security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt, __be16 orig_sport, __be16 orig_dport, __be16 sport, __be16 dport, const struct sock *sk) { if (sport != orig_sport || dport != orig_dport) { fl4->fl4_dport = dport; fl4->fl4_sport = sport; ip_rt_put(rt); flowi4_update_output(fl4, sk->sk_bound_dev_if, fl4->daddr, fl4->saddr); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(sock_net(sk), fl4, sk); } return rt; } static inline int inet_iif(const struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); if (rt && rt->rt_iif) return rt->rt_iif; return skb->skb_iif; } static inline int ip4_dst_hoplimit(const struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); struct net *net = dev_net(dst->dev); if (hoplimit == 0) hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); return hoplimit; } static inline struct neighbour *ip_neigh_gw4(struct net_device *dev, __be32 daddr) { struct neighbour *neigh; neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)daddr); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &daddr, dev, false); return neigh; } static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt, struct sk_buff *skb, bool *is_v6gw) { struct net_device *dev = rt->dst.dev; struct neighbour *neigh; if (likely(rt->rt_gw_family == AF_INET)) { neigh = ip_neigh_gw4(dev, rt->rt_gw4); } else if (rt->rt_gw_family == AF_INET6) { neigh = ip_neigh_gw6(dev, &rt->rt_gw6); *is_v6gw = true; } else { neigh = ip_neigh_gw4(dev, ip_hdr(skb)->daddr); } return neigh; } #endif /* _ROUTE_H */
9 9 5 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 // SPDX-License-Identifier: GPL-2.0 /* * Interface for controlling IO bandwidth on a request queue * * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> #include "blk.h" #include "blk-cgroup-rwstat.h" #include "blk-stat.h" #include "blk-throttle.h" /* Max dispatch from a group in 1 round */ #define THROTL_GRP_QUANTUM 8 /* Total max dispatch from all groups in one round */ #define THROTL_QUANTUM 32 /* Throttling is performed over a slice and after that slice is renewed */ #define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_SSD (HZ / 50) #define MAX_THROTL_SLICE (HZ) /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) struct throtl_data { /* service tree for active throtl groups */ struct throtl_service_queue service_queue; struct request_queue *queue; /* Total Number of queued bios on READ and WRITE lists */ unsigned int nr_queued[2]; unsigned int throtl_slice; /* Work for dispatching throttled bios */ struct work_struct dispatch_work; bool track_bio_latency; }; static void throtl_pending_timer_fn(struct timer_list *t); static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) { return pd_to_blkg(&tg->pd); } /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest * * Return the throtl_grp @sq belongs to. If @sq is the top-level one * embedded in throtl_data, %NULL is returned. */ static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) { if (sq && sq->parent_sq) return container_of(sq, struct throtl_grp, service_queue); else return NULL; } /** * sq_to_td - return throtl_data the specified service queue belongs to * @sq: the throtl_service_queue of interest * * A service_queue can be embedded in either a throtl_grp or throtl_data. * Determine the associated throtl_data accordingly and return it. */ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) { struct throtl_grp *tg = sq_to_tg(sq); if (tg) return tg->td; else return container_of(sq, struct throtl_data, service_queue); } static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return U64_MAX; return tg->bps[rw]; } static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return UINT_MAX; return tg->iops[rw]; } /** * throtl_log - log debug message via blktrace * @sq: the service_queue being reported * @fmt: printf format string * @args: printf args * * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a * throtl_grp; otherwise, just "throtl". */ #define throtl_log(sq, fmt, args...) do { \ struct throtl_grp *__tg = sq_to_tg((sq)); \ struct throtl_data *__td = sq_to_td((sq)); \ \ (void)__td; \ if (likely(!blk_trace_note_message_enabled(__td->queue))) \ break; \ if ((__tg)) { \ blk_add_cgroup_trace_msg(__td->queue, \ &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\ } else { \ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ } \ } while (0) static inline unsigned int throtl_bio_data_size(struct bio *bio) { /* assume it's one sector */ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) return 512; return bio->bi_iter.bi_size; } static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); bio_list_init(&qn->bios); qn->tg = tg; } /** * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it * @bio: bio being added * @qn: qnode to add bio to * @queued: the service_queue->queued[] list @qn belongs to * * Add @bio to @qn and put @qn on @queued if it's not already on. * @qn->tg's reference count is bumped when @qn is activated. See the * comment on top of throtl_qnode definition for details. */ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, struct list_head *queued) { bio_list_add(&qn->bios, bio); if (list_empty(&qn->node)) { list_add_tail(&qn->node, queued); blkg_get(tg_to_blkg(qn->tg)); } } /** * throtl_peek_queued - peek the first bio on a qnode list * @queued: the qnode list to peek */ static struct bio *throtl_peek_queued(struct list_head *queued) { struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); bio = bio_list_peek(&qn->bios); WARN_ON_ONCE(!bio); return bio; } /** * throtl_pop_queued - pop the first bio form a qnode list * @queued: the qnode list to pop a bio from * @tg_to_put: optional out argument for throtl_grp to put * * Pop the first bio from the qnode list @queued. After popping, the first * qnode is removed from @queued if empty or moved to the end of @queued so * that the popping order is round-robin. * * When the first qnode is removed, its associated throtl_grp should be put * too. If @tg_to_put is NULL, this function automatically puts it; * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is * responsible for putting it. */ static struct bio *throtl_pop_queued(struct list_head *queued, struct throtl_grp **tg_to_put) { struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); bio = bio_list_pop(&qn->bios); WARN_ON_ONCE(!bio); if (bio_list_empty(&qn->bios)) { list_del_init(&qn->node); if (tg_to_put) *tg_to_put = qn->tg; else blkg_put(tg_to_blkg(qn->tg)); } else { list_move_tail(&qn->node, queued); } return bio; } /* init a service_queue, assumes the caller zeroed it */ static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[READ]); INIT_LIST_HEAD(&sq->queued[WRITE]); sq->pending_tree = RB_ROOT_CACHED; timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct throtl_grp *tg; int rw; tg = kzalloc_node(sizeof(*tg), gfp, disk->node_id); if (!tg) return NULL; if (blkg_rwstat_init(&tg->stat_bytes, gfp)) goto err_free_tg; if (blkg_rwstat_init(&tg->stat_ios, gfp)) goto err_exit_stat_bytes; throtl_service_queue_init(&tg->service_queue); for (rw = READ; rw <= WRITE; rw++) { throtl_qnode_init(&tg->qnode_on_self[rw], tg); throtl_qnode_init(&tg->qnode_on_parent[rw], tg); } RB_CLEAR_NODE(&tg->rb_node); tg->bps[READ] = U64_MAX; tg->bps[WRITE] = U64_MAX; tg->iops[READ] = UINT_MAX; tg->iops[WRITE] = UINT_MAX; return &tg->pd; err_exit_stat_bytes: blkg_rwstat_exit(&tg->stat_bytes); err_free_tg: kfree(tg); return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); struct blkcg_gq *blkg = tg_to_blkg(tg); struct throtl_data *td = blkg->q->td; struct throtl_service_queue *sq = &tg->service_queue; /* * If on the default hierarchy, we switch to properly hierarchical * behavior where limits on a given throtl_grp are applied to the * whole subtree rather than just the group itself. e.g. If 16M * read_bps limit is set on a parent group, summary bps of * parent group and its subtree groups can't exceed 16M for the * device. * * If not on the default hierarchy, the broken flat hierarchy * behavior is retained where all throtl_grps are treated as if * they're all separate root groups right below throtl_data. * Limits of a group don't interact with limits of other groups * regardless of the position of the group in the hierarchy. */ sq->parent_sq = &td->service_queue; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; } /* * Set has_rules[] if @tg or any of its parents have limits configured. * This doesn't require walking up to the top of the hierarchy as the * parent's has_rules[] is guaranteed to be correct. */ static void tg_update_has_rules(struct throtl_grp *tg) { struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); int rw; for (rw = READ; rw <= WRITE; rw++) { tg->has_rules_iops[rw] = (parent_tg && parent_tg->has_rules_iops[rw]) || tg_iops_limit(tg, rw) != UINT_MAX; tg->has_rules_bps[rw] = (parent_tg && parent_tg->has_rules_bps[rw]) || tg_bps_limit(tg, rw) != U64_MAX; } } static void throtl_pd_online(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ tg_update_has_rules(tg); } static void throtl_pd_free(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); blkg_rwstat_exit(&tg->stat_bytes); blkg_rwstat_exit(&tg->stat_ios); kfree(tg); } static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { struct rb_node *n; n = rb_first_cached(&parent_sq->pending_tree); WARN_ON_ONCE(!n); if (!n) return NULL; return rb_entry_tg(n); } static void throtl_rb_erase(struct rb_node *n, struct throtl_service_queue *parent_sq) { rb_erase_cached(n, &parent_sq->pending_tree); RB_CLEAR_NODE(n); } static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) { struct throtl_grp *tg; tg = throtl_rb_first(parent_sq); if (!tg) return; parent_sq->first_pending_disptime = tg->disptime; } static void tg_service_queue_add(struct throtl_grp *tg) { struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node; struct rb_node *parent = NULL; struct throtl_grp *__tg; unsigned long key = tg->disptime; bool leftmost = true; while (*node != NULL) { parent = *node; __tg = rb_entry_tg(parent); if (time_before(key, __tg->disptime)) node = &parent->rb_left; else { node = &parent->rb_right; leftmost = false; } } rb_link_node(&tg->rb_node, parent, node); rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree, leftmost); } static void throtl_enqueue_tg(struct throtl_grp *tg) { if (!(tg->flags & THROTL_TG_PENDING)) { tg_service_queue_add(tg); tg->flags |= THROTL_TG_PENDING; tg->service_queue.parent_sq->nr_pending++; } } static void throtl_dequeue_tg(struct throtl_grp *tg) { if (tg->flags & THROTL_TG_PENDING) { struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; throtl_rb_erase(&tg->rb_node, parent_sq); --parent_sq->nr_pending; tg->flags &= ~THROTL_TG_PENDING; } } /* Call with queue lock held */ static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, unsigned long expires) { unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice; /* * Since we are adjusting the throttle limit dynamically, the sleep * time calculated according to previous limit might be invalid. It's * possible the cgroup sleep time is very long and no other cgroups * have IO running so notify the limit changes. Make sure the cgroup * doesn't sleep too long to avoid the missed notification. */ if (time_after(expires, max_expire)) expires = max_expire; mod_timer(&sq->pending_timer, expires); throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", expires - jiffies, jiffies); } /** * throtl_schedule_next_dispatch - schedule the next dispatch cycle * @sq: the service_queue to schedule dispatch for * @force: force scheduling * * Arm @sq->pending_timer so that the next dispatch cycle starts on the * dispatch time of the first pending child. Returns %true if either timer * is armed or there's no pending child left. %false if the current * dispatch window is still open and the caller should continue * dispatching. * * If @force is %true, the dispatch timer is always scheduled and this * function is guaranteed to return %true. This is to be used when the * caller can't dispatch itself and needs to invoke pending_timer * unconditionally. Note that forced scheduling is likely to induce short * delay before dispatch starts even if @sq->first_pending_disptime is not * in the future and thus shouldn't be used in hot paths. */ static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, bool force) { /* any pending children left? */ if (!sq->nr_pending) return true; update_min_dispatch_time(sq); /* is the next dispatch time in the future? */ if (force || time_after(sq->first_pending_disptime, jiffies)) { throtl_schedule_pending_timer(sq, sq->first_pending_disptime); return true; } /* tell the caller to continue dispatching */ return false; } static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, bool rw, unsigned long start) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; tg->carryover_bytes[rw] = 0; tg->carryover_ios[rw] = 0; /* * Previous slice has expired. We must have trimmed it after last * bio dispatch. That means since start of last slice, we never used * that bandwidth. Do try to make use of that bandwidth while giving * credit. */ if (time_after(start, tg->slice_start[rw])) tg->slice_start[rw] = start; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; throtl_log(&tg->service_queue, "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], tg->slice_end[rw], jiffies); } static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, bool clear_carryover) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; if (clear_carryover) { tg->carryover_bytes[rw] = 0; tg->carryover_ios[rw] = 0; } throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], tg->slice_end[rw], jiffies); } static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); } static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { throtl_set_slice_end(tg, rw, jiffy_end); throtl_log(&tg->service_queue, "[%c] extend slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], tg->slice_end[rw], jiffies); } /* Determine if previously allocated or extended slice is complete or not */ static bool throtl_slice_used(struct throtl_grp *tg, bool rw) { if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) return false; return true; } static unsigned int calculate_io_allowed(u32 iops_limit, unsigned long jiffy_elapsed) { unsigned int io_allowed; u64 tmp; /* * jiffy_elapsed should not be a big value as minimum iops can be * 1 then at max jiffy elapsed should be equivalent of 1 second as we * will allow dispatch after 1 second and after that slice should * have been trimmed. */ tmp = (u64)iops_limit * jiffy_elapsed; do_div(tmp, HZ); if (tmp > UINT_MAX) io_allowed = UINT_MAX; else io_allowed = tmp; return io_allowed; } static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) { /* * Can result be wider than 64 bits? * We check against 62, not 64, due to ilog2 truncation. */ if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62) return U64_MAX; return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); } /* Trim the used slices and adjust slice start accordingly */ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { unsigned long time_elapsed; long long bytes_trim; int io_trim; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); /* * If bps are unlimited (-1), then time slice don't get * renewed. Don't try to trim the slice if slice is used. A new * slice will start when appropriate. */ if (throtl_slice_used(tg, rw)) return; /* * A bio has been dispatched. Also adjust slice_end. It might happen * that initially cgroup limit was very low resulting in high * slice_end, but later limit was bumped up and bio was dispatched * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. */ throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); time_elapsed = rounddown(jiffies - tg->slice_start[rw], tg->td->throtl_slice); if (!time_elapsed) return; bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), time_elapsed) + tg->carryover_bytes[rw]; io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) + tg->carryover_ios[rw]; if (bytes_trim <= 0 && io_trim <= 0) return; tg->carryover_bytes[rw] = 0; if ((long long)tg->bytes_disp[rw] >= bytes_trim) tg->bytes_disp[rw] -= bytes_trim; else tg->bytes_disp[rw] = 0; tg->carryover_ios[rw] = 0; if ((int)tg->io_disp[rw] >= io_trim) tg->io_disp[rw] -= io_trim; else tg->io_disp[rw] = 0; tg->slice_start[rw] += time_elapsed; throtl_log(&tg->service_queue, "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', time_elapsed / tg->td->throtl_slice, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); } static void __tg_update_carryover(struct throtl_grp *tg, bool rw) { unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; u64 bps_limit = tg_bps_limit(tg, rw); u32 iops_limit = tg_iops_limit(tg, rw); /* * If config is updated while bios are still throttled, calculate and * accumulate how many bytes/ios are waited across changes. And * carryover_bytes/ios will be used to calculate new wait time under new * configuration. */ if (bps_limit != U64_MAX) tg->carryover_bytes[rw] += calculate_bytes_allowed(bps_limit, jiffy_elapsed) - tg->bytes_disp[rw]; if (iops_limit != UINT_MAX) tg->carryover_ios[rw] += calculate_io_allowed(iops_limit, jiffy_elapsed) - tg->io_disp[rw]; } static void tg_update_carryover(struct throtl_grp *tg) { if (tg->service_queue.nr_queued[READ]) __tg_update_carryover(tg, READ); if (tg->service_queue.nr_queued[WRITE]) __tg_update_carryover(tg, WRITE); /* see comments in struct throtl_grp for meaning of these fields. */ throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], tg->carryover_ios[READ], tg->carryover_ios[WRITE]); } static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, u32 iops_limit) { bool rw = bio_data_dir(bio); int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; if (iops_limit == UINT_MAX) { return 0; } jiffy_elapsed = jiffies - tg->slice_start[rw]; /* Round up to the next throttle slice, wait time must be nonzero */ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + tg->carryover_ios[rw]; if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) return 0; /* Calc approx time to dispatch */ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; /* make sure at least one io can be dispatched after waiting */ jiffy_wait = max(jiffy_wait, HZ / iops_limit + 1); return jiffy_wait; } static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, u64 bps_limit) { bool rw = bio_data_dir(bio); long long bytes_allowed; u64 extra_bytes; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); /* no need to throttle if this bio's bytes have been accounted */ if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { return 0; } jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; /* Slice has just started. Consider one slice interval */ if (!jiffy_elapsed) jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + tg->carryover_bytes[rw]; if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) return 0; /* Calc approx time to dispatch */ extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); if (!jiffy_wait) jiffy_wait = 1; /* * This wait time is without taking into consideration the rounding * up we did. Add that time also. */ jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); return jiffy_wait; } /* * Returns whether one can dispatch a bio or not. Also returns approx number * of jiffies to wait before this bio is with-in IO rate and can be dispatched */ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; u64 bps_limit = tg_bps_limit(tg, rw); u32 iops_limit = tg_iops_limit(tg, rw); /* * Currently whole state machine of group depends on first bio * queued in the group bio list. So one should not be calling * this function with a different bio if there are other bios * queued. */ BUG_ON(tg->service_queue.nr_queued[rw] && bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || tg->flags & THROTL_TG_CANCELING) { if (wait) *wait = 0; return true; } /* * If previous slice expired, start a new one otherwise renew/extend * existing slice to make sure it is at least throtl_slice interval * long since now. New slice is started only for empty throttle group. * If there is queued bio, that means there should be an active * slice and it should be extended instead. */ if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) throtl_start_new_slice(tg, rw, true); else { if (time_before(tg->slice_end[rw], jiffies + tg->td->throtl_slice)) throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice); } bps_wait = tg_within_bps_limit(tg, bio, bps_limit); iops_wait = tg_within_iops_limit(tg, bio, iops_limit); if (bps_wait + iops_wait == 0) { if (wait) *wait = 0; return true; } max_wait = max(bps_wait, iops_wait); if (wait) *wait = max_wait; if (time_before(tg->slice_end[rw], jiffies + max_wait)) throtl_extend_slice(tg, rw, jiffies + max_wait); return false; } static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ if (!bio_flagged(bio, BIO_BPS_THROTTLED)) { tg->bytes_disp[rw] += bio_size; tg->last_bytes_disp[rw] += bio_size; } tg->io_disp[rw]++; tg->last_io_disp[rw]++; } /** * throtl_add_bio_tg - add a bio to the specified throtl_grp * @bio: bio to add * @qn: qnode to use * @tg: the target throtl_grp * * Add @bio to @tg's service_queue using @qn. If @qn is not specified, * tg->qnode_on_self[] is used. */ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; bool rw = bio_data_dir(bio); if (!qn) qn = &tg->qnode_on_self[rw]; /* * If @tg doesn't currently have any bios queued in the same * direction, queueing @bio can change when @tg should be * dispatched. Mark that @tg was empty. This is automatically * cleared on the next tg_update_disptime(). */ if (!sq->nr_queued[rw]) tg->flags |= THROTL_TG_WAS_EMPTY; throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); sq->nr_queued[rw]++; throtl_enqueue_tg(tg); } static void tg_update_disptime(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; struct bio *bio; bio = throtl_peek_queued(&sq->queued[READ]); if (bio) tg_may_dispatch(tg, bio, &read_wait); bio = throtl_peek_queued(&sq->queued[WRITE]); if (bio) tg_may_dispatch(tg, bio, &write_wait); min_wait = min(read_wait, write_wait); disptime = jiffies + min_wait; /* Update dispatch time */ throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); tg->disptime = disptime; tg_service_queue_add(tg); /* see throtl_add_bio_tg() */ tg->flags &= ~THROTL_TG_WAS_EMPTY; } static void start_parent_slice_with_credit(struct throtl_grp *child_tg, struct throtl_grp *parent_tg, bool rw) { if (throtl_slice_used(parent_tg, rw)) { throtl_start_new_slice_with_credit(parent_tg, rw, child_tg->slice_start[rw]); } } static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) { struct throtl_service_queue *sq = &tg->service_queue; struct throtl_service_queue *parent_sq = sq->parent_sq; struct throtl_grp *parent_tg = sq_to_tg(parent_sq); struct throtl_grp *tg_to_put = NULL; struct bio *bio; /* * @bio is being transferred from @tg to @parent_sq. Popping a bio * from @tg may put its reference and @parent_sq might end up * getting released prematurely. Remember the tg to put and put it * after @bio is transferred to @parent_sq. */ bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); sq->nr_queued[rw]--; throtl_charge_bio(tg, bio); /* * If our parent is another tg, we just need to transfer @bio to * the parent using throtl_add_bio_tg(). If our parent is * @td->service_queue, @bio is ready to be issued. Put it on its * bio_lists[] and decrease total number queued. The caller is * responsible for issuing these bios. */ if (parent_tg) { throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); start_parent_slice_with_credit(tg, parent_tg, rw); } else { bio_set_flag(bio, BIO_BPS_THROTTLED); throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], &parent_sq->queued[rw]); BUG_ON(tg->td->nr_queued[rw] <= 0); tg->td->nr_queued[rw]--; } throtl_trim_slice(tg, rw); if (tg_to_put) blkg_put(tg_to_blkg(tg_to_put)); } static int throtl_dispatch_tg(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; unsigned int nr_reads = 0, nr_writes = 0; unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; struct bio *bio; /* Try to dispatch 75% READS and 25% WRITES */ while ((bio = throtl_peek_queued(&sq->queued[READ])) && tg_may_dispatch(tg, bio, NULL)) { tg_dispatch_one_bio(tg, READ); nr_reads++; if (nr_reads >= max_nr_reads) break; } while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && tg_may_dispatch(tg, bio, NULL)) { tg_dispatch_one_bio(tg, WRITE); nr_writes++; if (nr_writes >= max_nr_writes) break; } return nr_reads + nr_writes; } static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) { unsigned int nr_disp = 0; while (1) { struct throtl_grp *tg; struct throtl_service_queue *sq; if (!parent_sq->nr_pending) break; tg = throtl_rb_first(parent_sq); if (!tg) break; if (time_before(jiffies, tg->disptime)) break; nr_disp += throtl_dispatch_tg(tg); sq = &tg->service_queue; if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) tg_update_disptime(tg); else throtl_dequeue_tg(tg); if (nr_disp >= THROTL_QUANTUM) break; } return nr_disp; } /** * throtl_pending_timer_fn - timer function for service_queue->pending_timer * @t: the pending_timer member of the throtl_service_queue being serviced * * This timer is armed when a child throtl_grp with active bio's become * pending and queued on the service_queue's pending_tree and expires when * the first child throtl_grp should be dispatched. This function * dispatches bio's from the children throtl_grps to the parent * service_queue. * * If the parent's parent is another throtl_grp, dispatching is propagated * by either arming its pending_timer or repeating dispatch directly. If * the top-level service_tree is reached, throtl_data->dispatch_work is * kicked so that the ready bio's are issued. */ static void throtl_pending_timer_fn(struct timer_list *t) { struct throtl_service_queue *sq = from_timer(sq, t, pending_timer); struct throtl_grp *tg = sq_to_tg(sq); struct throtl_data *td = sq_to_td(sq); struct throtl_service_queue *parent_sq; struct request_queue *q; bool dispatched; int ret; /* throtl_data may be gone, so figure out request queue by blkg */ if (tg) q = tg->pd.blkg->q; else q = td->queue; spin_lock_irq(&q->queue_lock); if (!q->root_blkg) goto out_unlock; again: parent_sq = sq->parent_sq; dispatched = false; while (true) { throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", sq->nr_queued[READ] + sq->nr_queued[WRITE], sq->nr_queued[READ], sq->nr_queued[WRITE]); ret = throtl_select_dispatch(sq); if (ret) { throtl_log(sq, "bios disp=%u", ret); dispatched = true; } if (throtl_schedule_next_dispatch(sq, false)) break; /* this dispatch windows is still open, relax and repeat */ spin_unlock_irq(&q->queue_lock); cpu_relax(); spin_lock_irq(&q->queue_lock); } if (!dispatched) goto out_unlock; if (parent_sq) { /* @parent_sq is another throl_grp, propagate dispatch */ if (tg->flags & THROTL_TG_WAS_EMPTY) { tg_update_disptime(tg); if (!throtl_schedule_next_dispatch(parent_sq, false)) { /* window is already open, repeat dispatching */ sq = parent_sq; tg = sq_to_tg(sq); goto again; } } } else { /* reached the top-level, queue issuing */ queue_work(kthrotld_workqueue, &td->dispatch_work); } out_unlock: spin_unlock_irq(&q->queue_lock); } /** * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work * @work: work item being executed * * This function is queued for execution when bios reach the bio_lists[] * of throtl_data->service_queue. Those bios are ready and issued by this * function. */ static void blk_throtl_dispatch_work_fn(struct work_struct *work) { struct throtl_data *td = container_of(work, struct throtl_data, dispatch_work); struct throtl_service_queue *td_sq = &td->service_queue; struct request_queue *q = td->queue; struct bio_list bio_list_on_stack; struct bio *bio; struct blk_plug plug; int rw; bio_list_init(&bio_list_on_stack); spin_lock_irq(&q->queue_lock); for (rw = READ; rw <= WRITE; rw++) while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(&q->queue_lock); if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) submit_bio_noacct_nocheck(bio); blk_finish_plug(&plug); } } static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); u64 v = *(u64 *)((void *)tg + off); if (v == U64_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); unsigned int v = *(unsigned int *)((void *)tg + off); if (v == UINT_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } static int tg_print_conf_u64(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static int tg_print_conf_uint(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static void tg_conf_updated(struct throtl_grp *tg, bool global) { struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); rcu_read_lock(); /* * Update has_rules[] flags for the updated tg's subtree. A tg is * considered to have rules if either the tg itself or any of its * ancestors has rules. This identifies groups without any * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ blkg_for_each_descendant_pre(blkg, pos_css, global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { struct throtl_grp *this_tg = blkg_to_tg(blkg); tg_update_has_rules(this_tg); /* ignore root/second level */ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || !blkg->parent->parent) continue; } rcu_read_unlock(); /* * We're already holding queue_lock and know @tg is valid. Let's * apply the new config directly. * * Restart the slices for both READ and WRITES. It might happen * that a group's limit are dropped suddenly and we don't want to * account recently dispatched IO with new low rate. */ throtl_start_new_slice(tg, READ, false); throtl_start_new_slice(tg, WRITE, false); if (tg->flags & THROTL_TG_PENDING) { tg_update_disptime(tg); throtl_schedule_next_dispatch(sq->parent_sq, true); } } static int blk_throtl_init(struct gendisk *disk) { struct request_queue *q = disk->queue; struct throtl_data *td; int ret; td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); throtl_service_queue_init(&td->service_queue); /* * Freeze queue before activating policy, to synchronize with IO path, * which is protected by 'q_usage_counter'. */ blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); q->td = td; td->queue = q; /* activate policy */ ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); if (ret) { q->td = NULL; kfree(td); goto out; } if (blk_queue_nonrot(q)) td->throtl_slice = DFL_THROTL_SLICE_SSD; else td->throtl_slice = DFL_THROTL_SLICE_HD; td->track_bio_latency = !queue_is_mq(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); out: blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue); return ret; } static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; int ret; u64 v; blkg_conf_init(&ctx, buf); ret = blkg_conf_open_bdev(&ctx); if (ret) goto out_finish; if (!blk_throtl_activated(ctx.bdev->bd_queue)) { ret = blk_throtl_init(ctx.bdev->bd_disk); if (ret) goto out_finish; } ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; ret = -EINVAL; if (sscanf(ctx.body, "%llu", &v) != 1) goto out_finish; if (!v) v = U64_MAX; tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); if (is_u64) *(u64 *)((void *)tg + of_cft(of)->private) = v; else *(unsigned int *)((void *)tg + of_cft(of)->private) = v; tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); return ret ?: nbytes; } static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return tg_set_conf(of, buf, nbytes, off, true); } static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return tg_set_conf(of, buf, nbytes, off, false); } static int tg_print_rwstat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkg_rwstat_sample sum; blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, &sum); return __blkg_prfill_rwstat(sf, pd, &sum); } static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_rwstat_recursive, &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.io_service_bytes", .private = offsetof(struct throtl_grp, stat_bytes), .seq_show = tg_print_rwstat, }, { .name = "throttle.io_service_bytes_recursive", .private = offsetof(struct throtl_grp, stat_bytes), .seq_show = tg_print_rwstat_recursive, }, { .name = "throttle.io_serviced", .private = offsetof(struct throtl_grp, stat_ios), .seq_show = tg_print_rwstat, }, { .name = "throttle.io_serviced_recursive", .private = offsetof(struct throtl_grp, stat_ios), .seq_show = tg_print_rwstat_recursive, }, { } /* terminate */ }; static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); const char *dname = blkg_dev_name(pd->blkg); u64 bps_dft; unsigned int iops_dft; if (!dname) return 0; bps_dft = U64_MAX; iops_dft = UINT_MAX; if (tg->bps[READ] == bps_dft && tg->bps[WRITE] == bps_dft && tg->iops[READ] == iops_dft && tg->iops[WRITE] == iops_dft) return 0; seq_printf(sf, "%s", dname); if (tg->bps[READ] == U64_MAX) seq_printf(sf, " rbps=max"); else seq_printf(sf, " rbps=%llu", tg->bps[READ]); if (tg->bps[WRITE] == U64_MAX) seq_printf(sf, " wbps=max"); else seq_printf(sf, " wbps=%llu", tg->bps[WRITE]); if (tg->iops[READ] == UINT_MAX) seq_printf(sf, " riops=max"); else seq_printf(sf, " riops=%u", tg->iops[READ]); if (tg->iops[WRITE] == UINT_MAX) seq_printf(sf, " wiops=max"); else seq_printf(sf, " wiops=%u", tg->iops[WRITE]); seq_printf(sf, "\n"); return 0; } static int tg_print_limit(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static ssize_t tg_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; u64 v[4]; int ret; blkg_conf_init(&ctx, buf); ret = blkg_conf_open_bdev(&ctx); if (ret) goto out_finish; if (!blk_throtl_activated(ctx.bdev->bd_queue)) { ret = blk_throtl_init(ctx.bdev->bd_disk); if (ret) goto out_finish; } ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); v[0] = tg->bps[READ]; v[1] = tg->bps[WRITE]; v[2] = tg->iops[READ]; v[3] = tg->iops[WRITE]; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; u64 val = U64_MAX; int len; if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) break; if (tok[0] == '\0') break; ctx.body += len; ret = -EINVAL; p = tok; strsep(&p, "="); if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) goto out_finish; ret = -ERANGE; if (!val) goto out_finish; ret = -EINVAL; if (!strcmp(tok, "rbps")) v[0] = val; else if (!strcmp(tok, "wbps")) v[1] = val; else if (!strcmp(tok, "riops")) v[2] = min_t(u64, val, UINT_MAX); else if (!strcmp(tok, "wiops")) v[3] = min_t(u64, val, UINT_MAX); else goto out_finish; } tg->bps[READ] = v[0]; tg->bps[WRITE] = v[1]; tg->iops[READ] = v[2]; tg->iops[WRITE] = v[3]; tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); return ret ?: nbytes; } static struct cftype throtl_files[] = { { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = tg_print_limit, .write = tg_set_limit, }, { } /* terminate */ }; static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; cancel_work_sync(&td->dispatch_work); } static void tg_flush_bios(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; if (tg->flags & THROTL_TG_CANCELING) return; /* * Set the flag to make sure throtl_pending_timer_fn() won't * stop until all throttled bios are dispatched. */ tg->flags |= THROTL_TG_CANCELING; /* * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup * will be inserted to service queue without THROTL_TG_PENDING * set in tg_update_disptime below. Then IO dispatched from * child in tg_dispatch_one_bio will trigger double insertion * and corrupt the tree. */ if (!(tg->flags & THROTL_TG_PENDING)) return; /* * Update disptime after setting the above flag to make sure * throtl_select_dispatch() won't exit without dispatching. */ tg_update_disptime(tg); throtl_schedule_pending_timer(sq, jiffies + 1); } static void throtl_pd_offline(struct blkg_policy_data *pd) { tg_flush_bios(pd_to_tg(pd)); } struct blkcg_policy blkcg_policy_throtl = { .dfl_cftypes = throtl_files, .legacy_cftypes = throtl_legacy_files, .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, .pd_offline_fn = throtl_pd_offline, .pd_free_fn = throtl_pd_free, }; void blk_throtl_cancel_bios(struct gendisk *disk) { struct request_queue *q = disk->queue; struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; if (!blk_throtl_activated(q)) return; spin_lock_irq(&q->queue_lock); /* * queue_lock is held, rcu lock is not needed here technically. * However, rcu lock is still held to emphasize that following * path need RCU protection and to prevent warning from lockdep. */ rcu_read_lock(); blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { /* * disk_release will call pd_offline_fn to cancel bios. * However, disk_release can't be called if someone get * the refcount of device and issued bios which are * inflight after del_gendisk. * Cancel bios here to ensure no bios are inflight after * del_gendisk. */ tg_flush_bios(blkg_to_tg(blkg)); } rcu_read_unlock(); spin_unlock_irq(&q->queue_lock); } static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) { /* throtl is FIFO - if bios are already queued, should queue */ if (tg->service_queue.nr_queued[rw]) return false; return tg_may_dispatch(tg, bio, NULL); } static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw) { if (!bio_flagged(bio, BIO_BPS_THROTTLED)) tg->carryover_bytes[rw] -= throtl_bio_data_size(bio); tg->carryover_ios[rw]--; } bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; struct throtl_data *td = tg->td; rcu_read_lock(); spin_lock_irq(&q->queue_lock); sq = &tg->service_queue; while (true) { if (tg_within_limit(tg, bio, rw)) { /* within limits, let's charge and dispatch directly */ throtl_charge_bio(tg, bio); /* * We need to trim slice even when bios are not being * queued otherwise it might happen that a bio is not * queued for a long time and slice keeps on extending * and trim is not called for a long time. Now if limits * are reduced suddenly we take into account all the IO * dispatched so far at new low rate and * newly queued * IO gets a really long dispatch time. * * So keep on trimming slice even if bio is not queued. */ throtl_trim_slice(tg, rw); } else if (bio_issue_as_root_blkg(bio)) { /* * IOs which may cause priority inversions are * dispatched directly, even if they're over limit. * Debts are handled by carryover_bytes/ios while * calculating wait time. */ tg_dispatch_in_debt(tg, bio, rw); } else { /* if above limits, break to queue */ break; } /* * @bio passed through this layer without being throttled. * Climb up the ladder. If we're already at the top, it * can be executed directly. */ qn = &tg->qnode_on_parent[rw]; sq = sq->parent_sq; tg = sq_to_tg(sq); if (!tg) { bio_set_flag(bio, BIO_BPS_THROTTLED); goto out_unlock; } } /* out-of-limit, queue to @tg */ throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", rw == READ ? 'R' : 'W', tg->bytes_disp[rw], bio->bi_iter.bi_size, tg_bps_limit(tg, rw), tg->io_disp[rw], tg_iops_limit(tg, rw), sq->nr_queued[READ], sq->nr_queued[WRITE]); td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; /* * Update @tg's dispatch time and force schedule dispatch if @tg * was empty before @bio. The forced scheduling isn't likely to * cause undue delay as @bio is likely to be dispatched directly if * its @tg's disptime is not in the future. */ if (tg->flags & THROTL_TG_WAS_EMPTY) { tg_update_disptime(tg); throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); } out_unlock: spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); return throttled; } void blk_throtl_exit(struct gendisk *disk) { struct request_queue *q = disk->queue; if (!blk_throtl_activated(q)) return; del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(disk, &blkcg_policy_throtl); kfree(q->td); } static int __init throtl_init(void) { kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); if (!kthrotld_workqueue) panic("Failed to create kthrotld\n"); return blkcg_policy_register(&blkcg_policy_throtl); } module_init(throtl_init);
2 2 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 // SPDX-License-Identifier: GPL-2.0-or-later #include <net/genetlink.h> #include "br_private.h" #include "br_private_cfm.h" static const struct nla_policy br_cfm_mep_create_policy[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1] = { [IFLA_BRIDGE_CFM_MEP_CREATE_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX] = { .type = NLA_U32 }, }; static const struct nla_policy br_cfm_mep_delete_policy[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1] = { [IFLA_BRIDGE_CFM_MEP_DELETE_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE] = { .type = NLA_U32 }, }; static const struct nla_policy br_cfm_mep_config_policy[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1] = { [IFLA_BRIDGE_CFM_MEP_CONFIG_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC] = NLA_POLICY_ETH_ADDR, [IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL] = NLA_POLICY_MAX(NLA_U32, 7), [IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF), }; static const struct nla_policy br_cfm_cc_config_policy[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1] = { [IFLA_BRIDGE_CFM_CC_CONFIG_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID] = { .type = NLA_BINARY, .len = CFM_MAID_LENGTH }, }; static const struct nla_policy br_cfm_cc_peer_mep_policy[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1] = { [IFLA_BRIDGE_CFM_CC_PEER_MEP_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_PEER_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF), }; static const struct nla_policy br_cfm_cc_rdi_policy[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1] = { [IFLA_BRIDGE_CFM_CC_RDI_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_CC_RDI_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_RDI_RDI] = { .type = NLA_U32 }, }; static const struct nla_policy br_cfm_cc_ccm_tx_policy[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1] = { [IFLA_BRIDGE_CFM_CC_CCM_TX_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC] = NLA_POLICY_ETH_ADDR, [IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE] = { .type = NLA_U8 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE] = { .type = NLA_U8 }, }; static const struct nla_policy br_cfm_policy[IFLA_BRIDGE_CFM_MAX + 1] = { [IFLA_BRIDGE_CFM_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_MEP_CREATE] = NLA_POLICY_NESTED(br_cfm_mep_create_policy), [IFLA_BRIDGE_CFM_MEP_DELETE] = NLA_POLICY_NESTED(br_cfm_mep_delete_policy), [IFLA_BRIDGE_CFM_MEP_CONFIG] = NLA_POLICY_NESTED(br_cfm_mep_config_policy), [IFLA_BRIDGE_CFM_CC_CONFIG] = NLA_POLICY_NESTED(br_cfm_cc_config_policy), [IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD] = NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy), [IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE] = NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy), [IFLA_BRIDGE_CFM_CC_RDI] = NLA_POLICY_NESTED(br_cfm_cc_rdi_policy), [IFLA_BRIDGE_CFM_CC_CCM_TX] = NLA_POLICY_NESTED(br_cfm_cc_ccm_tx_policy), }; static int br_mep_create_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1]; struct br_cfm_mep_create create; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CREATE_MAX, attr, br_cfm_mep_create_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]) { NL_SET_ERR_MSG_MOD(extack, "Missing DOMAIN attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]) { NL_SET_ERR_MSG_MOD(extack, "Missing DIRECTION attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]) { NL_SET_ERR_MSG_MOD(extack, "Missing IFINDEX attribute"); return -EINVAL; } memset(&create, 0, sizeof(create)); instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]); create.domain = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]); create.direction = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]); create.ifindex = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]); return br_cfm_mep_create(br, instance, &create, extack); } static int br_mep_delete_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1]; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_DELETE_MAX, attr, br_cfm_mep_delete_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]); return br_cfm_mep_delete(br, instance, extack); } static int br_mep_config_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1]; struct br_cfm_mep_config config; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CONFIG_MAX, attr, br_cfm_mep_config_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC]) { NL_SET_ERR_MSG_MOD(extack, "Missing UNICAST_MAC attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]) { NL_SET_ERR_MSG_MOD(extack, "Missing MDLEVEL attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]) { NL_SET_ERR_MSG_MOD(extack, "Missing MEPID attribute"); return -EINVAL; } memset(&config, 0, sizeof(config)); instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]); nla_memcpy(&config.unicast_mac.addr, tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC], sizeof(config.unicast_mac.addr)); config.mdlevel = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]); config.mepid = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]); return br_cfm_mep_config_set(br, instance, &config, extack); } static int br_cc_config_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1]; struct br_cfm_cc_config config; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CONFIG_MAX, attr, br_cfm_cc_config_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]) { NL_SET_ERR_MSG_MOD(extack, "Missing ENABLE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]) { NL_SET_ERR_MSG_MOD(extack, "Missing INTERVAL attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID]) { NL_SET_ERR_MSG_MOD(extack, "Missing MAID attribute"); return -EINVAL; } memset(&config, 0, sizeof(config)); instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]); config.enable = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]); config.exp_interval = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]); nla_memcpy(&config.exp_maid.data, tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID], sizeof(config.exp_maid.data)); return br_cfm_cc_config_set(br, instance, &config, extack); } static int br_cc_peer_mep_add_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1]; u32 instance, peer_mep_id; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr, br_cfm_cc_peer_mep_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) { NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute"); return -EINVAL; } instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]); peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]); return br_cfm_cc_peer_mep_add(br, instance, peer_mep_id, extack); } static int br_cc_peer_mep_remove_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1]; u32 instance, peer_mep_id; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr, br_cfm_cc_peer_mep_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) { NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute"); return -EINVAL; } instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]); peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]); return br_cfm_cc_peer_mep_remove(br, instance, peer_mep_id, extack); } static int br_cc_rdi_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1]; u32 instance, rdi; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_RDI_MAX, attr, br_cfm_cc_rdi_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]) { NL_SET_ERR_MSG_MOD(extack, "Missing RDI attribute"); return -EINVAL; } instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]); rdi = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]); return br_cfm_cc_rdi_set(br, instance, rdi, extack); } static int br_cc_ccm_tx_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1]; struct br_cfm_cc_ccm_tx_info tx_info; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CCM_TX_MAX, attr, br_cfm_cc_ccm_tx_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC]) { NL_SET_ERR_MSG_MOD(extack, "Missing DMAC attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]) { NL_SET_ERR_MSG_MOD(extack, "Missing SEQ_NO_UPDATE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]) { NL_SET_ERR_MSG_MOD(extack, "Missing PERIOD attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]) { NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]) { NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV_VALUE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]) { NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]) { NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV_VALUE attribute"); return -EINVAL; } memset(&tx_info, 0, sizeof(tx_info)); instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]); nla_memcpy(&tx_info.dmac.addr, tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC], sizeof(tx_info.dmac.addr)); tx_info.seq_no_update = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]); tx_info.period = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]); tx_info.if_tlv = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]); tx_info.if_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]); tx_info.port_tlv = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]); tx_info.port_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]); return br_cfm_cc_ccm_tx(br, instance, &tx_info, extack); } int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_MAX + 1]; int err; /* When this function is called for a port then the br pointer is * invalid, therefor set the br to point correctly */ if (p) br = p->br; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MAX, attr, br_cfm_policy, extack); if (err) return err; if (tb[IFLA_BRIDGE_CFM_MEP_CREATE]) { err = br_mep_create_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CREATE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_MEP_DELETE]) { err = br_mep_delete_parse(br, tb[IFLA_BRIDGE_CFM_MEP_DELETE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_MEP_CONFIG]) { err = br_mep_config_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CONFIG], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_CONFIG]) { err = br_cc_config_parse(br, tb[IFLA_BRIDGE_CFM_CC_CONFIG], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD]) { err = br_cc_peer_mep_add_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE]) { err = br_cc_peer_mep_remove_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_RDI]) { err = br_cc_rdi_parse(br, tb[IFLA_BRIDGE_CFM_CC_RDI], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_CCM_TX]) { err = br_cc_ccm_tx_parse(br, tb[IFLA_BRIDGE_CFM_CC_CCM_TX], extack); if (err) return err; } return 0; } int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; struct nlattr *tb; hlist_for_each_entry_rcu(mep, &br->mep_list, head) { tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN, mep->create.domain)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION, mep->create.direction)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX, mep->create.ifindex)) goto nla_put_failure; nla_nest_end(skb, tb); tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC, sizeof(mep->config.unicast_mac.addr), mep->config.unicast_mac.addr)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL, mep->config.mdlevel)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID, mep->config.mepid)) goto nla_put_failure; nla_nest_end(skb, tb); tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE, mep->cc_config.enable)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL, mep->cc_config.exp_interval)) goto nla_put_failure; if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID, sizeof(mep->cc_config.exp_maid.data), mep->cc_config.exp_maid.data)) goto nla_put_failure; nla_nest_end(skb, tb); tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_RDI_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_RDI, mep->rdi)) goto nla_put_failure; nla_nest_end(skb, tb); tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC, sizeof(mep->cc_ccm_tx_info.dmac), mep->cc_ccm_tx_info.dmac.addr)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE, mep->cc_ccm_tx_info.seq_no_update)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD, mep->cc_ccm_tx_info.period)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV, mep->cc_ccm_tx_info.if_tlv)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE, mep->cc_ccm_tx_info.if_tlv_value)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV, mep->cc_ccm_tx_info.port_tlv)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE, mep->cc_ccm_tx_info.port_tlv_value)) goto nla_put_failure; nla_nest_end(skb, tb); hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) { tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_PEER_MEP_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_MEPID, peer_mep->mepid)) goto nla_put_failure; nla_nest_end(skb, tb); } } return 0; nla_put_failure: nla_nest_cancel(skb, tb); nla_info_failure: return -EMSGSIZE; } int br_cfm_status_fill_info(struct sk_buff *skb, struct net_bridge *br, bool getlink) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; struct nlattr *tb; hlist_for_each_entry_rcu(mep, &br->mep_list, head) { tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN, mep->status.opcode_unexp_seen)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN, mep->status.version_unexp_seen)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN, mep->status.rx_level_low_seen)) goto nla_put_failure; /* Only clear if this is a GETLINK */ if (getlink) { /* Clear all 'seen' indications */ mep->status.opcode_unexp_seen = false; mep->status.version_unexp_seen = false; mep->status.rx_level_low_seen = false; } nla_nest_end(skb, tb); hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) { tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID, peer_mep->mepid)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT, peer_mep->cc_status.ccm_defect)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI, peer_mep->cc_status.rdi)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE, peer_mep->cc_status.port_tlv_value)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE, peer_mep->cc_status.if_tlv_value)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN, peer_mep->cc_status.seen)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN, peer_mep->cc_status.tlv_seen)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN, peer_mep->cc_status.seq_unexp_seen)) goto nla_put_failure; if (getlink) { /* Only clear if this is a GETLINK */ /* Clear all 'seen' indications */ peer_mep->cc_status.seen = false; peer_mep->cc_status.tlv_seen = false; peer_mep->cc_status.seq_unexp_seen = false; } nla_nest_end(skb, tb); } } return 0; nla_put_failure: nla_nest_cancel(skb, tb); nla_info_failure: return -EMSGSIZE; }
609 608 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 // SPDX-License-Identifier: GPL-2.0 /* * property.c - Unified device property interface. * * Copyright (C) 2014, Intel Corporation * Authors: Rafael J. Wysocki <rafael.j.wysocki@intel.com> * Mika Westerberg <mika.westerberg@linux.intel.com> */ #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kconfig.h> #include <linux/of.h> #include <linux/property.h> #include <linux/phy.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> struct fwnode_handle *__dev_fwnode(struct device *dev) { return IS_ENABLED(CONFIG_OF) && dev->of_node ? of_fwnode_handle(dev->of_node) : dev->fwnode; } EXPORT_SYMBOL_GPL(__dev_fwnode); const struct fwnode_handle *__dev_fwnode_const(const struct device *dev) { return IS_ENABLED(CONFIG_OF) && dev->of_node ? of_fwnode_handle(dev->of_node) : dev->fwnode; } EXPORT_SYMBOL_GPL(__dev_fwnode_const); /** * device_property_present - check if a property of a device is present * @dev: Device whose property is being checked * @propname: Name of the property * * Check if property @propname is present in the device firmware description. * * Return: true if property @propname is present. Otherwise, returns false. */ bool device_property_present(const struct device *dev, const char *propname) { return fwnode_property_present(dev_fwnode(dev), propname); } EXPORT_SYMBOL_GPL(device_property_present); /** * fwnode_property_present - check if a property of a firmware node is present * @fwnode: Firmware node whose property to check * @propname: Name of the property * * Return: true if property @propname is present. Otherwise, returns false. */ bool fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname) { bool ret; if (IS_ERR_OR_NULL(fwnode)) return false; ret = fwnode_call_bool_op(fwnode, property_present, propname); if (ret) return ret; return fwnode_call_bool_op(fwnode->secondary, property_present, propname); } EXPORT_SYMBOL_GPL(fwnode_property_present); /** * device_property_read_bool - Return the value for a boolean property of a device * @dev: Device whose property is being checked * @propname: Name of the property * * Return if property @propname is true or false in the device firmware description. * * Return: true if property @propname is present. Otherwise, returns false. */ bool device_property_read_bool(const struct device *dev, const char *propname) { return fwnode_property_read_bool(dev_fwnode(dev), propname); } EXPORT_SYMBOL_GPL(device_property_read_bool); /** * fwnode_property_read_bool - Return the value for a boolean property of a firmware node * @fwnode: Firmware node whose property to check * @propname: Name of the property * * Return if property @propname is true or false in the firmware description. */ bool fwnode_property_read_bool(const struct fwnode_handle *fwnode, const char *propname) { bool ret; if (IS_ERR_OR_NULL(fwnode)) return false; ret = fwnode_call_bool_op(fwnode, property_read_bool, propname); if (ret) return ret; return fwnode_call_bool_op(fwnode->secondary, property_read_bool, propname); } EXPORT_SYMBOL_GPL(fwnode_property_read_bool); /** * device_property_read_u8_array - return a u8 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u8 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u8() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u8_array(const struct device *dev, const char *propname, u8 *val, size_t nval) { return fwnode_property_read_u8_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u8_array); /** * device_property_read_u16_array - return a u16 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u16 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u16() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u16_array(const struct device *dev, const char *propname, u16 *val, size_t nval) { return fwnode_property_read_u16_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u16_array); /** * device_property_read_u32_array - return a u32 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u32 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u32() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u32_array(const struct device *dev, const char *propname, u32 *val, size_t nval) { return fwnode_property_read_u32_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u32_array); /** * device_property_read_u64_array - return a u64 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u64 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u64() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u64_array(const struct device *dev, const char *propname, u64 *val, size_t nval) { return fwnode_property_read_u64_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u64_array); /** * device_property_read_string_array - return a string array property of device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of string properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_string_array_count() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values read on success if @val is non-NULL, * number of values available on success if @val is NULL, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not an array of strings, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_string_array(const struct device *dev, const char *propname, const char **val, size_t nval) { return fwnode_property_read_string_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_string_array); /** * device_property_read_string - return a string property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The value is stored here * * Function reads property @propname from the device firmware description and * stores the value into @val if found. The value is checked to be a string. * * Return: %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property type is not a string. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_string(const struct device *dev, const char *propname, const char **val) { return fwnode_property_read_string(dev_fwnode(dev), propname, val); } EXPORT_SYMBOL_GPL(device_property_read_string); /** * device_property_match_string - find a string in an array and return index * @dev: Device to get the property of * @propname: Name of the property holding the array * @string: String to look for * * Find a given string in a string array and if it is found return the * index back. * * Return: index, starting from %0, if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of strings, * %-ENXIO if no suitable firmware interface is present. */ int device_property_match_string(const struct device *dev, const char *propname, const char *string) { return fwnode_property_match_string(dev_fwnode(dev), propname, string); } EXPORT_SYMBOL_GPL(device_property_match_string); static int fwnode_property_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -EINVAL; ret = fwnode_call_int_op(fwnode, property_read_int_array, propname, elem_size, val, nval); if (ret != -EINVAL) return ret; return fwnode_call_int_op(fwnode->secondary, property_read_int_array, propname, elem_size, val, nval); } /** * fwnode_property_read_u8_array - return a u8 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u8 properties with @propname from @fwnode and stores them to * @val if found. * * It's recommended to call fwnode_property_count_u8() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u8_array(const struct fwnode_handle *fwnode, const char *propname, u8 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u8), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u8_array); /** * fwnode_property_read_u16_array - return a u16 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u16 properties with @propname from @fwnode and store them to * @val if found. * * It's recommended to call fwnode_property_count_u16() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u16_array(const struct fwnode_handle *fwnode, const char *propname, u16 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u16), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u16_array); /** * fwnode_property_read_u32_array - return a u32 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u32 properties with @propname from @fwnode store them to * @val if found. * * It's recommended to call fwnode_property_count_u32() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u32_array(const struct fwnode_handle *fwnode, const char *propname, u32 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u32), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u32_array); /** * fwnode_property_read_u64_array - return a u64 array property firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u64 properties with @propname from @fwnode and store them to * @val if found. * * It's recommended to call fwnode_property_count_u64() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u64_array(const struct fwnode_handle *fwnode, const char *propname, u64 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u64), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u64_array); /** * fwnode_property_read_string_array - return string array property of a node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an string list property @propname from the given firmware node and store * them to @val if found. * * It's recommended to call fwnode_property_string_array_count() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values read on success if @val is non-NULL, * number of values available on success if @val is NULL, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not an array of strings, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -EINVAL; ret = fwnode_call_int_op(fwnode, property_read_string_array, propname, val, nval); if (ret != -EINVAL) return ret; return fwnode_call_int_op(fwnode->secondary, property_read_string_array, propname, val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_string_array); /** * fwnode_property_read_string - return a string property of a firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The value is stored here * * Read property @propname from the given firmware node and store the value into * @val if found. The value is checked to be a string. * * Return: %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not a string, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_string(const struct fwnode_handle *fwnode, const char *propname, const char **val) { int ret = fwnode_property_read_string_array(fwnode, propname, val, 1); return ret < 0 ? ret : 0; } EXPORT_SYMBOL_GPL(fwnode_property_read_string); /** * fwnode_property_match_string - find a string in an array and return index * @fwnode: Firmware node to get the property of * @propname: Name of the property holding the array * @string: String to look for * * Find a given string in a string array and if it is found return the * index back. * * Return: index, starting from %0, if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of strings, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_match_string(const struct fwnode_handle *fwnode, const char *propname, const char *string) { const char **values; int nval, ret; nval = fwnode_property_string_array_count(fwnode, propname); if (nval < 0) return nval; if (nval == 0) return -ENODATA; values = kcalloc(nval, sizeof(*values), GFP_KERNEL); if (!values) return -ENOMEM; ret = fwnode_property_read_string_array(fwnode, propname, values, nval); if (ret < 0) goto out_free; ret = match_string(values, nval, string); if (ret < 0) ret = -ENODATA; out_free: kfree(values); return ret; } EXPORT_SYMBOL_GPL(fwnode_property_match_string); /** * fwnode_property_match_property_string - find a property string value in an array and return index * @fwnode: Firmware node to get the property of * @propname: Name of the property holding the string value * @array: String array to search in * @n: Size of the @array * * Find a property string value in a given @array and if it is found return * the index back. * * Return: index, starting from %0, if the string value was found in the @array (success), * %-ENOENT when the string value was not found in the @array, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not a string, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_match_property_string(const struct fwnode_handle *fwnode, const char *propname, const char * const *array, size_t n) { const char *string; int ret; ret = fwnode_property_read_string(fwnode, propname, &string); if (ret) return ret; ret = match_string(array, n, string); if (ret < 0) ret = -ENOENT; return ret; } EXPORT_SYMBOL_GPL(fwnode_property_match_property_string); /** * fwnode_property_get_reference_args() - Find a reference with arguments * @fwnode: Firmware node where to look for the reference * @prop: The name of the property * @nargs_prop: The name of the property telling the number of * arguments in the referred node. NULL if @nargs is known, * otherwise @nargs is ignored. Only relevant on OF. * @nargs: Number of arguments. Ignored if @nargs_prop is non-NULL. * @index: Index of the reference, from zero onwards. * @args: Result structure with reference and integer arguments. * May be NULL. * * Obtain a reference based on a named property in an fwnode, with * integer arguments. * * The caller is responsible for calling fwnode_handle_put() on the returned * @args->fwnode pointer. * * Return: %0 on success * %-ENOENT when the index is out of bounds, the index has an empty * reference or the property was not found * %-EINVAL on parse error */ int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode, const char *prop, const char *nargs_prop, unsigned int nargs, unsigned int index, struct fwnode_reference_args *args) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -ENOENT; ret = fwnode_call_int_op(fwnode, get_reference_args, prop, nargs_prop, nargs, index, args); if (ret == 0) return ret; if (IS_ERR_OR_NULL(fwnode->secondary)) return ret; return fwnode_call_int_op(fwnode->secondary, get_reference_args, prop, nargs_prop, nargs, index, args); } EXPORT_SYMBOL_GPL(fwnode_property_get_reference_args); /** * fwnode_find_reference - Find named reference to a fwnode_handle * @fwnode: Firmware node where to look for the reference * @name: The name of the reference * @index: Index of the reference * * @index can be used when the named reference holds a table of references. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: a pointer to the reference fwnode, when found. Otherwise, * returns an error pointer. */ struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode, const char *name, unsigned int index) { struct fwnode_reference_args args; int ret; ret = fwnode_property_get_reference_args(fwnode, name, NULL, 0, index, &args); return ret ? ERR_PTR(ret) : args.fwnode; } EXPORT_SYMBOL_GPL(fwnode_find_reference); /** * fwnode_get_name - Return the name of a node * @fwnode: The firmware node * * Return: a pointer to the node name, or %NULL. */ const char *fwnode_get_name(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_name); } EXPORT_SYMBOL_GPL(fwnode_get_name); /** * fwnode_get_name_prefix - Return the prefix of node for printing purposes * @fwnode: The firmware node * * Return: the prefix of a node, intended to be printed right before the node. * The prefix works also as a separator between the nodes. */ const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_name_prefix); } /** * fwnode_name_eq - Return true if node name is equal * @fwnode: The firmware node * @name: The name to which to compare the node name * * Compare the name provided as an argument to the name of the node, stopping * the comparison at either NUL or '@' character, whichever comes first. This * function is generally used for comparing node names while ignoring the * possible unit address of the node. * * Return: true if the node name matches with the name provided in the @name * argument, false otherwise. */ bool fwnode_name_eq(const struct fwnode_handle *fwnode, const char *name) { const char *node_name; ptrdiff_t len; node_name = fwnode_get_name(fwnode); if (!node_name) return false; len = strchrnul(node_name, '@') - node_name; return str_has_prefix(node_name, name) == len; } EXPORT_SYMBOL_GPL(fwnode_name_eq); /** * fwnode_get_parent - Return parent firwmare node * @fwnode: Firmware whose parent is retrieved * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: parent firmware node of the given node if possible or %NULL if no * parent was available. */ struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_parent); } EXPORT_SYMBOL_GPL(fwnode_get_parent); /** * fwnode_get_next_parent - Iterate to the node's parent * @fwnode: Firmware whose parent is retrieved * * This is like fwnode_get_parent() except that it drops the refcount * on the passed node, making it suitable for iterating through a * node's parents. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @fwnode * unconditionally. * * Return: parent firmware node of the given node if possible or %NULL if no * parent was available. */ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode) { struct fwnode_handle *parent = fwnode_get_parent(fwnode); fwnode_handle_put(fwnode); return parent; } EXPORT_SYMBOL_GPL(fwnode_get_next_parent); /** * fwnode_count_parents - Return the number of parents a node has * @fwnode: The node the parents of which are to be counted * * Return: the number of parents a node has. */ unsigned int fwnode_count_parents(const struct fwnode_handle *fwnode) { struct fwnode_handle *parent; unsigned int count = 0; fwnode_for_each_parent_node(fwnode, parent) count++; return count; } EXPORT_SYMBOL_GPL(fwnode_count_parents); /** * fwnode_get_nth_parent - Return an nth parent of a node * @fwnode: The node the parent of which is requested * @depth: Distance of the parent from the node * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the nth parent of a node. If there is no parent at the requested * @depth, %NULL is returned. If @depth is 0, the functionality is equivalent to * fwnode_handle_get(). For @depth == 1, it is fwnode_get_parent() and so on. */ struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwnode, unsigned int depth) { struct fwnode_handle *parent; if (depth == 0) return fwnode_handle_get(fwnode); fwnode_for_each_parent_node(fwnode, parent) { if (--depth == 0) return parent; } return NULL; } EXPORT_SYMBOL_GPL(fwnode_get_nth_parent); /** * fwnode_get_next_child_node - Return the next child node handle for a node * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the node's child nodes or a %NULL handle. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @child * unconditionally. */ struct fwnode_handle * fwnode_get_next_child_node(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { return fwnode_call_ptr_op(fwnode, get_next_child_node, child); } EXPORT_SYMBOL_GPL(fwnode_get_next_child_node); /** * fwnode_get_next_available_child_node - Return the next available child node handle for a node * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the node's child nodes or a %NULL handle. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @child * unconditionally. */ struct fwnode_handle * fwnode_get_next_available_child_node(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { struct fwnode_handle *next_child = child; if (IS_ERR_OR_NULL(fwnode)) return NULL; do { next_child = fwnode_get_next_child_node(fwnode, next_child); if (!next_child) return NULL; } while (!fwnode_device_is_available(next_child)); return next_child; } EXPORT_SYMBOL_GPL(fwnode_get_next_available_child_node); /** * device_get_next_child_node - Return the next child node handle for a device * @dev: Device to find the next child node for. * @child: Handle to one of the device's child nodes or a %NULL handle. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @child * unconditionally. */ struct fwnode_handle *device_get_next_child_node(const struct device *dev, struct fwnode_handle *child) { const struct fwnode_handle *fwnode = dev_fwnode(dev); struct fwnode_handle *next; if (IS_ERR_OR_NULL(fwnode)) return NULL; /* Try to find a child in primary fwnode */ next = fwnode_get_next_child_node(fwnode, child); if (next) return next; /* When no more children in primary, continue with secondary */ return fwnode_get_next_child_node(fwnode->secondary, child); } EXPORT_SYMBOL_GPL(device_get_next_child_node); /** * fwnode_get_named_child_node - Return first matching named child node handle * @fwnode: Firmware node to find the named child node for. * @childname: String to match child node name against. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { return fwnode_call_ptr_op(fwnode, get_named_child_node, childname); } EXPORT_SYMBOL_GPL(fwnode_get_named_child_node); /** * device_get_named_child_node - Return first matching named child node handle * @dev: Device to find the named child node for. * @childname: String to match child node name against. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle *device_get_named_child_node(const struct device *dev, const char *childname) { return fwnode_get_named_child_node(dev_fwnode(dev), childname); } EXPORT_SYMBOL_GPL(device_get_named_child_node); /** * fwnode_handle_get - Obtain a reference to a device node * @fwnode: Pointer to the device node to obtain the reference to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the fwnode handle. */ struct fwnode_handle *fwnode_handle_get(struct fwnode_handle *fwnode) { if (!fwnode_has_op(fwnode, get)) return fwnode; return fwnode_call_ptr_op(fwnode, get); } EXPORT_SYMBOL_GPL(fwnode_handle_get); /** * fwnode_device_is_available - check if a device is available for use * @fwnode: Pointer to the fwnode of the device. * * Return: true if device is available for use. Otherwise, returns false. * * For fwnode node types that don't implement the .device_is_available() * operation, this function returns true. */ bool fwnode_device_is_available(const struct fwnode_handle *fwnode) { if (IS_ERR_OR_NULL(fwnode)) return false; if (!fwnode_has_op(fwnode, device_is_available)) return true; return fwnode_call_bool_op(fwnode, device_is_available); } EXPORT_SYMBOL_GPL(fwnode_device_is_available); /** * device_get_child_node_count - return the number of child nodes for device * @dev: Device to count the child nodes for * * Return: the number of child nodes for a given device. */ unsigned int device_get_child_node_count(const struct device *dev) { struct fwnode_handle *child; unsigned int count = 0; device_for_each_child_node(dev, child) count++; return count; } EXPORT_SYMBOL_GPL(device_get_child_node_count); bool device_dma_supported(const struct device *dev) { return fwnode_call_bool_op(dev_fwnode(dev), device_dma_supported); } EXPORT_SYMBOL_GPL(device_dma_supported); enum dev_dma_attr device_get_dma_attr(const struct device *dev) { if (!fwnode_has_op(dev_fwnode(dev), device_get_dma_attr)) return DEV_DMA_NOT_SUPPORTED; return fwnode_call_int_op(dev_fwnode(dev), device_get_dma_attr); } EXPORT_SYMBOL_GPL(device_get_dma_attr); /** * fwnode_get_phy_mode - Get phy mode for given firmware node * @fwnode: Pointer to the given node * * The function gets phy interface string from property 'phy-mode' or * 'phy-connection-type', and return its index in phy_modes table, or errno in * error case. */ int fwnode_get_phy_mode(const struct fwnode_handle *fwnode) { const char *pm; int err, i; err = fwnode_property_read_string(fwnode, "phy-mode", &pm); if (err < 0) err = fwnode_property_read_string(fwnode, "phy-connection-type", &pm); if (err < 0) return err; for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++) if (!strcasecmp(pm, phy_modes(i))) return i; return -ENODEV; } EXPORT_SYMBOL_GPL(fwnode_get_phy_mode); /** * device_get_phy_mode - Get phy mode for given device * @dev: Pointer to the given device * * The function gets phy interface string from property 'phy-mode' or * 'phy-connection-type', and return its index in phy_modes table, or errno in * error case. */ int device_get_phy_mode(struct device *dev) { return fwnode_get_phy_mode(dev_fwnode(dev)); } EXPORT_SYMBOL_GPL(device_get_phy_mode); /** * fwnode_iomap - Maps the memory mapped IO for a given fwnode * @fwnode: Pointer to the firmware node * @index: Index of the IO range * * Return: a pointer to the mapped memory. */ void __iomem *fwnode_iomap(struct fwnode_handle *fwnode, int index) { return fwnode_call_ptr_op(fwnode, iomap, index); } EXPORT_SYMBOL(fwnode_iomap); /** * fwnode_irq_get - Get IRQ directly from a fwnode * @fwnode: Pointer to the firmware node * @index: Zero-based index of the IRQ * * Return: Linux IRQ number on success. Negative errno on failure. */ int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index) { int ret; ret = fwnode_call_int_op(fwnode, irq_get, index); /* We treat mapping errors as invalid case */ if (ret == 0) return -EINVAL; return ret; } EXPORT_SYMBOL(fwnode_irq_get); /** * fwnode_irq_get_byname - Get IRQ from a fwnode using its name * @fwnode: Pointer to the firmware node * @name: IRQ name * * Description: * Find a match to the string @name in the 'interrupt-names' string array * in _DSD for ACPI, or of_node for Device Tree. Then get the Linux IRQ * number of the IRQ resource corresponding to the index of the matched * string. * * Return: Linux IRQ number on success, or negative errno otherwise. */ int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name) { int index; if (!name) return -EINVAL; index = fwnode_property_match_string(fwnode, "interrupt-names", name); if (index < 0) return index; return fwnode_irq_get(fwnode, index); } EXPORT_SYMBOL(fwnode_irq_get_byname); /** * fwnode_graph_get_next_endpoint - Get next endpoint firmware node * @fwnode: Pointer to the parent firmware node * @prev: Previous endpoint node or %NULL to get the first * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @prev * unconditionally. * * Return: an endpoint firmware node pointer or %NULL if no more endpoints * are available. */ struct fwnode_handle * fwnode_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { struct fwnode_handle *ep, *port_parent = NULL; const struct fwnode_handle *parent; /* * If this function is in a loop and the previous iteration returned * an endpoint from fwnode->secondary, then we need to use the secondary * as parent rather than @fwnode. */ if (prev) { port_parent = fwnode_graph_get_port_parent(prev); parent = port_parent; } else { parent = fwnode; } if (IS_ERR_OR_NULL(parent)) return NULL; ep = fwnode_call_ptr_op(parent, graph_get_next_endpoint, prev); if (ep) goto out_put_port_parent; ep = fwnode_graph_get_next_endpoint(parent->secondary, NULL); out_put_port_parent: fwnode_handle_put(port_parent); return ep; } EXPORT_SYMBOL_GPL(fwnode_graph_get_next_endpoint); /** * fwnode_graph_get_port_parent - Return the device fwnode of a port endpoint * @endpoint: Endpoint firmware node of the port * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the firmware node of the device the @endpoint belongs to. */ struct fwnode_handle * fwnode_graph_get_port_parent(const struct fwnode_handle *endpoint) { struct fwnode_handle *port, *parent; port = fwnode_get_parent(endpoint); parent = fwnode_call_ptr_op(port, graph_get_port_parent); fwnode_handle_put(port); return parent; } EXPORT_SYMBOL_GPL(fwnode_graph_get_port_parent); /** * fwnode_graph_get_remote_port_parent - Return fwnode of a remote device * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote device the @fwnode points to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_graph_get_remote_port_parent(const struct fwnode_handle *fwnode) { struct fwnode_handle *endpoint, *parent; endpoint = fwnode_graph_get_remote_endpoint(fwnode); parent = fwnode_graph_get_port_parent(endpoint); fwnode_handle_put(endpoint); return parent; } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port_parent); /** * fwnode_graph_get_remote_port - Return fwnode of a remote port * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote port the @fwnode points to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_graph_get_remote_port(const struct fwnode_handle *fwnode) { return fwnode_get_next_parent(fwnode_graph_get_remote_endpoint(fwnode)); } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port); /** * fwnode_graph_get_remote_endpoint - Return fwnode of a remote endpoint * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote endpoint the @fwnode points to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_graph_get_remote_endpoint(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, graph_get_remote_endpoint); } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_endpoint); static bool fwnode_graph_remote_available(struct fwnode_handle *ep) { struct fwnode_handle *dev_node; bool available; dev_node = fwnode_graph_get_remote_port_parent(ep); available = fwnode_device_is_available(dev_node); fwnode_handle_put(dev_node); return available; } /** * fwnode_graph_get_endpoint_by_id - get endpoint by port and endpoint numbers * @fwnode: parent fwnode_handle containing the graph * @port: identifier of the port node * @endpoint: identifier of the endpoint node under the port node * @flags: fwnode lookup flags * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the fwnode handle of the local endpoint corresponding the port and * endpoint IDs or %NULL if not found. * * If FWNODE_GRAPH_ENDPOINT_NEXT is passed in @flags and the specified endpoint * has not been found, look for the closest endpoint ID greater than the * specified one and return the endpoint that corresponds to it, if present. * * Does not return endpoints that belong to disabled devices or endpoints that * are unconnected, unless FWNODE_GRAPH_DEVICE_DISABLED is passed in @flags. */ struct fwnode_handle * fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode, u32 port, u32 endpoint, unsigned long flags) { struct fwnode_handle *ep, *best_ep = NULL; unsigned int best_ep_id = 0; bool endpoint_next = flags & FWNODE_GRAPH_ENDPOINT_NEXT; bool enabled_only = !(flags & FWNODE_GRAPH_DEVICE_DISABLED); fwnode_graph_for_each_endpoint(fwnode, ep) { struct fwnode_endpoint fwnode_ep = { 0 }; int ret; if (enabled_only && !fwnode_graph_remote_available(ep)) continue; ret = fwnode_graph_parse_endpoint(ep, &fwnode_ep); if (ret < 0) continue; if (fwnode_ep.port != port) continue; if (fwnode_ep.id == endpoint) return ep; if (!endpoint_next) continue; /* * If the endpoint that has just been found is not the first * matching one and the ID of the one found previously is closer * to the requested endpoint ID, skip it. */ if (fwnode_ep.id < endpoint || (best_ep && best_ep_id < fwnode_ep.id)) continue; fwnode_handle_put(best_ep); best_ep = fwnode_handle_get(ep); best_ep_id = fwnode_ep.id; } return best_ep; } EXPORT_SYMBOL_GPL(fwnode_graph_get_endpoint_by_id); /** * fwnode_graph_get_endpoint_count - Count endpoints on a device node * @fwnode: The node related to a device * @flags: fwnode lookup flags * Count endpoints in a device node. * * If FWNODE_GRAPH_DEVICE_DISABLED flag is specified, also unconnected endpoints * and endpoints connected to disabled devices are counted. */ unsigned int fwnode_graph_get_endpoint_count(const struct fwnode_handle *fwnode, unsigned long flags) { struct fwnode_handle *ep; unsigned int count = 0; fwnode_graph_for_each_endpoint(fwnode, ep) { if (flags & FWNODE_GRAPH_DEVICE_DISABLED || fwnode_graph_remote_available(ep)) count++; } return count; } EXPORT_SYMBOL_GPL(fwnode_graph_get_endpoint_count); /** * fwnode_graph_parse_endpoint - parse common endpoint node properties * @fwnode: pointer to endpoint fwnode_handle * @endpoint: pointer to the fwnode endpoint data structure * * Parse @fwnode representing a graph endpoint node and store the * information in @endpoint. The caller must hold a reference to * @fwnode. */ int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { memset(endpoint, 0, sizeof(*endpoint)); return fwnode_call_int_op(fwnode, graph_parse_endpoint, endpoint); } EXPORT_SYMBOL(fwnode_graph_parse_endpoint); const void *device_get_match_data(const struct device *dev) { return fwnode_call_ptr_op(dev_fwnode(dev), device_get_match_data, dev); } EXPORT_SYMBOL_GPL(device_get_match_data); static unsigned int fwnode_graph_devcon_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len) { struct fwnode_handle *node; struct fwnode_handle *ep; unsigned int count = 0; void *ret; fwnode_graph_for_each_endpoint(fwnode, ep) { if (matches && count >= matches_len) { fwnode_handle_put(ep); break; } node = fwnode_graph_get_remote_port_parent(ep); if (!fwnode_device_is_available(node)) { fwnode_handle_put(node); continue; } ret = match(node, con_id, data); fwnode_handle_put(node); if (ret) { if (matches) matches[count] = ret; count++; } } return count; } static unsigned int fwnode_devcon_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len) { struct fwnode_handle *node; unsigned int count = 0; unsigned int i; void *ret; for (i = 0; ; i++) { if (matches && count >= matches_len) break; node = fwnode_find_reference(fwnode, con_id, i); if (IS_ERR(node)) break; ret = match(node, NULL, data); fwnode_handle_put(node); if (ret) { if (matches) matches[count] = ret; count++; } } return count; } /** * fwnode_connection_find_match - Find connection from a device node * @fwnode: Device node with the connection * @con_id: Identifier for the connection * @data: Data for the match function * @match: Function to check and convert the connection description * * Find a connection with unique identifier @con_id between @fwnode and another * device node. @match will be used to convert the connection description to * data the caller is expecting to be returned. */ void *fwnode_connection_find_match(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match) { unsigned int count; void *ret; if (!fwnode || !match) return NULL; count = fwnode_graph_devcon_matches(fwnode, con_id, data, match, &ret, 1); if (count) return ret; count = fwnode_devcon_matches(fwnode, con_id, data, match, &ret, 1); return count ? ret : NULL; } EXPORT_SYMBOL_GPL(fwnode_connection_find_match); /** * fwnode_connection_find_matches - Find connections from a device node * @fwnode: Device node with the connection * @con_id: Identifier for the connection * @data: Data for the match function * @match: Function to check and convert the connection description * @matches: (Optional) array of pointers to fill with matches * @matches_len: Length of @matches * * Find up to @matches_len connections with unique identifier @con_id between * @fwnode and other device nodes. @match will be used to convert the * connection description to data the caller is expecting to be returned * through the @matches array. * * If @matches is %NULL @matches_len is ignored and the total number of resolved * matches is returned. * * Return: Number of matches resolved, or negative errno. */ int fwnode_connection_find_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len) { unsigned int count_graph; unsigned int count_ref; if (!fwnode || !match) return -EINVAL; count_graph = fwnode_graph_devcon_matches(fwnode, con_id, data, match, matches, matches_len); if (matches) { matches += count_graph; matches_len -= count_graph; } count_ref = fwnode_devcon_matches(fwnode, con_id, data, match, matches, matches_len); return count_graph + count_ref; } EXPORT_SYMBOL_GPL(fwnode_connection_find_matches);
523 521 522 6727 6722 43 6724 6725 6726 3627 3629 3625 6721 6719 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 // SPDX-License-Identifier: GPL-2.0 /* * mm/pgtable-generic.c * * Generic pgtable methods declared in linux/pgtable.h * * Copyright (C) 2010 Linus Torvalds */ #include <linux/pagemap.h> #include <linux/hugetlb.h> #include <linux/pgtable.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mm_inline.h> #include <asm/pgalloc.h> #include <asm/tlb.h> /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but * very seldom) called out from the p?d_none_or_clear_bad macros. */ void pgd_clear_bad(pgd_t *pgd) { pgd_ERROR(*pgd); pgd_clear(pgd); } #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *p4d) { p4d_ERROR(*p4d); p4d_clear(p4d); } #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); pud_clear(pud); } #endif /* * Note that the pmd variant below can't be stub'ed out just as for p4d/pud * above. pmd folding is special and typically pmd_* macros refer to upper * level even when folded */ void pmd_clear_bad(pmd_t *pmd) { pmd_ERROR(*pmd); pmd_clear(pmd); } #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* * Only sets the access flags (dirty, accessed), as well as write * permission. Furthermore, we know it always gets set to a "more * permissive" setting, which allows most architectures to optimize * this. We return whether the PTE actually changed, which in turn * instructs the caller to do things like update__mmu_cache. This * used to be done in the caller, but sparc needs minor faults to * force that call on sun4c so we changed this macro slightly */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed = !pte_same(ptep_get(ptep), entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); flush_tlb_fix_spurious_fault(vma, address, ptep); } return changed; } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { int young; young = ptep_test_and_clear_young(vma, address, ptep); if (young) flush_tlb_page(vma, address); return young; } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { struct mm_struct *mm = (vma)->vm_mm; pte_t pte; pte = ptep_get_and_clear(mm, address, ptep); if (pte_accessible(mm, pte)) flush_tlb_page(vma, address); return pte; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { int changed = !pmd_same(*pmdp, entry); VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed) { set_pmd_at(vma->vm_mm, address, pmdp, entry); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } return changed; } #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { int young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return young; } #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp) { pud_t pud; VM_BUG_ON(address & ~HPAGE_PUD_MASK); VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp)); pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return pud; } #endif #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(&pgtable->lru); else list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); pmd_huge_pte(mm, pmdp) = pgtable; } #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW /* no "address" argument so destroys page coloring of some arch */ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ pgtable = pmd_huge_pte(mm, pmdp); pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru, struct page, lru); if (pmd_huge_pte(mm, pmdp)) list_del(&pgtable->lru); return pgtable; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return old; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); return pmdp_invalidate(vma, address, pmdp); } #endif #ifndef pmdp_collapse_flush pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { /* * pmd and hugepage pte format are same. So we could * use the same function. */ pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_trans_huge(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); /* collapse entails shooting down ptes not pmd */ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #endif /* arch define pte_free_defer in asm/pgalloc.h for its own implementation */ #ifndef pte_free_defer static void pte_free_now(struct rcu_head *head) { struct page *page; page = container_of(head, struct page, rcu_head); pte_free(NULL /* mm not passed and not used */, (pgtable_t)page); } void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) { struct page *page; page = pgtable; call_rcu(&page->rcu_head, pte_free_now); } #endif /* pte_free_defer */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \ (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU)) /* * See the comment above ptep_get_lockless() in include/linux/pgtable.h: * the barriers in pmdp_get_lockless() cannot guarantee that the value in * pmd_high actually belongs with the value in pmd_low; but holding interrupts * off blocks the TLB flush between present updates, which guarantees that a * successful __pte_offset_map() points to a page from matched halves. */ static unsigned long pmdp_get_lockless_start(void) { unsigned long irqflags; local_irq_save(irqflags); return irqflags; } static void pmdp_get_lockless_end(unsigned long irqflags) { local_irq_restore(irqflags); } #else static unsigned long pmdp_get_lockless_start(void) { return 0; } static void pmdp_get_lockless_end(unsigned long irqflags) { } #endif pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { unsigned long irqflags; pmd_t pmdval; rcu_read_lock(); irqflags = pmdp_get_lockless_start(); pmdval = pmdp_get_lockless(pmd); pmdp_get_lockless_end(irqflags); if (pmdvalp) *pmdvalp = pmdval; if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) goto nomap; if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval))) goto nomap; if (unlikely(pmd_bad(pmdval))) { pmd_clear_bad(pmd); goto nomap; } return __pte_map(&pmdval, addr); nomap: rcu_read_unlock(); return NULL; } pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { pmd_t pmdval; pte_t *pte; pte = __pte_offset_map(pmd, addr, &pmdval); if (likely(pte)) *ptlp = pte_lockptr(mm, &pmdval); return pte; } pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp, spinlock_t **ptlp) { pte_t *pte; VM_WARN_ON_ONCE(!pmdvalp); pte = __pte_offset_map(pmd, addr, pmdvalp); if (likely(pte)) *ptlp = pte_lockptr(mm, pmdvalp); return pte; } /* * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation * __pte_offset_map_lock() below, is usually called with the pmd pointer for * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while * holding mmap_lock or vma lock for read or for write; or in truncate or rmap * context, while holding file's i_mmap_lock or anon_vma lock for read (or for * write). In a few cases, it may be used with pmd pointing to a pmd_t already * copied to or constructed on the stack. * * When successful, it returns the pte pointer for addr, with its page table * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent * modification by software, with a pointer to that spinlock in ptlp (in some * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's * struct page). pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards. * * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no * page table at *pmd: if, for example, the page table has just been removed, * or replaced by the huge pmd of a THP. (When successful, *pmd is rechecked * after acquiring the ptlock, and retried internally if it changed: so that a * page table can be safely removed or replaced by THP while holding its lock.) * * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above, * just returns the pte pointer for addr, its page table kmapped if necessary; * or NULL if there is no page table at *pmd. It does not attempt to lock the * page table, so cannot normally be used when the page table is to be updated, * or when entries read must be stable. But it does take rcu_read_lock(): so * that even when page table is racily removed, it remains a valid though empty * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s * afterwards. * * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map(); * but when successful, it also outputs a pointer to the spinlock in ptlp - as * pte_offset_map_lock() does, but in this case without locking it. This helps * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock * pointer for the page table that it returns. Even after grabbing the spinlock, * we might be looking either at a page table that is still mapped or one that * was unmapped and is about to get freed. But for R/O access this is sufficient. * So it is only applicable for read-only cases where any modification operations * to the page table are not allowed even if the corresponding spinlock is held * afterwards. * * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like * pte_offset_map_ro_nolock(); but when successful, it also outputs the pdmval. * It is applicable for may-write cases where any modification operations to the * page table may happen after the corresponding spinlock is held afterwards. * But the users should make sure the page table is stable like checking pte_same() * or checking pmd_same() by using the output pmdval before performing the write * operations. * * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will * be read-only/read-write protected. * * Note that free_pgtables(), used after unmapping detached vmas, or when * exiting the whole mm, does not take page table lock before freeing a page * table, and may not use RCU at all: "outsiders" like khugepaged should avoid * pte_offset_map() and co once the vma is detached from mm or mm_users is zero. */ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { spinlock_t *ptl; pmd_t pmdval; pte_t *pte; again: pte = __pte_offset_map(pmd, addr, &pmdval); if (unlikely(!pte)) return pte; ptl = pte_lockptr(mm, &pmdval); spin_lock(ptl); if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) { *ptlp = ptl; return pte; } pte_unmap_unlock(pte, ptl); goto again; }
25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0-only /* * Unified UUID/GUID definition * * Copyright (C) 2009, 2016 Intel Corp. * Huang Ying <ying.huang@intel.com> */ #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/uuid.h> #include <linux/random.h> const guid_t guid_null; EXPORT_SYMBOL(guid_null); const uuid_t uuid_null; EXPORT_SYMBOL(uuid_null); const u8 guid_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; const u8 uuid_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; /** * generate_random_uuid - generate a random UUID * @uuid: where to put the generated UUID * * Random UUID interface * * Used to create a Boot ID or a filesystem UUID/GUID, but can be * useful for other kernel drivers. */ void generate_random_uuid(unsigned char uuid[16]) { get_random_bytes(uuid, 16); /* Set UUID version to 4 --- truly random generation */ uuid[6] = (uuid[6] & 0x0F) | 0x40; /* Set the UUID variant to DCE */ uuid[8] = (uuid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_uuid); void generate_random_guid(unsigned char guid[16]) { get_random_bytes(guid, 16); /* Set GUID version to 4 --- truly random generation */ guid[7] = (guid[7] & 0x0F) | 0x40; /* Set the GUID variant to DCE */ guid[8] = (guid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_guid); static void __uuid_gen_common(__u8 b[16]) { get_random_bytes(b, 16); /* reversion 0b10 */ b[8] = (b[8] & 0x3F) | 0x80; } void guid_gen(guid_t *lu) { __uuid_gen_common(lu->b); /* version 4 : random generation */ lu->b[7] = (lu->b[7] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(guid_gen); void uuid_gen(uuid_t *bu) { __uuid_gen_common(bu->b); /* version 4 : random generation */ bu->b[6] = (bu->b[6] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(uuid_gen); /** * uuid_is_valid - checks if a UUID string is valid * @uuid: UUID string to check * * Description: * It checks if the UUID string is following the format: * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx * * where x is a hex digit. * * Return: true if input is valid UUID string. */ bool uuid_is_valid(const char *uuid) { unsigned int i; for (i = 0; i < UUID_STRING_LEN; i++) { if (i == 8 || i == 13 || i == 18 || i == 23) { if (uuid[i] != '-') return false; } else if (!isxdigit(uuid[i])) { return false; } } return true; } EXPORT_SYMBOL(uuid_is_valid); static int __uuid_parse(const char *uuid, __u8 b[16], const u8 ei[16]) { static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; unsigned int i; if (!uuid_is_valid(uuid)) return -EINVAL; for (i = 0; i < 16; i++) { int hi = hex_to_bin(uuid[si[i] + 0]); int lo = hex_to_bin(uuid[si[i] + 1]); b[ei[i]] = (hi << 4) | lo; } return 0; } int guid_parse(const char *uuid, guid_t *u) { return __uuid_parse(uuid, u->b, guid_index); } EXPORT_SYMBOL(guid_parse); int uuid_parse(const char *uuid, uuid_t *u) { return __uuid_parse(uuid, u->b, uuid_index); } EXPORT_SYMBOL(uuid_parse);
6 6 6 6 6 6 6 6 70 52 52 52 52 4 4 4 4 3 51 1 51 6 15 36 51 1 70 70 70 11 1 11 7 57 3 3 3 45 40 23 8 9 20 22 17 12 11 3 12 9 9 27 27 115 114 53 16 20 21 11 10 11 10 11 37 37 6 6 31 37 37 36 30 6 4 2 34 34 3 37 6 6 6 6 6 31 1 29 45 45 44 22 22 22 1 1 23 45 7 46 1 42 2 1 1 44 44 45 57 57 9 46 52 102 9 93 5 97 102 83 14 15 10 1 7 75 65 1 3 1 62 58 2 2 3 1 10 1 3 5 5 2 1 2 3 275 101 10 91 101 101 101 101 101 2 2 111 168 2 1 1 1 1 1 1 2 1 1 159 261 53 53 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 tunneling device * Linux INET6 implementation * * Authors: * Ville Nuorvala <vnuorval@tcs.hut.fi> * Yasuyuki Kozakai <kozakai@linux-ipv6.org> * * Based on: * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c * * RFC 2473 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/etherdevice.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); #define IP6_TUNNEL_HASH_SIZE_SHIFT 5 #define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT); } static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); static struct rtnl_link_ops ip6_link_ops __read_mostly; static unsigned int ip6_tnl_net_id __read_mostly; struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; struct ip6_tnl __rcu *collect_md_tun; }; static inline int ip6_tnl_mpls_supported(void) { return IS_ENABLED(CONFIG_MPLS); } #define for_each_ip6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @net: network namespace * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * * Return: * tunnel matching given end-points if found, * else fallback tunnel if its device is up, * else %NULL **/ static struct ip6_tnl * ip6_tnl_lookup(struct net *net, int link, const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); struct ip6_tnl *t, *cand = NULL; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct in6_addr any; for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_equal(remote, &t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else cand = t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_any(&t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } hash = HASH(remote, &any); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(remote, &t->parms.raddr) || !ipv6_addr_any(&t->parms.laddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } if (cand) return cand; t = rcu_dereference(ip6n->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } /** * ip6_tnl_bucket - get head of list matching given tunnel parameters * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: * ip6_tnl_bucket() returns the head of the list matching the * &struct in6_addr entries laddr and raddr in @p. * * Return: head of IPv6 tunnel list **/ static struct ip6_tnl __rcu ** ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } /** * ip6_tnl_link - add tunnel to hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be added **/ static void ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } /** * ip6_tnl_unlink - remove tunnel from hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be removed **/ static void ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, NULL); for (tp = ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); } static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); ip6_tnl_link(ip6n, t); return 0; out: return err; } /** * ip6_tnl_create - create a new tunnel * @net: network namespace * @p: tunnel parameters * * Description: * Create tunnel matching given parameters. * * Return: * created tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err = -E2BIG; if (p->name[0]) { if (!dev_valid_name(p->name)) goto failed; strscpy(name, p->name, IFNAMSIZ); } else { sprintf(name, "ip6tnl%%d"); } err = -ENOMEM; dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!dev) goto failed; dev_net_set(dev, net); t = netdev_priv(dev); t->parms = *p; t->net = dev_net(dev); err = ip6_tnl_create2(dev); if (err < 0) goto failed_free; return t; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } /** * ip6_tnl_locate - find or create tunnel matching given parameters * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: * ip6_tnl_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. * * Return: * matching tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_locate(struct net *net, struct __ip6_tnl_parm *p, int create) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); for (tp = ip6_tnl_bucket(ip6n, p); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && p->link == t->parms.link) { if (create) return ERR_PTR(-EEXIST); return t; } } if (!create) return ERR_PTR(-ENODEV); return ip6_tnl_create(net, p); } /** * ip6_tnl_dev_uninit - tunnel device uninitializer * @dev: the device to be destroyed * * Description: * ip6_tnl_dev_uninit() removes tunnel from its list **/ static void ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); dst_cache_reset(&t->dst_cache); netdev_put(dev, &t->dev_tracker); } /** * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option * @skb: received socket buffer * @raw: the ICMPv6 error message data * * Return: * 0 if none was found, * else index to encapsulation limit **/ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); u8 nexthdr = ipv6h->nexthdr; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; u16 optlen; if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { optlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { optlen = ipv6_authlen(hdr); } else { optlen = ipv6_optlen(hdr); } if (!pskb_may_pull(skb, off + optlen)) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr; if (frag_hdr->frag_off) break; } if (nexthdr == NEXTHDR_DEST) { u16 i = 2; while (1) { struct ipv6_tlv_tnl_enc_lim *tel; /* No more room for encapsulation limit */ if (i + sizeof(*tel) > optlen) break; tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i); /* return index of option if found and valid */ if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && tel->length == 1) return i + off - nhoff; /* else jump to next option */ if (tel->type) i += tel->length + 2; else i++; } } nexthdr = hdr->nexthdr; off += optlen; } return 0; } EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim); /* ip6_tnl_err() should handle errors in the tunnel according to the * specifications in RFC 2473. */ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); u8 rel_type = ICMPV6_DEST_UNREACH; u8 rel_code = ICMPV6_ADDR_UNREACH; __u32 rel_info = 0; struct ip6_tnl *t; int err = -ENOENT; int rel_msg = 0; u8 tproto; __u16 len; /* If the packet doesn't contain the original IPv6 header we are in trouble since we might need the source address for further processing of the error. */ rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr); if (!t) goto out; tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto out; err = 0; switch (*type) { case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } break; case ICMPV6_PARAMPROB: { struct ipv6_tlv_tnl_enc_lim *tel; __u32 teli; teli = 0; if ((*code) == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } } else { net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } break; } case ICMPV6_PKT_TOOBIG: { __u32 mtu; ip6_update_pmtu(skb, net, htonl(*info), 0, 0, sock_net_uid(net, NULL)); mtu = *info - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len); if (len > mtu) { rel_type = ICMPV6_PKT_TOOBIG; rel_code = 0; rel_info = mtu; rel_msg = 1; } break; } case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); break; } *type = rel_type; *code = rel_code; *info = rel_info; *msg = rel_msg; out: rcu_read_unlock(); return err; } static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); const struct iphdr *eiph; struct sk_buff *skb2; int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; struct rtable *rt; struct flowi4 fl4; err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg == 0) return 0; switch (rel_type) { case ICMPV6_DEST_UNREACH: if (rel_code != ICMPV6_ADDR_UNREACH) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; break; case ICMPV6_PKT_TOOBIG: if (rel_code != 0) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; default: return 0; } if (!pskb_may_pull(skb, offset + sizeof(struct iphdr))) return 0; skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); eiph = ip_hdr(skb2); /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, 0, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt)) goto out; skb2->dev = rt->dst.dev; ip_rt_put(rt); /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; } skb_dst_set(skb2, &rt->dst); } else { if (ip_route_input(skb2, eiph->daddr, eiph->saddr, ip4h_dscp(eiph), skb2->dev) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) goto out; } /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { if (rel_info > dst_mtu(skb_dst(skb2))) goto out; skb_dst_update_pmtu_no_confirm(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); out: kfree_skb(skb2); return 0; } static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) { struct rt6_info *rt; struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); /* Try to guess incoming interface */ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; icmpv6_send(skb2, rel_type, rel_code, rel_info); ip6_rt_put(rt); kfree_skb(skb2); } return 0; } static int mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); return err; } static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); return IP6_ECN_decapsulate(ipv6h, skb); } static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); return IP6_ECN_decapsulate(ipv6h, skb); } static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { /* ECN is not supported in AF_MPLS */ return 0; } __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ltype = ipv6_addr_type(laddr); int rtype = ipv6_addr_type(raddr); __u32 flags = 0; if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) { flags = IP6_TNL_F_CAP_PER_PACKET; } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { if (ltype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_XMIT; if (rtype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_RCV; } return flags; } EXPORT_SYMBOL(ip6_tnl_get_cap); /* called with rcu_read_lock() */ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if ((p->flags & IP6_TNL_F_CAP_RCV) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) { struct net_device *ldev = NULL; if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if ((ipv6_addr_is_multicast(laddr) || likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) && ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE)))) ret = 1; } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb), bool log_ecn_err) { const struct ipv6hdr *ipv6h; int nh, err; if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } skb->protocol = tpi->proto; /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; skb_reset_mac_header(skb); } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } /* Get the outer header. */ ipv6h = (struct ipv6hdr *)(skb->head + nh); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); err = dscp_ecn_decapsulate(tunnel, ipv6h, skb); if (unlikely(err)) { if (log_ecn_err) net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n", &ipv6h->saddr, ipv6_get_dsfield(ipv6h)); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tun_dst) skb_dst_set(skb, (struct dst_entry *)tun_dst); gro_cells_receive(&tunnel->gro_cells, skb); return 0; drop: if (tun_dst) dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_err) { int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb); dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate; if (tpi->proto == htons(ETH_P_IP)) dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate; return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); static const struct tnl_ptk_info tpi_v6 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IPV6), }; static const struct tnl_ptk_info tpi_v4 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IP), }; static const struct tnl_ptk_info tpi_mpls = { /* no tunnel info required for mplsip6. */ .proto = htons(ETH_P_MPLS_UC), }; static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, const struct tnl_ptk_info *tpi, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb)) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct metadata_dst *tun_dst = NULL; int ret = -1; rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr); if (t) { u8 tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (t->parms.collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) goto drop; } ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); } rcu_read_unlock(); return ret; drop: rcu_read_unlock(); kfree_skb(skb); return 0; } static int ip4ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4, ip4ip6_dscp_ecn_decapsulate); } static int ip6ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6, ip6ip6_dscp_ecn_decapsulate); } static int mplsip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls, mplsip6_dscp_ecn_decapsulate); } struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; }; static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) { memset(opt, 0, sizeof(struct ipv6_tel_txoption)); opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; opt->dst_opt[3] = 1; opt->dst_opt[4] = encap_limit; opt->dst_opt[5] = IPV6_TLV_PADN; opt->dst_opt[6] = 1; opt->ops.dst1opt = (struct ipv6_opt_hdr *) opt->dst_opt; opt->ops.opt_nflen = 8; } /** * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: IPv6 header from the incoming packet * * Description: * Avoid trivial tunneling loop by checking that tunnel exit-point * doesn't match source of incoming packet. * * Return: * 1 if conflict, * 0 else **/ static inline bool ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if (t->parms.collect_md) return 1; if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { struct net_device *ldev = NULL; rcu_read_lock(); if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n", p->name); else ret = 1; rcu_read_unlock(); } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); /** * ip6_tnl_xmit - encapsulate packet and send * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @dsfield: dscp code for outer header * @fl6: flow of tunneled packet * @encap_limit: encapsulation limit * @pmtu: Path MTU is stored if packet is too big * @proto: next header value * * Description: * Build new header and do some sanity checks on the packet before sending * it. * * Return: * 0 on success * -1 fail * %-EMSGSIZE message too big. return mtu in this case. **/ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ipv6hdr *ipv6h; struct ipv6_tel_txoption opt; struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; __be16 payload_protocol; bool use_cache = false; u8 hop_limit; int err = -1; payload_protocol = skb_protocol(skb, true); if (t->parms.collect_md) { hop_limit = skb_tunnel_info(skb)->key.ttl; goto route_lookup; } else { hop_limit = t->parms.hop_limit; } /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { if (payload_protocol == htons(ETH_P_IPV6)) { struct in6_addr *addr6; struct neighbour *neigh; int addr_type; if (!skb_dst(skb)) goto tx_err_link_failure; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (!neigh) goto tx_err_link_failure; addr6 = (struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) addr6 = &ipv6_hdr(skb)->daddr; memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (payload_protocol == htons(ETH_P_IP)) { const struct rtable *rt = skb_rtable(skb); if (!rt) goto tx_err_link_failure; if (rt->rt_gw_family == AF_INET6) memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr)); } } else if (t->parms.proto != 0 && !(t->parms.flags & (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { /* enable the cache only if neither the outer protocol nor the * routing decision depends on the current inner header value */ use_cache = true; } if (use_cache) dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; if (!dst) { route_lookup: /* add dsfield to flowlabel for route lookup */ fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel); dst = ip6_route_output(net, NULL, fl6); if (dst->error) goto tx_err_link_failure; dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) && ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, &fl6->daddr, 0, &fl6->saddr)) goto tx_err_link_failure; ndst = dst; } tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; } mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; } mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? IPV6_MIN_MTU : IPV4_MIN_MTU); skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; } if (t->err_count > 0) { if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) { t->err_count--; dst_link_failure(skb); } else { t->err_count = 0; } } skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom += LL_RESERVED_SPACE(tdev); if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb; new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) goto tx_err_dst_release; if (skb->sk) skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } if (t->parms.collect_md) { if (t->encap.type != TUNNEL_ENCAP_NONE) goto tx_err_dst_release; } else { if (use_cache && ndst) dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); } skb_dst_set(skb, dst); if (hop_limit == 0) { if (payload_protocol == htons(ETH_P_IP)) hop_limit = ip_hdr(skb)->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) hop_limit = ipv6_hdr(skb)->hop_limit; else hop_limit = ip6_dst_hoplimit(dst); } /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. */ max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; if (max_headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) return err; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, dsfield, ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } EXPORT_SYMBOL(ip6_tnl_xmit); static inline int ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, u8 protocol) { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h; const struct iphdr *iph; int encap_limit = -1; __u16 offset; struct flowi6 fl6; __u8 dsfield, orig_dsfield; __u32 mtu; u8 tproto; int err; tproto = READ_ONCE(t->parms.proto); if (tproto != protocol && tproto != 0) return -1; if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET6)) return -1; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = protocol; fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); break; default: orig_dsfield = dsfield; break; } } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; if (protocol == IPPROTO_IPV6) { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have * reallocated skb->head */ if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (void *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offset + 2); return -1; } encap_limit = tel->encap_limit - 1; } } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = protocol; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); break; default: orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); break; } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; skb_set_inner_ipproto(skb, protocol); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) switch (protocol) { case IPPROTO_IPIP: icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); break; case IPPROTO_IPV6: icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); break; default: break; } return -1; } return 0; } static netdev_tx_t ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; ipproto = IPPROTO_IPV6; break; case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; default: goto tx_err; } ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; int t_hlen; int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ fl6->saddr = p->laddr; fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (rt) { tdev = rt->dst.dev; ip6_rt_put(rt); } if (!tdev && p->link) tdev = __dev_get_by_index(t->net, p->link); if (tdev) { dev->needed_headroom = tdev->hard_header_len + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) mtu -= 8; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); } } } /** * ip6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * * Description: * ip6_tnl_change() updates the tunnel parameters **/ static void ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); } static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); ip6_tnl_unlink(ip6n, t); synchronize_net(); ip6_tnl_change(t, p); ip6_tnl_link(ip6n, t); netdev_state_change(t->dev); } static void ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { /* for default tnl0 device allow to change only the proto */ t->parms.proto = p->proto; netdev_state_change(t->dev); } static void ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->flags = u->flags; p->hop_limit = u->hop_limit; p->encap_limit = u->encap_limit; p->flowinfo = u->flowinfo; p->link = u->link; p->proto = u->proto; memcpy(p->name, u->name, sizeof(u->name)); } static void ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) { u->laddr = p->laddr; u->raddr = p->raddr; u->flags = p->flags; u->hop_limit = p->hop_limit; u->encap_limit = p->encap_limit; u->flowinfo = p->flowinfo; u->link = p->link; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); } /** * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: unused * @data: parameters passed from userspace * @cmd: command to be performed * * Description: * ip6_tnl_ioctl() is used for managing IPv6 tunnels * from userspace. * * The possible commands are the following: * %SIOCGETTUNNEL: get tunnel parameters for device * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters * %SIOCCHGTUNNEL: change tunnel parameters to those given * %SIOCDELTUNNEL: delete tunnel * * The fallback device "ip6tnl0", created during module * initialization, can be used for creating other tunnel devices. * * Return: * 0 on success, * %-EFAULT if unable to copy data to or from userspace, * %-EPERM if current process hasn't %CAP_NET_ADMIN set * %-EINVAL if passed tunnel parameters are invalid, * %-EEXIST if changing a tunnel's parameters would cause a conflict * %-ENODEV if attempting to change or delete a nonexisting device **/ static int ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && p.proto != 0) break; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (cmd == SIOCCHGTUNNEL) { if (!IS_ERR(t)) { if (t->dev != dev) { err = -EEXIST; break; } } else t = netdev_priv(dev); if (dev == ip6n->fb_tnl_dev) ip6_tnl0_update(t, &p1); else ip6_tnl_update(t, &p1); } if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else { err = PTR_ERR(t); } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } err = 0; unregister_netdevice(dev); break; default: err = -EINVAL; } return err; } /** * ip6_tnl_change_mtu - change mtu manually for tunnel device * @dev: virtual device associated with tunnel * @new_mtu: the new mtu * * Return: * 0 on success, * %-EINVAL if mtu too small **/ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); int t_hlen; t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); int ip6_tnl_get_iflink(const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); return READ_ONCE(t->parms.link); } EXPORT_SYMBOL(ip6_tnl_get_iflink); int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL(ip6_tnl_encap_add_ops); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip6_tnl_encap_del_ops); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip6_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_siocdevprivate = ip6_tnl_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; #define IPXIPX_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) /** * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ static void ip6_tnl_dev_setup(struct net_device *dev) { dev->netdev_ops = &ip6_tnl_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); dev->features |= IPXIPX_FEATURES; dev->hw_features |= IPXIPX_FEATURES; /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } /** * ip6_tnl_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int ret; int t_hlen; t->dev = dev; t->net = dev_net(dev); ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&t->gro_cells, dev); if (ret) goto destroy_dst; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; destroy_dst: dst_cache_destroy(&t->dst_cache); return ret; } /** * ip6_tnl_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device associated with tunnel **/ static int ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int err = ip6_tnl_dev_init_gen(dev); if (err) return err; ip6_tnl_link_config(t); if (t->parms.collect_md) netif_keep_dst(dev); return 0; } /** * ip6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPV6 && proto != IPPROTO_IPIP && proto != 0) return -EINVAL; return 0; } static void ip6_tnl_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); if (data[IFLA_IPTUN_ENCAP_LIMIT]) parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]); if (data[IFLA_IPTUN_FLOWINFO]) parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]); if (data[IFLA_IPTUN_FLAGS]) parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]); if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (data[IFLA_IPTUN_COLLECT_METADATA]) parms->collect_md = true; if (data[IFLA_IPTUN_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; struct ip6_tnl *nt, *t; int err; nt = netdev_priv(dev); if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &nt->parms); if (nt->parms.collect_md) { if (rtnl_dereference(ip6n->collect_md_tun)) return -EEXIST; } else { t = ip6_tnl_locate(net, &nt->parms, 0); if (!IS_ERR(t)) return -EEXIST; } err = ip6_tnl_create2(dev); if (!err && tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); return err; } static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; if (dev == ip6n->fb_tnl_dev) return -EINVAL; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(t, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &p); if (p.collect_md) return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); ip6_tnl_update(t, &p); return 0; } static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev != ip6n->fb_tnl_dev) unregister_netdevice_queue(dev, head); } static size_t ip6_tnl_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_LIMIT */ nla_total_size(1) + /* IFLA_IPTUN_FLOWINFO */ nla_total_size(4) + /* IFLA_IPTUN_FLAGS */ nla_total_size(4) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (parm->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { .kind = "ip6tnl", .maxtype = IFLA_IPTUN_MAX, .policy = ip6_tnl_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6_tnl_dev_setup, .validate = ip6_tnl_validate, .newlink = ip6_tnl_newlink, .changelink = ip6_tnl_changelink, .dellink = ip6_tnl_dellink, .get_size = ip6_tnl_get_size, .fill_info = ip6_tnl_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, }; static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .handler = mplsip6_rcv, .err_handler = mplsip6_err, .priority = 1, }; static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } t = rtnl_dereference(ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } static int __net_init ip6_tnl_init_net(struct net *net) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; if (!net_has_fallback_tunnels(net)) return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ ip6n->fb_tnl_dev->netns_local = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; t = netdev_priv(ip6n->fb_tnl_dev); strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: free_netdev(ip6n->fb_tnl_dev); err_alloc_dev: return err; } static void __net_exit ip6_tnl_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) ip6_tnl_destroy_tunnels(net, dev_to_kill); } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, .exit_batch_rtnl = ip6_tnl_exit_batch_rtnl, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; /** * ip6_tunnel_init - register protocol and reserve needed resources * * Return: 0 on success **/ static int __init ip6_tunnel_init(void) { int err; if (!ipv6_mod_enabled()) return -EOPNOTSUPP; err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); if (err < 0) { pr_err("%s: can't register ip4ip6\n", __func__); goto out_ip4ip6; } err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); if (err < 0) { pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } if (ip6_tnl_mpls_supported()) { err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); if (err < 0) { pr_err("%s: can't register mplsip6\n", __func__); goto out_mplsip6; } } err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; return 0; rtnl_link_failed: if (ip6_tnl_mpls_supported()) xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); out_ip4ip6: unregister_pernet_device(&ip6_tnl_net_ops); out_pernet: return err; } /** * ip6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit ip6_tunnel_cleanup(void) { rtnl_link_unregister(&ip6_link_ops); if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) pr_info("%s: can't deregister ip4ip6\n", __func__); if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); if (ip6_tnl_mpls_supported() && xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } module_init(ip6_tunnel_init); module_exit(ip6_tunnel_cleanup);
64 64 3 3 11 11 11 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/module.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> #include <net/caif/cfctrl.h> #include <net/caif/cfmuxl.h> #include <net/caif/cffrml.h> #include <net/caif/cfserl.h> #include <net/caif/cfsrvl.h> #include <net/caif/caif_dev.h> #define container_obj(layr) container_of(layr, struct cfcnfg, layer) /* Information about CAIF physical interfaces held by Config Module in order * to manage physical interfaces */ struct cfcnfg_phyinfo { struct list_head node; bool up; /* Pointer to the layer below the MUX (framing layer) */ struct cflayer *frm_layer; /* Pointer to the lowest actual physical layer */ struct cflayer *phy_layer; /* Unique identifier of the physical interface */ unsigned int id; /* Preference of the physical in interface */ enum cfcnfg_phy_preference pref; /* Information about the physical device */ struct dev_info dev_info; /* Interface index */ int ifindex; /* Protocol head room added for CAIF link layer */ int head_room; /* Use Start of frame checksum */ bool use_fcs; }; struct cfcnfg { struct cflayer layer; struct cflayer *ctrl; struct cflayer *mux; struct list_head phys; struct mutex lock; }; static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, u8 phyid, struct cflayer *adapt_layer); static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id); static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, struct cflayer *adapt_layer); static void cfctrl_resp_func(void); static void cfctrl_enum_resp(void); struct cfcnfg *cfcnfg_create(void) { struct cfcnfg *this; struct cfctrl_rsp *resp; might_sleep(); /* Initiate this layer */ this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC); if (!this) return NULL; this->mux = cfmuxl_create(); if (!this->mux) goto out_of_mem; this->ctrl = cfctrl_create(); if (!this->ctrl) goto out_of_mem; /* Initiate response functions */ resp = cfctrl_get_respfuncs(this->ctrl); resp->enum_rsp = cfctrl_enum_resp; resp->linkerror_ind = cfctrl_resp_func; resp->linkdestroy_rsp = cfcnfg_linkdestroy_rsp; resp->sleep_rsp = cfctrl_resp_func; resp->wake_rsp = cfctrl_resp_func; resp->restart_rsp = cfctrl_resp_func; resp->radioset_rsp = cfctrl_resp_func; resp->linksetup_rsp = cfcnfg_linkup_rsp; resp->reject_rsp = cfcnfg_reject_rsp; INIT_LIST_HEAD(&this->phys); cfmuxl_set_uplayer(this->mux, this->ctrl, 0); layer_set_dn(this->ctrl, this->mux); layer_set_up(this->ctrl, this); mutex_init(&this->lock); return this; out_of_mem: synchronize_rcu(); kfree(this->mux); kfree(this->ctrl); kfree(this); return NULL; } void cfcnfg_remove(struct cfcnfg *cfg) { might_sleep(); if (cfg) { synchronize_rcu(); kfree(cfg->mux); cfctrl_remove(cfg->ctrl); kfree(cfg); } } static void cfctrl_resp_func(void) { } static struct cfcnfg_phyinfo *cfcnfg_get_phyinfo_rcu(struct cfcnfg *cnfg, u8 phyid) { struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->id == phyid) return phy; return NULL; } static void cfctrl_enum_resp(void) { } static struct dev_info *cfcnfg_get_phyid(struct cfcnfg *cnfg, enum cfcnfg_phy_preference phy_pref) { /* Try to match with specified preference */ struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) { if (phy->up && phy->pref == phy_pref && phy->frm_layer != NULL) return &phy->dev_info; } /* Otherwise just return something */ list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->up) return &phy->dev_info; return NULL; } static int cfcnfg_get_id_from_ifi(struct cfcnfg *cnfg, int ifi) { struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->ifindex == ifi && phy->up) return phy->id; return -ENODEV; } int caif_disconnect_client(struct net *net, struct cflayer *adap_layer) { u8 channel_id; struct cfcnfg *cfg = get_cfcnfg(net); caif_assert(adap_layer != NULL); cfctrl_cancel_req(cfg->ctrl, adap_layer); channel_id = adap_layer->id; if (channel_id != 0) { struct cflayer *servl; servl = cfmuxl_remove_uplayer(cfg->mux, channel_id); cfctrl_linkdown_req(cfg->ctrl, channel_id, adap_layer); if (servl != NULL) layer_set_up(servl, NULL); } else pr_debug("nothing to disconnect\n"); /* Do RCU sync before initiating cleanup */ synchronize_rcu(); if (adap_layer->ctrlcmd != NULL) adap_layer->ctrlcmd(adap_layer, CAIF_CTRLCMD_DEINIT_RSP, 0); return 0; } EXPORT_SYMBOL(caif_disconnect_client); static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id) { } static const int protohead[CFCTRL_SRV_MASK] = { [CFCTRL_SRV_VEI] = 4, [CFCTRL_SRV_DATAGRAM] = 7, [CFCTRL_SRV_UTIL] = 4, [CFCTRL_SRV_RFM] = 3, [CFCTRL_SRV_DBG] = 3, }; static int caif_connect_req_to_link_param(struct cfcnfg *cnfg, struct caif_connect_request *s, struct cfctrl_link_param *l) { struct dev_info *dev_info; enum cfcnfg_phy_preference pref; int res; memset(l, 0, sizeof(*l)); /* In caif protocol low value is high priority */ l->priority = CAIF_PRIO_MAX - s->priority + 1; if (s->ifindex != 0) { res = cfcnfg_get_id_from_ifi(cnfg, s->ifindex); if (res < 0) return res; l->phyid = res; } else { switch (s->link_selector) { case CAIF_LINK_HIGH_BANDW: pref = CFPHYPREF_HIGH_BW; break; case CAIF_LINK_LOW_LATENCY: pref = CFPHYPREF_LOW_LAT; break; default: return -EINVAL; } dev_info = cfcnfg_get_phyid(cnfg, pref); if (dev_info == NULL) return -ENODEV; l->phyid = dev_info->id; } switch (s->protocol) { case CAIFPROTO_AT: l->linktype = CFCTRL_SRV_VEI; l->endpoint = (s->sockaddr.u.at.type >> 2) & 0x3; l->chtype = s->sockaddr.u.at.type & 0x3; break; case CAIFPROTO_DATAGRAM: l->linktype = CFCTRL_SRV_DATAGRAM; l->chtype = 0x00; l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; break; case CAIFPROTO_DATAGRAM_LOOP: l->linktype = CFCTRL_SRV_DATAGRAM; l->chtype = 0x03; l->endpoint = 0x00; l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; break; case CAIFPROTO_RFM: l->linktype = CFCTRL_SRV_RFM; l->u.datagram.connid = s->sockaddr.u.rfm.connection_id; strscpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, sizeof(l->u.rfm.volume)); break; case CAIFPROTO_UTIL: l->linktype = CFCTRL_SRV_UTIL; l->endpoint = 0x00; l->chtype = 0x00; strscpy(l->u.utility.name, s->sockaddr.u.util.service, sizeof(l->u.utility.name)); caif_assert(sizeof(l->u.utility.name) > 10); l->u.utility.paramlen = s->param.size; if (l->u.utility.paramlen > sizeof(l->u.utility.params)) l->u.utility.paramlen = sizeof(l->u.utility.params); memcpy(l->u.utility.params, s->param.data, l->u.utility.paramlen); break; case CAIFPROTO_DEBUG: l->linktype = CFCTRL_SRV_DBG; l->endpoint = s->sockaddr.u.dbg.service; l->chtype = s->sockaddr.u.dbg.type; break; default: return -EINVAL; } return 0; } int caif_connect_client(struct net *net, struct caif_connect_request *conn_req, struct cflayer *adap_layer, int *ifindex, int *proto_head, int *proto_tail) { struct cflayer *frml; struct cfcnfg_phyinfo *phy; int err; struct cfctrl_link_param param; struct cfcnfg *cfg = get_cfcnfg(net); rcu_read_lock(); err = caif_connect_req_to_link_param(cfg, conn_req, &param); if (err) goto unlock; phy = cfcnfg_get_phyinfo_rcu(cfg, param.phyid); if (!phy) { err = -ENODEV; goto unlock; } err = -EINVAL; if (adap_layer == NULL) { pr_err("adap_layer is zero\n"); goto unlock; } if (adap_layer->receive == NULL) { pr_err("adap_layer->receive is NULL\n"); goto unlock; } if (adap_layer->ctrlcmd == NULL) { pr_err("adap_layer->ctrlcmd == NULL\n"); goto unlock; } err = -ENODEV; frml = phy->frm_layer; if (frml == NULL) { pr_err("Specified PHY type does not exist!\n"); goto unlock; } caif_assert(param.phyid == phy->id); caif_assert(phy->frm_layer->id == param.phyid); caif_assert(phy->phy_layer->id == param.phyid); *ifindex = phy->ifindex; *proto_tail = 2; *proto_head = protohead[param.linktype] + phy->head_room; rcu_read_unlock(); /* FIXME: ENUMERATE INITIALLY WHEN ACTIVATING PHYSICAL INTERFACE */ cfctrl_enum_req(cfg->ctrl, param.phyid); return cfctrl_linkup_request(cfg->ctrl, &param, adap_layer); unlock: rcu_read_unlock(); return err; } EXPORT_SYMBOL(caif_connect_client); static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, struct cflayer *adapt_layer) { if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) adapt_layer->ctrlcmd(adapt_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, 0); } static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, u8 phyid, struct cflayer *adapt_layer) { struct cfcnfg *cnfg = container_obj(layer); struct cflayer *servicel = NULL; struct cfcnfg_phyinfo *phyinfo; struct net_device *netdev; if (channel_id == 0) { pr_warn("received channel_id zero\n"); if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) adapt_layer->ctrlcmd(adapt_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, 0); return; } rcu_read_lock(); if (adapt_layer == NULL) { pr_debug("link setup response but no client exist, send linkdown back\n"); cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL); goto unlock; } caif_assert(cnfg != NULL); caif_assert(phyid != 0); phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); if (phyinfo == NULL) { pr_err("ERROR: Link Layer Device disappeared while connecting\n"); goto unlock; } caif_assert(phyinfo != NULL); caif_assert(phyinfo->id == phyid); caif_assert(phyinfo->phy_layer != NULL); caif_assert(phyinfo->phy_layer->id == phyid); adapt_layer->id = channel_id; switch (serv) { case CFCTRL_SRV_VEI: servicel = cfvei_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_DATAGRAM: servicel = cfdgml_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_RFM: netdev = phyinfo->dev_info.dev; servicel = cfrfml_create(channel_id, &phyinfo->dev_info, netdev->mtu); break; case CFCTRL_SRV_UTIL: servicel = cfutill_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_VIDEO: servicel = cfvidl_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_DBG: servicel = cfdbgl_create(channel_id, &phyinfo->dev_info); break; default: pr_err("Protocol error. Link setup response - unknown channel type\n"); goto unlock; } if (!servicel) goto unlock; layer_set_dn(servicel, cnfg->mux); cfmuxl_set_uplayer(cnfg->mux, servicel, channel_id); layer_set_up(servicel, adapt_layer); layer_set_dn(adapt_layer, servicel); rcu_read_unlock(); servicel->ctrlcmd(servicel, CAIF_CTRLCMD_INIT_RSP, 0); return; unlock: rcu_read_unlock(); } int cfcnfg_add_phy_layer(struct cfcnfg *cnfg, struct net_device *dev, struct cflayer *phy_layer, enum cfcnfg_phy_preference pref, struct cflayer *link_support, bool fcs, int head_room) { struct cflayer *frml; struct cfcnfg_phyinfo *phyinfo = NULL; int i, res = 0; u8 phyid; mutex_lock(&cnfg->lock); /* CAIF protocol allow maximum 6 link-layers */ for (i = 0; i < 7; i++) { phyid = (dev->ifindex + i) & 0x7; if (phyid == 0) continue; if (cfcnfg_get_phyinfo_rcu(cnfg, phyid) == NULL) goto got_phyid; } pr_warn("Too many CAIF Link Layers (max 6)\n"); res = -EEXIST; goto out; got_phyid: phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC); if (!phyinfo) { res = -ENOMEM; goto out; } phy_layer->id = phyid; phyinfo->pref = pref; phyinfo->id = phyid; phyinfo->dev_info.id = phyid; phyinfo->dev_info.dev = dev; phyinfo->phy_layer = phy_layer; phyinfo->ifindex = dev->ifindex; phyinfo->head_room = head_room; phyinfo->use_fcs = fcs; frml = cffrml_create(phyid, fcs); if (!frml) { res = -ENOMEM; goto out_err; } phyinfo->frm_layer = frml; layer_set_up(frml, cnfg->mux); if (link_support != NULL) { link_support->id = phyid; layer_set_dn(frml, link_support); layer_set_up(link_support, frml); layer_set_dn(link_support, phy_layer); layer_set_up(phy_layer, link_support); } else { layer_set_dn(frml, phy_layer); layer_set_up(phy_layer, frml); } list_add_rcu(&phyinfo->node, &cnfg->phys); out: mutex_unlock(&cnfg->lock); return res; out_err: kfree(phyinfo); mutex_unlock(&cnfg->lock); return res; } EXPORT_SYMBOL(cfcnfg_add_phy_layer); int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer, bool up) { struct cfcnfg_phyinfo *phyinfo; rcu_read_lock(); phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phy_layer->id); if (phyinfo == NULL) { rcu_read_unlock(); return -ENODEV; } if (phyinfo->up == up) { rcu_read_unlock(); return 0; } phyinfo->up = up; if (up) { cffrml_hold(phyinfo->frm_layer); cfmuxl_set_dnlayer(cnfg->mux, phyinfo->frm_layer, phy_layer->id); } else { cfmuxl_remove_dnlayer(cnfg->mux, phy_layer->id); cffrml_put(phyinfo->frm_layer); } rcu_read_unlock(); return 0; } EXPORT_SYMBOL(cfcnfg_set_phy_state); int cfcnfg_del_phy_layer(struct cfcnfg *cnfg, struct cflayer *phy_layer) { struct cflayer *frml, *frml_dn; u16 phyid; struct cfcnfg_phyinfo *phyinfo; might_sleep(); mutex_lock(&cnfg->lock); phyid = phy_layer->id; phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); if (phyinfo == NULL) { mutex_unlock(&cnfg->lock); return 0; } caif_assert(phyid == phyinfo->id); caif_assert(phy_layer == phyinfo->phy_layer); caif_assert(phy_layer->id == phyid); caif_assert(phyinfo->frm_layer->id == phyid); list_del_rcu(&phyinfo->node); synchronize_rcu(); /* Fail if reference count is not zero */ if (cffrml_refcnt_read(phyinfo->frm_layer) != 0) { pr_info("Wait for device inuse\n"); list_add_rcu(&phyinfo->node, &cnfg->phys); mutex_unlock(&cnfg->lock); return -EAGAIN; } frml = phyinfo->frm_layer; frml_dn = frml->dn; cffrml_set_uplayer(frml, NULL); cffrml_set_dnlayer(frml, NULL); if (phy_layer != frml_dn) { layer_set_up(frml_dn, NULL); layer_set_dn(frml_dn, NULL); } layer_set_up(phy_layer, NULL); if (phyinfo->phy_layer != frml_dn) kfree(frml_dn); cffrml_free(frml); kfree(phyinfo); mutex_unlock(&cnfg->lock); return 0; } EXPORT_SYMBOL(cfcnfg_del_phy_layer);
5 4 1 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * linux/can/skb.h * * Definitions for the CAN network socket buffer * * Copyright (C) 2012 Oliver Hartkopp <socketcan@hartkopp.net> * */ #ifndef _CAN_SKB_H #define _CAN_SKB_H #include <linux/types.h> #include <linux/skbuff.h> #include <linux/can.h> #include <net/sock.h> void can_flush_echo_skb(struct net_device *dev); int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx, unsigned int frame_len); struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *len_ptr, unsigned int *frame_len_ptr); unsigned int __must_check can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr); void can_free_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr); struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct canfd_frame **cfd); struct sk_buff *alloc_canxl_skb(struct net_device *dev, struct canxl_frame **cxl, unsigned int data_len); struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf); bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb); /* * The struct can_skb_priv is used to transport additional information along * with the stored struct can(fd)_frame that can not be contained in existing * struct sk_buff elements. * N.B. that this information must not be modified in cloned CAN sk_buffs. * To modify the CAN frame content or the struct can_skb_priv content * skb_copy() needs to be used instead of skb_clone(). */ /** * struct can_skb_priv - private additional data inside CAN sk_buffs * @ifindex: ifindex of the first interface the CAN frame appeared on * @skbcnt: atomic counter to have an unique id together with skb pointer * @frame_len: length of CAN frame in data link layer * @cf: align to the following CAN frame at skb->data */ struct can_skb_priv { int ifindex; int skbcnt; unsigned int frame_len; struct can_frame cf[]; }; static inline struct can_skb_priv *can_skb_prv(struct sk_buff *skb) { return (struct can_skb_priv *)(skb->head); } static inline void can_skb_reserve(struct sk_buff *skb) { skb_reserve(skb, sizeof(struct can_skb_priv)); } static inline void can_skb_set_owner(struct sk_buff *skb, struct sock *sk) { /* If the socket has already been closed by user space, the * refcount may already be 0 (and the socket will be freed * after the last TX skb has been freed). So only increase * socket refcount if the refcount is > 0. */ if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) { skb->destructor = sock_efree; skb->sk = sk; } } /* * returns an unshared skb owned by the original sock to be echo'ed back */ static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) { struct sk_buff *nskb; nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { kfree_skb(skb); return NULL; } can_skb_set_owner(nskb, skb->sk); consume_skb(skb); return nskb; } static inline bool can_is_can_skb(const struct sk_buff *skb) { struct can_frame *cf = (struct can_frame *)skb->data; /* the CAN specific type of skb is identified by its data length */ return (skb->len == CAN_MTU && cf->len <= CAN_MAX_DLEN); } static inline bool can_is_canfd_skb(const struct sk_buff *skb) { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; /* the CAN specific type of skb is identified by its data length */ return (skb->len == CANFD_MTU && cfd->len <= CANFD_MAX_DLEN); } static inline bool can_is_canxl_skb(const struct sk_buff *skb) { const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; if (skb->len < CANXL_HDR_SIZE + CANXL_MIN_DLEN || skb->len > CANXL_MTU) return false; /* this also checks valid CAN XL data length boundaries */ if (skb->len != CANXL_HDR_SIZE + cxl->len) return false; return cxl->flags & CANXL_XLF; } /* get length element value from can[|fd|xl]_frame structure */ static inline unsigned int can_skb_get_len_val(struct sk_buff *skb) { const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; const struct canfd_frame *cfd = (struct canfd_frame *)skb->data; if (can_is_canxl_skb(skb)) return cxl->len; return cfd->len; } /* get needed data length inside CAN frame for all frame types (RTR aware) */ static inline unsigned int can_skb_get_data_len(struct sk_buff *skb) { unsigned int len = can_skb_get_len_val(skb); const struct can_frame *cf = (struct can_frame *)skb->data; /* RTR frames have an actual length of zero */ if (can_is_can_skb(skb) && cf->can_id & CAN_RTR_FLAG) return 0; return len; } #endif /* !_CAN_SKB_H */
6 6 6 6 9 9 8 8 4 4 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C)2003,2004 USAGI/WIDE Project * * Authors Mitsuru KANDA <mk@linux-ipv6.org> * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/xfrm.h> static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly; static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly; static struct xfrm6_tunnel __rcu *tunnelmpls6_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); static inline int xfrm6_tunnel_mpls_supported(void) { return IS_ENABLED(CONFIG_MPLS); } int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) { struct xfrm6_tunnel __rcu **pprev; struct xfrm6_tunnel *t; int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel6_mutex); switch (family) { case AF_INET6: pprev = &tunnel6_handlers; break; case AF_INET: pprev = &tunnel46_handlers; break; case AF_MPLS: pprev = &tunnelmpls6_handlers; break; default: goto err; } for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t->priority > priority) break; if (t->priority == priority) goto err; } handler->next = *pprev; rcu_assign_pointer(*pprev, handler); ret = 0; err: mutex_unlock(&tunnel6_mutex); return ret; } EXPORT_SYMBOL(xfrm6_tunnel_register); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) { struct xfrm6_tunnel __rcu **pprev; struct xfrm6_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel6_mutex); switch (family) { case AF_INET6: pprev = &tunnel6_handlers; break; case AF_INET: pprev = &tunnel46_handlers; break; case AF_MPLS: pprev = &tunnelmpls6_handlers; break; default: goto err; } for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t == handler) { *pprev = handler->next; ret = 0; break; } } err: mutex_unlock(&tunnel6_mutex); synchronize_net(); return ret; } EXPORT_SYMBOL(xfrm6_tunnel_deregister); #define for_each_tunnel_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int tunnelmpls6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for_each_tunnel_rcu(tunnelmpls6_handlers, handler) if (!handler->handler(skb)) return 0; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } static int tunnel6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->handler(skb)) return 0; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) static int tunnel6_rcv_cb(struct sk_buff *skb, u8 proto, int err) { struct xfrm6_tunnel __rcu *head; struct xfrm6_tunnel *handler; int ret; head = (proto == IPPROTO_IPV6) ? tunnel6_handlers : tunnel46_handlers; for_each_tunnel_rcu(head, handler) { if (handler->cb_handler) { ret = handler->cb_handler(skb, err); if (ret <= 0) return ret; } } return 0; } static const struct xfrm_input_afinfo tunnel6_input_afinfo = { .family = AF_INET6, .is_ipip = true, .callback = tunnel6_rcv_cb, }; #endif static int tunnel46_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->handler(skb)) return 0; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } static int tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static int tunnelmpls6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnelmpls6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static const struct inet6_protocol tunnel6_protocol = { .handler = tunnel6_rcv, .err_handler = tunnel6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; static const struct inet6_protocol tunnel46_protocol = { .handler = tunnel46_rcv, .err_handler = tunnel46_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; static const struct inet6_protocol tunnelmpls6_protocol = { .handler = tunnelmpls6_rcv, .err_handler = tunnelmpls6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; static int __init tunnel6_init(void) { if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) { pr_err("%s: can't add protocol\n", __func__); return -EAGAIN; } if (inet6_add_protocol(&tunnel46_protocol, IPPROTO_IPIP)) { pr_err("%s: can't add protocol\n", __func__); inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); return -EAGAIN; } if (xfrm6_tunnel_mpls_supported() && inet6_add_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) { pr_err("%s: can't add protocol\n", __func__); inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); return -EAGAIN; } #if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) if (xfrm_input_register_afinfo(&tunnel6_input_afinfo)) { pr_err("%s: can't add input afinfo\n", __func__); inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); if (xfrm6_tunnel_mpls_supported()) inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS); return -EAGAIN; } #endif return 0; } static void __exit tunnel6_fini(void) { #if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) if (xfrm_input_unregister_afinfo(&tunnel6_input_afinfo)) pr_err("%s: can't remove input afinfo\n", __func__); #endif if (inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP)) pr_err("%s: can't remove protocol\n", __func__); if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6)) pr_err("%s: can't remove protocol\n", __func__); if (xfrm6_tunnel_mpls_supported() && inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) pr_err("%s: can't remove protocol\n", __func__); } module_init(tunnel6_init); module_exit(tunnel6_fini); MODULE_DESCRIPTION("IP-in-IPv6 tunnel driver"); MODULE_LICENSE("GPL");
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* SPDX-License-Identifier: GPL-2.0-only */ /* * File: pep.h * * Phonet Pipe End Point sockets definitions * * Copyright (C) 2008 Nokia Corporation. */ #ifndef NET_PHONET_PEP_H #define NET_PHONET_PEP_H #include <linux/skbuff.h> #include <net/phonet/phonet.h> struct pep_sock { struct pn_sock pn_sk; /* XXX: union-ify listening vs connected stuff ? */ /* Listening socket stuff: */ struct hlist_head hlist; /* Connected socket stuff: */ struct sock *listener; struct sk_buff_head ctrlreq_queue; #define PNPIPE_CTRLREQ_MAX 10 atomic_t tx_credits; int ifindex; u16 peer_type; /* peer type/subtype */ u8 pipe_handle; u8 rx_credits; u8 rx_fc; /* RX flow control */ u8 tx_fc; /* TX flow control */ u8 init_enable; /* auto-enable at creation */ u8 aligned; }; static inline struct pep_sock *pep_sk(struct sock *sk) { return (struct pep_sock *)sk; } extern const struct proto_ops phonet_stream_ops; /* Pipe protocol definitions */ struct pnpipehdr { u8 utid; /* transaction ID */ u8 message_id; u8 pipe_handle; union { u8 state_after_connect; /* connect request */ u8 state_after_reset; /* reset request */ u8 error_code; /* any response */ u8 pep_type; /* status indication */ u8 data0; /* anything else */ }; u8 data[]; }; #define other_pep_type data[0] static inline struct pnpipehdr *pnp_hdr(struct sk_buff *skb) { return (struct pnpipehdr *)skb_transport_header(skb); } #define MAX_PNPIPE_HEADER (MAX_PHONET_HEADER + 4) enum { PNS_PIPE_CREATE_REQ = 0x00, PNS_PIPE_CREATE_RESP, PNS_PIPE_REMOVE_REQ, PNS_PIPE_REMOVE_RESP, PNS_PIPE_DATA = 0x20, PNS_PIPE_ALIGNED_DATA, PNS_PEP_CONNECT_REQ = 0x40, PNS_PEP_CONNECT_RESP, PNS_PEP_DISCONNECT_REQ, PNS_PEP_DISCONNECT_RESP, PNS_PEP_RESET_REQ, PNS_PEP_RESET_RESP, PNS_PEP_ENABLE_REQ, PNS_PEP_ENABLE_RESP, PNS_PEP_CTRL_REQ, PNS_PEP_CTRL_RESP, PNS_PEP_DISABLE_REQ = 0x4C, PNS_PEP_DISABLE_RESP, PNS_PEP_STATUS_IND = 0x60, PNS_PIPE_CREATED_IND, PNS_PIPE_RESET_IND = 0x63, PNS_PIPE_ENABLED_IND, PNS_PIPE_REDIRECTED_IND, PNS_PIPE_DISABLED_IND = 0x66, }; #define PN_PIPE_INVALID_HANDLE 0xff #define PN_PEP_TYPE_COMMON 0x00 /* Phonet pipe status indication */ enum { PN_PEP_IND_FLOW_CONTROL, PN_PEP_IND_ID_MCFC_GRANT_CREDITS, }; /* Phonet pipe error codes */ enum { PN_PIPE_NO_ERROR, PN_PIPE_ERR_INVALID_PARAM, PN_PIPE_ERR_INVALID_HANDLE, PN_PIPE_ERR_INVALID_CTRL_ID, PN_PIPE_ERR_NOT_ALLOWED, PN_PIPE_ERR_PEP_IN_USE, PN_PIPE_ERR_OVERLOAD, PN_PIPE_ERR_DEV_DISCONNECTED, PN_PIPE_ERR_TIMEOUT, PN_PIPE_ERR_ALL_PIPES_IN_USE, PN_PIPE_ERR_GENERAL, PN_PIPE_ERR_NOT_SUPPORTED, }; /* Phonet pipe states */ enum { PN_PIPE_DISABLE, PN_PIPE_ENABLE, }; /* Phonet pipe sub-block types */ enum { PN_PIPE_SB_CREATE_REQ_PEP_SUB_TYPE, PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE, PN_PIPE_SB_REDIRECT_REQ_PEP_SUB_TYPE, PN_PIPE_SB_NEGOTIATED_FC, PN_PIPE_SB_REQUIRED_FC_TX, PN_PIPE_SB_PREFERRED_FC_RX, PN_PIPE_SB_ALIGNED_DATA, }; /* Phonet pipe flow control models */ enum { PN_NO_FLOW_CONTROL, PN_LEGACY_FLOW_CONTROL, PN_ONE_CREDIT_FLOW_CONTROL, PN_MULTI_CREDIT_FLOW_CONTROL, PN_MAX_FLOW_CONTROL, }; #define pn_flow_safe(fc) ((fc) >> 1) /* Phonet pipe flow control states */ enum { PEP_IND_EMPTY, PEP_IND_BUSY, PEP_IND_READY, }; #endif
69 69 3 69 86 84 7 8 8 99 99 78 21 78 78 78 78 77 78 78 78 78 78 77 78 78 78 78 5 8 78 78 77 78 69 68 69 24 109 16 16 16 13 16 18 9 18 25 25 25 15 87 87 16 16 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC individual remote procedure call handling * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/module.h> #include <linux/circ_buf.h> #include <linux/spinlock_types.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_UNINITIALISED] = "Uninit ", [RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn", [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK", [RXRPC_CALL_COMPLETE] = "Complete", }; const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { [RXRPC_CALL_SUCCEEDED] = "Complete", [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort", [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort", [RXRPC_CALL_LOCAL_ERROR] = "LocError", [RXRPC_CALL_NETWORK_ERROR] = "NetError", }; struct kmem_cache *rxrpc_call_jar; static DEFINE_SEMAPHORE(rxrpc_call_limiter, 1000); static DEFINE_SEMAPHORE(rxrpc_kernel_call_limiter, 1000); void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) { struct rxrpc_local *local = call->local; bool busy; if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) { spin_lock_irq(&local->lock); busy = !list_empty(&call->attend_link); trace_rxrpc_poke_call(call, busy, what); if (!busy && !rxrpc_try_get_call(call, rxrpc_call_get_poke)) busy = true; if (!busy) { list_add_tail(&call->attend_link, &local->call_attend_q); } spin_unlock_irq(&local->lock); if (!busy) rxrpc_wake_up_io_thread(local); } } static void rxrpc_call_timer_expired(struct timer_list *t) { struct rxrpc_call *call = from_timer(call, t, timer); _enter("%d", call->debug_id); if (!__rxrpc_call_is_complete(call)) { trace_rxrpc_timer_expired(call); rxrpc_poke_call(call, rxrpc_call_poke_timer); } } static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; static void rxrpc_destroy_call(struct work_struct *); /* * find an extant server call * - called in process context with IRQs enabled */ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx, unsigned long user_call_ID) { struct rxrpc_call *call; struct rb_node *p; _enter("%p,%lx", rx, user_call_ID); read_lock(&rx->call_lock); p = rx->calls.rb_node; while (p) { call = rb_entry(p, struct rxrpc_call, sock_node); if (user_call_ID < call->user_call_ID) p = p->rb_left; else if (user_call_ID > call->user_call_ID) p = p->rb_right; else goto found_extant_call; } read_unlock(&rx->call_lock); _leave(" = NULL"); return NULL; found_extant_call: rxrpc_get_call(call, rxrpc_call_get_sendmsg); read_unlock(&rx->call_lock); _leave(" = %p [%d]", call, refcount_read(&call->ref)); return call; } /* * allocate a new call */ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, unsigned int debug_id) { struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); call = kmem_cache_zalloc(rxrpc_call_jar, gfp); if (!call) return NULL; mutex_init(&call->user_mutex); /* Prevent lockdep reporting a deadlock false positive between the afs * filesystem and sys_sendmsg() via the mmap sem. */ if (rx->sk.sk_kern_sock) lockdep_set_class(&call->user_mutex, &rxrpc_call_user_mutex_lock_class_key); timer_setup(&call->timer, rxrpc_call_timer_expired, 0); INIT_WORK(&call->destroyer, rxrpc_destroy_call); INIT_LIST_HEAD(&call->link); INIT_LIST_HEAD(&call->wait_link); INIT_LIST_HEAD(&call->accept_link); INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); INIT_LIST_HEAD(&call->attend_link); skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->recvmsg_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); refcount_set(&call->ref, 1); call->debug_id = debug_id; call->tx_total_len = -1; call->tx_jumbo_max = 1; call->next_rx_timo = 20 * HZ; call->next_req_timo = 1 * HZ; call->ackr_window = 1; call->ackr_wtop = 1; call->delay_ack_at = KTIME_MAX; call->rack_timo_at = KTIME_MAX; call->ping_at = KTIME_MAX; call->keepalive_at = KTIME_MAX; call->expect_rx_by = KTIME_MAX; call->expect_req_by = KTIME_MAX; call->expect_term_by = KTIME_MAX; memset(&call->sock_node, 0xed, sizeof(call->sock_node)); call->rx_winsize = rxrpc_rx_window_size; call->tx_winsize = 16; call->cong_cwnd = RXRPC_MIN_CWND; call->cong_ssthresh = RXRPC_TX_MAX_WINDOW; rxrpc_call_init_rtt(call); call->rxnet = rxnet; call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK; atomic_inc(&rxnet->nr_calls); return call; } /* * Allocate a new client call. */ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct rxrpc_call_params *p, gfp_t gfp, unsigned int debug_id) { struct rxrpc_call *call; ktime_t now; int ret; _enter(""); call = rxrpc_alloc_call(rx, gfp, debug_id); if (!call) return ERR_PTR(-ENOMEM); now = ktime_get_real(); call->acks_latest_ts = now; call->cong_tstamp = now; call->dest_srx = cp->peer->srx; call->dest_srx.srx_service = cp->service_id; call->interruptibility = p->interruptibility; call->tx_total_len = p->tx_total_len; call->key = key_get(cp->key); call->peer = rxrpc_get_peer(cp->peer, rxrpc_peer_get_call); call->local = rxrpc_get_local(cp->local, rxrpc_local_get_call); call->security_level = cp->security_level; if (p->kernel) __set_bit(RXRPC_CALL_KERNEL, &call->flags); if (cp->upgrade) __set_bit(RXRPC_CALL_UPGRADE, &call->flags); if (cp->exclusive) __set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); if (p->timeouts.normal) call->next_rx_timo = umin(p->timeouts.normal, 1); if (p->timeouts.idle) call->next_req_timo = umin(p->timeouts.idle, 1); if (p->timeouts.hard) call->hard_timo = p->timeouts.hard; ret = rxrpc_init_client_call_security(call); if (ret < 0) { rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, ret); rxrpc_put_call(call, rxrpc_call_put_discard_error); return ERR_PTR(ret); } rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_AWAIT_CONN); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), p->user_call_ID, rxrpc_call_new_client); _leave(" = %p", call); return call; } /* * Initiate the call ack/resend/expiry timer. */ void rxrpc_start_call_timer(struct rxrpc_call *call) { if (call->hard_timo) { ktime_t delay = ms_to_ktime(call->hard_timo * 1000); call->expect_term_by = ktime_add(ktime_get_real(), delay); trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); } call->timer.expires = jiffies; } /* * Wait for a call slot to become available. */ static struct semaphore *rxrpc_get_call_slot(struct rxrpc_call_params *p, gfp_t gfp) { struct semaphore *limiter = &rxrpc_call_limiter; if (p->kernel) limiter = &rxrpc_kernel_call_limiter; if (p->interruptibility == RXRPC_UNINTERRUPTIBLE) { down(limiter); return limiter; } return down_interruptible(limiter) < 0 ? NULL : limiter; } /* * Release a call slot. */ static void rxrpc_put_call_slot(struct rxrpc_call *call) { struct semaphore *limiter = &rxrpc_call_limiter; if (test_bit(RXRPC_CALL_KERNEL, &call->flags)) limiter = &rxrpc_kernel_call_limiter; up(limiter); } /* * Start the process of connecting a call. We obtain a peer and a connection * bundle, but the actual association of a call with a connection is offloaded * to the I/O thread to simplify locking. */ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) { struct rxrpc_local *local = call->local; int ret = -ENOMEM; _enter("{%d,%lx},", call->debug_id, call->user_call_ID); ret = rxrpc_look_up_bundle(call, gfp); if (ret < 0) goto error; trace_rxrpc_client(NULL, -1, rxrpc_client_queue_new_call); rxrpc_get_call(call, rxrpc_call_get_io_thread); spin_lock_irq(&local->client_call_lock); list_add_tail(&call->wait_link, &local->new_client_calls); spin_unlock_irq(&local->client_call_lock); rxrpc_wake_up_io_thread(local); return 0; error: __set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); return ret; } /* * Set up a call for the given parameters. * - Called with the socket lock held, which it must release. * - If it returns a call, the call's lock will need releasing by the caller. */ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct rxrpc_call_params *p, gfp_t gfp, unsigned int debug_id) __releases(&rx->sk.sk_lock.slock) __acquires(&call->user_mutex) { struct rxrpc_call *call, *xcall; struct rxrpc_net *rxnet; struct semaphore *limiter; struct rb_node *parent, **pp; int ret; _enter("%p,%lx", rx, p->user_call_ID); if (WARN_ON_ONCE(!cp->peer)) { release_sock(&rx->sk); return ERR_PTR(-EIO); } limiter = rxrpc_get_call_slot(p, gfp); if (!limiter) { release_sock(&rx->sk); return ERR_PTR(-ERESTARTSYS); } call = rxrpc_alloc_client_call(rx, cp, p, gfp, debug_id); if (IS_ERR(call)) { release_sock(&rx->sk); up(limiter); _leave(" = %ld", PTR_ERR(call)); return call; } /* We need to protect a partially set up call against the user as we * will be acting outside the socket lock. */ mutex_lock(&call->user_mutex); /* Publish the call, even though it is incompletely set up as yet */ write_lock(&rx->call_lock); pp = &rx->calls.rb_node; parent = NULL; while (*pp) { parent = *pp; xcall = rb_entry(parent, struct rxrpc_call, sock_node); if (p->user_call_ID < xcall->user_call_ID) pp = &(*pp)->rb_left; else if (p->user_call_ID > xcall->user_call_ID) pp = &(*pp)->rb_right; else goto error_dup_user_ID; } rcu_assign_pointer(call->socket, rx); call->user_call_ID = p->user_call_ID; __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); rxrpc_get_call(call, rxrpc_call_get_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); list_add(&call->sock_link, &rx->sock_calls); write_unlock(&rx->call_lock); rxnet = call->rxnet; spin_lock(&rxnet->call_lock); list_add_tail_rcu(&call->link, &rxnet->calls); spin_unlock(&rxnet->call_lock); /* From this point on, the call is protected by its own lock. */ release_sock(&rx->sk); /* Set up or get a connection record and set the protocol parameters, * including channel number and call ID. */ ret = rxrpc_connect_call(call, gfp); if (ret < 0) goto error_attached_to_socket; _leave(" = %p [new]", call); return call; /* We unexpectedly found the user ID in the list after taking * the call_lock. This shouldn't happen unless the user races * with itself and tries to add the same user ID twice at the * same time in different threads. */ error_dup_user_ID: write_unlock(&rx->call_lock); release_sock(&rx->sk); rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, -EEXIST); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), 0, rxrpc_call_see_userid_exists); mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_userid_exists); _leave(" = -EEXIST"); return ERR_PTR(-EEXIST); /* We got an error, but the call is attached to the socket and is in * need of release. However, we might now race with recvmsg() when it * completion notifies the socket. Return 0 from sys_sendmsg() and * leave the error to recvmsg() to deal with. */ error_attached_to_socket: trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), ret, rxrpc_call_see_connect_failed); rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret); _leave(" = c=%08x [err]", call->debug_id); return call; } /* * Set up an incoming call. call->conn points to the connection. * This is called with interrupts disabled and isn't allowed to fail. */ void rxrpc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_connection *conn = call->conn; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); u32 chan; _enter(",%d", call->conn->debug_id); rcu_assign_pointer(call->socket, rx); call->call_id = sp->hdr.callNumber; call->dest_srx.srx_service = sp->hdr.serviceId; call->cid = sp->hdr.cid; call->cong_tstamp = skb->tstamp; __set_bit(RXRPC_CALL_EXPOSED, &call->flags); rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); spin_lock(&conn->state_lock); switch (conn->state) { case RXRPC_CONN_SERVICE_UNSECURED: case RXRPC_CONN_SERVICE_CHALLENGING: rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); break; case RXRPC_CONN_SERVICE: rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); break; case RXRPC_CONN_ABORTED: rxrpc_set_call_completion(call, conn->completion, conn->abort_code, conn->error); break; default: BUG(); } rxrpc_get_call(call, rxrpc_call_get_io_thread); /* Set the channel for this call. We don't get channel_lock as we're * only defending against the data_ready handler (which we're called * from) and the RESPONSE packet parser (which is only really * interested in call_counter and can cope with a disagreement with the * call pointer). */ chan = sp->hdr.cid & RXRPC_CHANNELMASK; conn->channels[chan].call_counter = call->call_id; conn->channels[chan].call_id = call->call_id; conn->channels[chan].call = call; spin_unlock(&conn->state_lock); spin_lock(&conn->peer->lock); hlist_add_head(&call->error_link, &conn->peer->error_targets); spin_unlock(&conn->peer->lock); rxrpc_start_call_timer(call); _leave(""); } /* * Note the re-emergence of a call. */ void rxrpc_see_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { if (call) { int r = refcount_read(&call->ref); trace_rxrpc_call(call->debug_id, r, 0, why); } } struct rxrpc_call *rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { int r; if (!call || !__refcount_inc_not_zero(&call->ref, &r)) return NULL; trace_rxrpc_call(call->debug_id, r + 1, 0, why); return call; } /* * Note the addition of a ref on a call. */ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { int r; __refcount_inc(&call->ref, &r); trace_rxrpc_call(call->debug_id, r + 1, 0, why); } /* * Clean up the transmission buffers. */ static void rxrpc_cleanup_tx_buffers(struct rxrpc_call *call) { struct rxrpc_txqueue *tq, *next; for (tq = call->tx_queue; tq; tq = next) { next = tq->next; for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) if (tq->bufs[i]) rxrpc_put_txbuf(tq->bufs[i], rxrpc_txbuf_put_cleaned); trace_rxrpc_tq(call, tq, 0, rxrpc_tq_cleaned); kfree(tq); } } /* * Clean up the receive buffers. */ static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call) { rxrpc_purge_queue(&call->recvmsg_queue); rxrpc_purge_queue(&call->rx_queue); rxrpc_purge_queue(&call->rx_oos_queue); } /* * Detach a call from its owning socket. */ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; bool put = false, putu = false; _enter("{%d,%d}", call->debug_id, refcount_read(&call->ref)); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), call->flags, rxrpc_call_see_release); if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); rxrpc_put_call_slot(call); /* Make sure we don't get any more notifications */ spin_lock_irq(&rx->recvmsg_lock); if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", call, call->events, call->flags); list_del(&call->recvmsg_link); put = true; } /* list_empty() must return false in rxrpc_notify_socket() */ call->recvmsg_link.next = NULL; call->recvmsg_link.prev = NULL; spin_unlock_irq(&rx->recvmsg_lock); if (put) rxrpc_put_call(call, rxrpc_call_put_unnotify); write_lock(&rx->call_lock); if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { rb_erase(&call->sock_node, &rx->calls); memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); putu = true; } list_del(&call->sock_link); write_unlock(&rx->call_lock); _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); if (putu) rxrpc_put_call(call, rxrpc_call_put_userid); _leave(""); } /* * release all the calls associated with a socket */ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) { struct rxrpc_call *call; _enter("%p", rx); while (!list_empty(&rx->to_be_accepted)) { call = list_entry(rx->to_be_accepted.next, struct rxrpc_call, accept_link); list_del(&call->accept_link); rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, rxrpc_abort_call_sock_release_tba); rxrpc_put_call(call, rxrpc_call_put_release_sock_tba); } while (!list_empty(&rx->sock_calls)) { call = list_entry(rx->sock_calls.next, struct rxrpc_call, sock_link); rxrpc_get_call(call, rxrpc_call_get_release_sock); rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, rxrpc_abort_call_sock_release); rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put_release_sock); } _leave(""); } /* * release a call */ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { struct rxrpc_net *rxnet = call->rxnet; unsigned int debug_id = call->debug_id; bool dead; int r; ASSERT(call != NULL); dead = __refcount_dec_and_test(&call->ref, &r); trace_rxrpc_call(debug_id, r - 1, 0, why); if (dead) { ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE); if (!list_empty(&call->link)) { spin_lock(&rxnet->call_lock); list_del_init(&call->link); spin_unlock(&rxnet->call_lock); } rxrpc_cleanup_call(call); } } /* * Free up the call under RCU. */ static void rxrpc_rcu_free_call(struct rcu_head *rcu) { struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); struct rxrpc_net *rxnet = READ_ONCE(call->rxnet); kmem_cache_free(rxrpc_call_jar, call); if (atomic_dec_and_test(&rxnet->nr_calls)) wake_up_var(&rxnet->nr_calls); } /* * Final call destruction - but must be done in process context. */ static void rxrpc_destroy_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer); del_timer_sync(&call->timer); rxrpc_cleanup_tx_buffers(call); rxrpc_cleanup_rx_buffers(call); rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); rxrpc_put_connection(call->conn, rxrpc_conn_put_call); rxrpc_deactivate_bundle(call->bundle); rxrpc_put_bundle(call->bundle, rxrpc_bundle_put_call); rxrpc_put_peer(call->peer, rxrpc_peer_put_call); rxrpc_put_local(call->local, rxrpc_local_put_call); call_rcu(&call->rcu, rxrpc_rcu_free_call); } /* * clean up a call */ void rxrpc_cleanup_call(struct rxrpc_call *call) { memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); del_timer(&call->timer); if (rcu_read_lock_held()) /* Can't use the rxrpc workqueue as we need to cancel/flush * something that may be running/waiting there. */ schedule_work(&call->destroyer); else rxrpc_destroy_call(&call->destroyer); } /* * Make sure that all calls are gone from a network namespace. To reach this * point, any open UDP sockets in that namespace must have been closed, so any * outstanding calls cannot be doing I/O. */ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) { struct rxrpc_call *call; _enter(""); if (!list_empty(&rxnet->calls)) { spin_lock(&rxnet->call_lock); while (!list_empty(&rxnet->calls)) { call = list_entry(rxnet->calls.next, struct rxrpc_call, link); _debug("Zapping call %p", call); rxrpc_see_call(call, rxrpc_call_see_zap); list_del_init(&call->link); pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", call, refcount_read(&call->ref), rxrpc_call_states[__rxrpc_call_state(call)], call->flags, call->events); spin_unlock(&rxnet->call_lock); cond_resched(); spin_lock(&rxnet->call_lock); } spin_unlock(&rxnet->call_lock); } atomic_dec(&rxnet->nr_calls); wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls)); }
6 6 6 6 130 129 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth HCI sockets. */ #include <linux/compat.h> #include <linux/export.h> #include <linux/utsname.h> #include <linux/sched.h> #include <linux/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/hci_mon.h> #include <net/bluetooth/mgmt.h> #include "mgmt_util.h" static LIST_HEAD(mgmt_chan_list); static DEFINE_MUTEX(mgmt_chan_list_lock); static DEFINE_IDA(sock_cookie_ida); static atomic_t monitor_promisc = ATOMIC_INIT(0); /* ----- HCI socket interface ----- */ /* Socket info */ #define hci_pi(sk) ((struct hci_pinfo *) sk) struct hci_pinfo { struct bt_sock bt; struct hci_dev *hdev; struct hci_filter filter; __u8 cmsg_mask; unsigned short channel; unsigned long flags; __u32 cookie; char comm[TASK_COMM_LEN]; __u16 mtu; }; static struct hci_dev *hci_hdev_from_sock(struct sock *sk) { struct hci_dev *hdev = hci_pi(sk)->hdev; if (!hdev) return ERR_PTR(-EBADFD); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) return ERR_PTR(-EPIPE); return hdev; } void hci_sock_set_flag(struct sock *sk, int nr) { set_bit(nr, &hci_pi(sk)->flags); } void hci_sock_clear_flag(struct sock *sk, int nr) { clear_bit(nr, &hci_pi(sk)->flags); } int hci_sock_test_flag(struct sock *sk, int nr) { return test_bit(nr, &hci_pi(sk)->flags); } unsigned short hci_sock_get_channel(struct sock *sk) { return hci_pi(sk)->channel; } u32 hci_sock_get_cookie(struct sock *sk) { return hci_pi(sk)->cookie; } static bool hci_sock_gen_cookie(struct sock *sk) { int id = hci_pi(sk)->cookie; if (!id) { id = ida_alloc_min(&sock_cookie_ida, 1, GFP_KERNEL); if (id < 0) id = 0xffffffff; hci_pi(sk)->cookie = id; get_task_comm(hci_pi(sk)->comm, current); return true; } return false; } static void hci_sock_free_cookie(struct sock *sk) { int id = hci_pi(sk)->cookie; if (id) { hci_pi(sk)->cookie = 0xffffffff; ida_free(&sock_cookie_ida, id); } } static inline int hci_test_bit(int nr, const void *addr) { return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); } /* Security filter */ #define HCI_SFLT_MAX_OGF 5 struct hci_sec_filter { __u32 type_mask; __u32 event_mask[2]; __u32 ocf_mask[HCI_SFLT_MAX_OGF + 1][4]; }; static const struct hci_sec_filter hci_sec_filter = { /* Packet types */ 0x10, /* Events */ { 0x1000d9fe, 0x0000b00c }, /* Commands */ { { 0x0 }, /* OGF_LINK_CTL */ { 0xbe000006, 0x00000001, 0x00000000, 0x00 }, /* OGF_LINK_POLICY */ { 0x00005200, 0x00000000, 0x00000000, 0x00 }, /* OGF_HOST_CTL */ { 0xaab00200, 0x2b402aaa, 0x05220154, 0x00 }, /* OGF_INFO_PARAM */ { 0x000002be, 0x00000000, 0x00000000, 0x00 }, /* OGF_STATUS_PARAM */ { 0x000000ea, 0x00000000, 0x00000000, 0x00 } } }; static struct bt_sock_list hci_sk_list = { .lock = __RW_LOCK_UNLOCKED(hci_sk_list.lock) }; static bool is_filtered_packet(struct sock *sk, struct sk_buff *skb) { struct hci_filter *flt; int flt_type, flt_event; /* Apply filter */ flt = &hci_pi(sk)->filter; flt_type = hci_skb_pkt_type(skb) & HCI_FLT_TYPE_BITS; if (!test_bit(flt_type, &flt->type_mask)) return true; /* Extra filter for event packets only */ if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT) return false; flt_event = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS); if (!hci_test_bit(flt_event, &flt->event_mask)) return true; /* Check filter only when opcode is set */ if (!flt->opcode) return false; if (flt_event == HCI_EV_CMD_COMPLETE && flt->opcode != get_unaligned((__le16 *)(skb->data + 3))) return true; if (flt_event == HCI_EV_CMD_STATUS && flt->opcode != get_unaligned((__le16 *)(skb->data + 4))) return true; return false; } /* Send frame to RAW socket */ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) { struct sock *sk; struct sk_buff *skb_copy = NULL; BT_DBG("hdev %p len %d", hdev, skb->len); read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *nskb; if (sk->sk_state != BT_BOUND || hci_pi(sk)->hdev != hdev) continue; /* Don't send frame to the socket it came from */ if (skb->sk == sk) continue; if (hci_pi(sk)->channel == HCI_CHANNEL_RAW) { if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) continue; if (is_filtered_packet(sk, skb)) continue; } else if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { if (!bt_cb(skb)->incoming) continue; if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) continue; } else { /* Don't send frame to other channel types */ continue; } if (!skb_copy) { /* Create a private copy with headroom */ skb_copy = __pskb_copy_fclone(skb, 1, GFP_ATOMIC, true); if (!skb_copy) continue; /* Put type byte before the data */ memcpy(skb_push(skb_copy, 1), &hci_skb_pkt_type(skb), 1); } nskb = skb_clone(skb_copy, GFP_ATOMIC); if (!nskb) continue; if (sock_queue_rcv_skb(sk, nskb)) kfree_skb(nskb); } read_unlock(&hci_sk_list.lock); kfree_skb(skb_copy); } static void hci_sock_copy_creds(struct sock *sk, struct sk_buff *skb) { struct scm_creds *creds; if (!sk || WARN_ON(!skb)) return; creds = &bt_cb(skb)->creds; /* Check if peer credentials is set */ if (!sk->sk_peer_pid) { /* Check if parent peer credentials is set */ if (bt_sk(sk)->parent && bt_sk(sk)->parent->sk_peer_pid) sk = bt_sk(sk)->parent; else return; } /* Check if scm_creds already set */ if (creds->pid == pid_vnr(sk->sk_peer_pid)) return; memset(creds, 0, sizeof(*creds)); creds->pid = pid_vnr(sk->sk_peer_pid); if (sk->sk_peer_cred) { creds->uid = sk->sk_peer_cred->uid; creds->gid = sk->sk_peer_cred->gid; } } static struct sk_buff *hci_skb_clone(struct sk_buff *skb) { struct sk_buff *nskb; if (!skb) return NULL; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return NULL; hci_sock_copy_creds(skb->sk, nskb); return nskb; } /* Send frame to sockets with specific channel */ static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk) { struct sock *sk; BT_DBG("channel %u len %d", channel, skb->len); sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *nskb; /* Ignore socket without the flag set */ if (!hci_sock_test_flag(sk, flag)) continue; /* Skip the original socket */ if (sk == skip_sk) continue; if (sk->sk_state != BT_BOUND) continue; if (hci_pi(sk)->channel != channel) continue; nskb = hci_skb_clone(skb); if (!nskb) continue; if (sock_queue_rcv_skb(sk, nskb)) kfree_skb(nskb); } } void hci_send_to_channel(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk) { read_lock(&hci_sk_list.lock); __hci_send_to_channel(channel, skb, flag, skip_sk); read_unlock(&hci_sk_list.lock); } /* Send frame to monitor socket */ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) { struct sk_buff *skb_copy = NULL; struct hci_mon_hdr *hdr; __le16 opcode; if (!atomic_read(&monitor_promisc)) return; BT_DBG("hdev %p len %d", hdev, skb->len); switch (hci_skb_pkt_type(skb)) { case HCI_COMMAND_PKT: opcode = cpu_to_le16(HCI_MON_COMMAND_PKT); break; case HCI_EVENT_PKT: opcode = cpu_to_le16(HCI_MON_EVENT_PKT); break; case HCI_ACLDATA_PKT: if (bt_cb(skb)->incoming) opcode = cpu_to_le16(HCI_MON_ACL_RX_PKT); else opcode = cpu_to_le16(HCI_MON_ACL_TX_PKT); break; case HCI_SCODATA_PKT: if (bt_cb(skb)->incoming) opcode = cpu_to_le16(HCI_MON_SCO_RX_PKT); else opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT); break; case HCI_ISODATA_PKT: if (bt_cb(skb)->incoming) opcode = cpu_to_le16(HCI_MON_ISO_RX_PKT); else opcode = cpu_to_le16(HCI_MON_ISO_TX_PKT); break; case HCI_DIAG_PKT: opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG); break; default: return; } /* Create a private copy with headroom */ skb_copy = __pskb_copy_fclone(skb, HCI_MON_HDR_SIZE, GFP_ATOMIC, true); if (!skb_copy) return; hci_sock_copy_creds(skb->sk, skb_copy); /* Put header before the data */ hdr = skb_push(skb_copy, HCI_MON_HDR_SIZE); hdr->opcode = opcode; hdr->index = cpu_to_le16(hdev->id); hdr->len = cpu_to_le16(skb->len); hci_send_to_channel(HCI_CHANNEL_MONITOR, skb_copy, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb_copy); } void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event, void *data, u16 data_len, ktime_t tstamp, int flag, struct sock *skip_sk) { struct sock *sk; __le16 index; if (hdev) index = cpu_to_le16(hdev->id); else index = cpu_to_le16(MGMT_INDEX_NONE); read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { struct hci_mon_hdr *hdr; struct sk_buff *skb; if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) continue; /* Ignore socket without the flag set */ if (!hci_sock_test_flag(sk, flag)) continue; /* Skip the original socket */ if (sk == skip_sk) continue; skb = bt_skb_alloc(6 + data_len, GFP_ATOMIC); if (!skb) continue; put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); put_unaligned_le16(event, skb_put(skb, 2)); if (data) skb_put_data(skb, data, data_len); skb->tstamp = tstamp; hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); hdr->index = index; hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } read_unlock(&hci_sk_list.lock); } static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) { struct hci_mon_hdr *hdr; struct hci_mon_new_index *ni; struct hci_mon_index_info *ii; struct sk_buff *skb; __le16 opcode; switch (event) { case HCI_DEV_REG: skb = bt_skb_alloc(HCI_MON_NEW_INDEX_SIZE, GFP_ATOMIC); if (!skb) return NULL; ni = skb_put(skb, HCI_MON_NEW_INDEX_SIZE); ni->type = 0x00; /* Old hdev->dev_type */ ni->bus = hdev->bus; bacpy(&ni->bdaddr, &hdev->bdaddr); memcpy_and_pad(ni->name, sizeof(ni->name), hdev->name, strnlen(hdev->name, sizeof(ni->name)), '\0'); opcode = cpu_to_le16(HCI_MON_NEW_INDEX); break; case HCI_DEV_UNREG: skb = bt_skb_alloc(0, GFP_ATOMIC); if (!skb) return NULL; opcode = cpu_to_le16(HCI_MON_DEL_INDEX); break; case HCI_DEV_SETUP: if (hdev->manufacturer == 0xffff) return NULL; fallthrough; case HCI_DEV_UP: skb = bt_skb_alloc(HCI_MON_INDEX_INFO_SIZE, GFP_ATOMIC); if (!skb) return NULL; ii = skb_put(skb, HCI_MON_INDEX_INFO_SIZE); bacpy(&ii->bdaddr, &hdev->bdaddr); ii->manufacturer = cpu_to_le16(hdev->manufacturer); opcode = cpu_to_le16(HCI_MON_INDEX_INFO); break; case HCI_DEV_OPEN: skb = bt_skb_alloc(0, GFP_ATOMIC); if (!skb) return NULL; opcode = cpu_to_le16(HCI_MON_OPEN_INDEX); break; case HCI_DEV_CLOSE: skb = bt_skb_alloc(0, GFP_ATOMIC); if (!skb) return NULL; opcode = cpu_to_le16(HCI_MON_CLOSE_INDEX); break; default: return NULL; } __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = opcode; hdr->index = cpu_to_le16(hdev->id); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) { struct hci_mon_hdr *hdr; struct sk_buff *skb; u16 format; u8 ver[3]; u32 flags; /* No message needed when cookie is not present */ if (!hci_pi(sk)->cookie) return NULL; switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: format = 0x0000; ver[0] = BT_SUBSYS_VERSION; put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); break; case HCI_CHANNEL_USER: format = 0x0001; ver[0] = BT_SUBSYS_VERSION; put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); break; case HCI_CHANNEL_CONTROL: format = 0x0002; mgmt_fill_version_info(ver); break; default: /* No message for unsupported format */ return NULL; } skb = bt_skb_alloc(14 + TASK_COMM_LEN, GFP_ATOMIC); if (!skb) return NULL; hci_sock_copy_creds(sk, skb); flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0; put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); put_unaligned_le16(format, skb_put(skb, 2)); skb_put_data(skb, ver, sizeof(ver)); put_unaligned_le32(flags, skb_put(skb, 4)); skb_put_u8(skb, TASK_COMM_LEN); skb_put_data(skb, hci_pi(sk)->comm, TASK_COMM_LEN); __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN); if (hci_pi(sk)->hdev) hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id); else hdr->index = cpu_to_le16(HCI_DEV_NONE); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) { struct hci_mon_hdr *hdr; struct sk_buff *skb; /* No message needed when cookie is not present */ if (!hci_pi(sk)->cookie) return NULL; switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: case HCI_CHANNEL_USER: case HCI_CHANNEL_CONTROL: break; default: /* No message for unsupported format */ return NULL; } skb = bt_skb_alloc(4, GFP_ATOMIC); if (!skb) return NULL; hci_sock_copy_creds(sk, skb); put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE); if (hci_pi(sk)->hdev) hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id); else hdr->index = cpu_to_le16(HCI_DEV_NONE); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index, u16 opcode, u16 len, const void *buf) { struct hci_mon_hdr *hdr; struct sk_buff *skb; skb = bt_skb_alloc(6 + len, GFP_ATOMIC); if (!skb) return NULL; hci_sock_copy_creds(sk, skb); put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); put_unaligned_le16(opcode, skb_put(skb, 2)); if (buf) skb_put_data(skb, buf, len); __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_COMMAND); hdr->index = cpu_to_le16(index); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } static void __printf(2, 3) send_monitor_note(struct sock *sk, const char *fmt, ...) { size_t len; struct hci_mon_hdr *hdr; struct sk_buff *skb; va_list args; va_start(args, fmt); len = vsnprintf(NULL, 0, fmt, args); va_end(args); skb = bt_skb_alloc(len + 1, GFP_ATOMIC); if (!skb) return; hci_sock_copy_creds(sk, skb); va_start(args, fmt); vsprintf(skb_put(skb, len), fmt, args); *(u8 *)skb_put(skb, 1) = 0; va_end(args); __net_timestamp(skb); hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_SYSTEM_NOTE); hdr->index = cpu_to_le16(HCI_DEV_NONE); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); } static void send_monitor_replay(struct sock *sk) { struct hci_dev *hdev; read_lock(&hci_dev_list_lock); list_for_each_entry(hdev, &hci_dev_list, list) { struct sk_buff *skb; skb = create_monitor_event(hdev, HCI_DEV_REG); if (!skb) continue; if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); if (!test_bit(HCI_RUNNING, &hdev->flags)) continue; skb = create_monitor_event(hdev, HCI_DEV_OPEN); if (!skb) continue; if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); if (test_bit(HCI_UP, &hdev->flags)) skb = create_monitor_event(hdev, HCI_DEV_UP); else if (hci_dev_test_flag(hdev, HCI_SETUP)) skb = create_monitor_event(hdev, HCI_DEV_SETUP); else skb = NULL; if (skb) { if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); } } read_unlock(&hci_dev_list_lock); } static void send_monitor_control_replay(struct sock *mon_sk) { struct sock *sk; read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *skb; skb = create_monitor_ctrl_open(sk); if (!skb) continue; if (sock_queue_rcv_skb(mon_sk, skb)) kfree_skb(skb); } read_unlock(&hci_sk_list.lock); } /* Generate internal stack event */ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) { struct hci_event_hdr *hdr; struct hci_ev_stack_internal *ev; struct sk_buff *skb; skb = bt_skb_alloc(HCI_EVENT_HDR_SIZE + sizeof(*ev) + dlen, GFP_ATOMIC); if (!skb) return; hdr = skb_put(skb, HCI_EVENT_HDR_SIZE); hdr->evt = HCI_EV_STACK_INTERNAL; hdr->plen = sizeof(*ev) + dlen; ev = skb_put(skb, sizeof(*ev) + dlen); ev->type = type; memcpy(ev->data, data, dlen); bt_cb(skb)->incoming = 1; __net_timestamp(skb); hci_skb_pkt_type(skb) = HCI_EVENT_PKT; hci_send_to_sock(hdev, skb); kfree_skb(skb); } void hci_sock_dev_event(struct hci_dev *hdev, int event) { BT_DBG("hdev %s event %d", hdev->name, event); if (atomic_read(&monitor_promisc)) { struct sk_buff *skb; /* Send event to monitor */ skb = create_monitor_event(hdev, event); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } if (event <= HCI_DEV_DOWN) { struct hci_ev_si_device ev; /* Send event to sockets */ ev.event = event; ev.dev_id = hdev->id; hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev); } if (event == HCI_DEV_UNREG) { struct sock *sk; /* Wake up sockets using this dead device */ read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { if (hci_pi(sk)->hdev == hdev) { sk->sk_err = EPIPE; sk->sk_state_change(sk); } } read_unlock(&hci_sk_list.lock); } } static struct hci_mgmt_chan *__hci_mgmt_chan_find(unsigned short channel) { struct hci_mgmt_chan *c; list_for_each_entry(c, &mgmt_chan_list, list) { if (c->channel == channel) return c; } return NULL; } static struct hci_mgmt_chan *hci_mgmt_chan_find(unsigned short channel) { struct hci_mgmt_chan *c; mutex_lock(&mgmt_chan_list_lock); c = __hci_mgmt_chan_find(channel); mutex_unlock(&mgmt_chan_list_lock); return c; } int hci_mgmt_chan_register(struct hci_mgmt_chan *c) { if (c->channel < HCI_CHANNEL_CONTROL) return -EINVAL; mutex_lock(&mgmt_chan_list_lock); if (__hci_mgmt_chan_find(c->channel)) { mutex_unlock(&mgmt_chan_list_lock); return -EALREADY; } list_add_tail(&c->list, &mgmt_chan_list); mutex_unlock(&mgmt_chan_list_lock); return 0; } EXPORT_SYMBOL(hci_mgmt_chan_register); void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c) { mutex_lock(&mgmt_chan_list_lock); list_del(&c->list); mutex_unlock(&mgmt_chan_list_lock); } EXPORT_SYMBOL(hci_mgmt_chan_unregister); static int hci_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct hci_dev *hdev; struct sk_buff *skb; BT_DBG("sock %p sk %p", sock, sk); if (!sk) return 0; lock_sock(sk); switch (hci_pi(sk)->channel) { case HCI_CHANNEL_MONITOR: atomic_dec(&monitor_promisc); break; case HCI_CHANNEL_RAW: case HCI_CHANNEL_USER: case HCI_CHANNEL_CONTROL: /* Send event to monitor */ skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } hci_sock_free_cookie(sk); break; } bt_sock_unlink(&hci_sk_list, sk); hdev = hci_pi(sk)->hdev; if (hdev) { if (hci_pi(sk)->channel == HCI_CHANNEL_USER && !hci_dev_test_flag(hdev, HCI_UNREGISTER)) { /* When releasing a user channel exclusive access, * call hci_dev_do_close directly instead of calling * hci_dev_close to ensure the exclusive access will * be released and the controller brought back down. * * The checking of HCI_AUTO_OFF is not needed in this * case since it will have been cleared already when * opening the user channel. * * Make sure to also check that we haven't already * unregistered since all the cleanup will have already * been complete and hdev will get released when we put * below. */ hci_dev_do_close(hdev); hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); mgmt_index_added(hdev); } atomic_dec(&hdev->promisc); hci_dev_put(hdev); } sock_orphan(sk); release_sock(sk); sock_put(sk); return 0; } static int hci_sock_reject_list_add(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; int err; if (copy_from_user(&bdaddr, arg, sizeof(bdaddr))) return -EFAULT; hci_dev_lock(hdev); err = hci_bdaddr_list_add(&hdev->reject_list, &bdaddr, BDADDR_BREDR); hci_dev_unlock(hdev); return err; } static int hci_sock_reject_list_del(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; int err; if (copy_from_user(&bdaddr, arg, sizeof(bdaddr))) return -EFAULT; hci_dev_lock(hdev); err = hci_bdaddr_list_del(&hdev->reject_list, &bdaddr, BDADDR_BREDR); hci_dev_unlock(hdev); return err; } /* Ioctls that require bound socket */ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { struct hci_dev *hdev = hci_hdev_from_sock(sk); if (IS_ERR(hdev)) return PTR_ERR(hdev); if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return -EOPNOTSUPP; switch (cmd) { case HCISETRAW: if (!capable(CAP_NET_ADMIN)) return -EPERM; return -EOPNOTSUPP; case HCIGETCONNINFO: return hci_get_conn_info(hdev, (void __user *)arg); case HCIGETAUTHINFO: return hci_get_auth_info(hdev, (void __user *)arg); case HCIBLOCKADDR: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_sock_reject_list_add(hdev, (void __user *)arg); case HCIUNBLOCKADDR: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_sock_reject_list_del(hdev, (void __user *)arg); } return -ENOIOCTLCMD; } static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct sock *sk = sock->sk; int err; BT_DBG("cmd %x arg %lx", cmd, arg); /* Make sure the cmd is valid before doing anything */ switch (cmd) { case HCIGETDEVLIST: case HCIGETDEVINFO: case HCIGETCONNLIST: case HCIDEVUP: case HCIDEVDOWN: case HCIDEVRESET: case HCIDEVRESTAT: case HCISETSCAN: case HCISETAUTH: case HCISETENCRYPT: case HCISETPTYPE: case HCISETLINKPOL: case HCISETLINKMODE: case HCISETACLMTU: case HCISETSCOMTU: case HCIINQUIRY: case HCISETRAW: case HCIGETCONNINFO: case HCIGETAUTHINFO: case HCIBLOCKADDR: case HCIUNBLOCKADDR: break; default: return -ENOIOCTLCMD; } lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { err = -EBADFD; goto done; } /* When calling an ioctl on an unbound raw socket, then ensure * that the monitor gets informed. Ensure that the resulting event * is only send once by checking if the cookie exists or not. The * socket cookie will be only ever generated once for the lifetime * of a given socket. */ if (hci_sock_gen_cookie(sk)) { struct sk_buff *skb; /* Perform careful checks before setting the HCI_SOCK_TRUSTED * flag. Make sure that not only the current task but also * the socket opener has the required capability, since * privileged programs can be tricked into making ioctl calls * on HCI sockets, and the socket should not be marked as * trusted simply because the ioctl caller is privileged. */ if (sk_capable(sk, CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } release_sock(sk); switch (cmd) { case HCIGETDEVLIST: return hci_get_dev_list(argp); case HCIGETDEVINFO: return hci_get_dev_info(argp); case HCIGETCONNLIST: return hci_get_conn_list(argp); case HCIDEVUP: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_open(arg); case HCIDEVDOWN: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_close(arg); case HCIDEVRESET: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_reset(arg); case HCIDEVRESTAT: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_reset_stat(arg); case HCISETSCAN: case HCISETAUTH: case HCISETENCRYPT: case HCISETPTYPE: case HCISETLINKPOL: case HCISETLINKMODE: case HCISETACLMTU: case HCISETSCOMTU: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_cmd(cmd, argp); case HCIINQUIRY: return hci_inquiry(argp); } lock_sock(sk); err = hci_sock_bound_ioctl(sk, cmd, arg); done: release_sock(sk); return err; } #ifdef CONFIG_COMPAT static int hci_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { switch (cmd) { case HCIDEVUP: case HCIDEVDOWN: case HCIDEVRESET: case HCIDEVRESTAT: return hci_sock_ioctl(sock, cmd, arg); } return hci_sock_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); } #endif static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_hci haddr; struct sock *sk = sock->sk; struct hci_dev *hdev = NULL; struct sk_buff *skb; int len, err = 0; BT_DBG("sock %p sk %p", sock, sk); if (!addr) return -EINVAL; memset(&haddr, 0, sizeof(haddr)); len = min_t(unsigned int, sizeof(haddr), addr_len); memcpy(&haddr, addr, len); if (haddr.hci_family != AF_BLUETOOTH) return -EINVAL; lock_sock(sk); /* Allow detaching from dead device and attaching to alive device, if * the caller wants to re-bind (instead of close) this socket in * response to hci_sock_dev_event(HCI_DEV_UNREG) notification. */ hdev = hci_pi(sk)->hdev; if (hdev && hci_dev_test_flag(hdev, HCI_UNREGISTER)) { hci_pi(sk)->hdev = NULL; sk->sk_state = BT_OPEN; hci_dev_put(hdev); } hdev = NULL; if (sk->sk_state == BT_BOUND) { err = -EALREADY; goto done; } switch (haddr.hci_channel) { case HCI_CHANNEL_RAW: if (hci_pi(sk)->hdev) { err = -EALREADY; goto done; } if (haddr.hci_dev != HCI_DEV_NONE) { hdev = hci_dev_get(haddr.hci_dev); if (!hdev) { err = -ENODEV; goto done; } atomic_inc(&hdev->promisc); } hci_pi(sk)->channel = haddr.hci_channel; if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been assigned, * then there has been already an ioctl issued against * an unbound socket and with that triggered an open * notification. Send a close notification first to * allow the state transition to bounded. */ skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } if (capable(CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); hci_pi(sk)->hdev = hdev; /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } break; case HCI_CHANNEL_USER: if (hci_pi(sk)->hdev) { err = -EALREADY; goto done; } if (haddr.hci_dev == HCI_DEV_NONE) { err = -EINVAL; goto done; } if (!capable(CAP_NET_ADMIN)) { err = -EPERM; goto done; } hdev = hci_dev_get(haddr.hci_dev); if (!hdev) { err = -ENODEV; goto done; } if (test_bit(HCI_INIT, &hdev->flags) || hci_dev_test_flag(hdev, HCI_SETUP) || hci_dev_test_flag(hdev, HCI_CONFIG) || (!hci_dev_test_flag(hdev, HCI_AUTO_OFF) && test_bit(HCI_UP, &hdev->flags))) { err = -EBUSY; hci_dev_put(hdev); goto done; } if (hci_dev_test_and_set_flag(hdev, HCI_USER_CHANNEL)) { err = -EUSERS; hci_dev_put(hdev); goto done; } mgmt_index_removed(hdev); err = hci_dev_open(hdev->id); if (err) { if (err == -EALREADY) { /* In case the transport is already up and * running, clear the error here. * * This can happen when opening a user * channel and HCI_AUTO_OFF grace period * is still active. */ err = 0; } else { hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); mgmt_index_added(hdev); hci_dev_put(hdev); goto done; } } hci_pi(sk)->channel = haddr.hci_channel; if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been assigned, * this socket will transition from a raw socket into * a user channel socket. For a clean transition, send * the close notification first. */ skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } /* The user channel is restricted to CAP_NET_ADMIN * capabilities and with that implicitly trusted. */ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); hci_pi(sk)->hdev = hdev; /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } atomic_inc(&hdev->promisc); break; case HCI_CHANNEL_MONITOR: if (haddr.hci_dev != HCI_DEV_NONE) { err = -EINVAL; goto done; } if (!capable(CAP_NET_RAW)) { err = -EPERM; goto done; } hci_pi(sk)->channel = haddr.hci_channel; /* The monitor interface is restricted to CAP_NET_RAW * capabilities and with that implicitly trusted. */ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); send_monitor_note(sk, "Linux version %s (%s)", init_utsname()->release, init_utsname()->machine); send_monitor_note(sk, "Bluetooth subsystem version %u.%u", BT_SUBSYS_VERSION, BT_SUBSYS_REVISION); send_monitor_replay(sk); send_monitor_control_replay(sk); atomic_inc(&monitor_promisc); break; case HCI_CHANNEL_LOGGING: if (haddr.hci_dev != HCI_DEV_NONE) { err = -EINVAL; goto done; } if (!capable(CAP_NET_ADMIN)) { err = -EPERM; goto done; } hci_pi(sk)->channel = haddr.hci_channel; break; default: if (!hci_mgmt_chan_find(haddr.hci_channel)) { err = -EINVAL; goto done; } if (haddr.hci_dev != HCI_DEV_NONE) { err = -EINVAL; goto done; } /* Users with CAP_NET_ADMIN capabilities are allowed * access to all management commands and events. For * untrusted users the interface is restricted and * also only untrusted events are sent. */ if (capable(CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); hci_pi(sk)->channel = haddr.hci_channel; /* At the moment the index and unconfigured index events * are enabled unconditionally. Setting them on each * socket when binding keeps this functionality. They * however might be cleared later and then sending of these * events will be disabled, but that is then intentional. * * This also enables generic events that are safe to be * received by untrusted users. Example for such events * are changes to settings, class of device, name etc. */ if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) { if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been * assigned, this socket will transition from * a raw socket into a control socket. To * allow for a clean transition, send the * close notification first. */ skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_OPTION_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_SETTING_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); } break; } /* Default MTU to HCI_MAX_FRAME_SIZE if not set */ if (!hci_pi(sk)->mtu) hci_pi(sk)->mtu = HCI_MAX_FRAME_SIZE; sk->sk_state = BT_BOUND; done: release_sock(sk); return err; } static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_hci *haddr = (struct sockaddr_hci *)addr; struct sock *sk = sock->sk; struct hci_dev *hdev; int err = 0; BT_DBG("sock %p sk %p", sock, sk); if (peer) return -EOPNOTSUPP; lock_sock(sk); hdev = hci_hdev_from_sock(sk); if (IS_ERR(hdev)) { err = PTR_ERR(hdev); goto done; } haddr->hci_family = AF_BLUETOOTH; haddr->hci_dev = hdev->id; haddr->hci_channel= hci_pi(sk)->channel; err = sizeof(*haddr); done: release_sock(sk); return err; } static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { __u8 mask = hci_pi(sk)->cmsg_mask; if (mask & HCI_CMSG_DIR) { int incoming = bt_cb(skb)->incoming; put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming); } if (mask & HCI_CMSG_TSTAMP) { #ifdef CONFIG_COMPAT struct old_timeval32 ctv; #endif struct __kernel_old_timeval tv; void *data; int len; skb_get_timestamp(skb, &tv); data = &tv; len = sizeof(tv); #ifdef CONFIG_COMPAT if (!COMPAT_USE_64BIT_TIME && (msg->msg_flags & MSG_CMSG_COMPAT)) { ctv.tv_sec = tv.tv_sec; ctv.tv_usec = tv.tv_usec; data = &ctv; len = sizeof(ctv); } #endif put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, len, data); } } static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct scm_cookie scm; struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; unsigned int skblen; BT_DBG("sock %p, sk %p", sock, sk); if (flags & MSG_OOB) return -EOPNOTSUPP; if (hci_pi(sk)->channel == HCI_CHANNEL_LOGGING) return -EOPNOTSUPP; if (sk->sk_state == BT_CLOSED) return 0; skb = skb_recv_datagram(sk, flags, &err); if (!skb) return err; skblen = skb->len; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } skb_reset_transport_header(skb); err = skb_copy_datagram_msg(skb, 0, msg, copied); switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: hci_sock_cmsg(sk, msg, skb); break; case HCI_CHANNEL_USER: case HCI_CHANNEL_MONITOR: sock_recv_timestamp(msg, sk, skb); break; default: if (hci_mgmt_chan_find(hci_pi(sk)->channel)) sock_recv_timestamp(msg, sk, skb); break; } memset(&scm, 0, sizeof(scm)); scm.creds = bt_cb(skb)->creds; skb_free_datagram(sk, skb); if (flags & MSG_TRUNC) copied = skblen; scm_recv(sock, msg, &scm, flags); return err ? : copied; } static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, struct sk_buff *skb) { u8 *cp; struct mgmt_hdr *hdr; u16 opcode, index, len; struct hci_dev *hdev = NULL; const struct hci_mgmt_handler *handler; bool var_len, no_hdev; int err; BT_DBG("got %d bytes", skb->len); if (skb->len < sizeof(*hdr)) return -EINVAL; hdr = (void *)skb->data; opcode = __le16_to_cpu(hdr->opcode); index = __le16_to_cpu(hdr->index); len = __le16_to_cpu(hdr->len); if (len != skb->len - sizeof(*hdr)) { err = -EINVAL; goto done; } if (chan->channel == HCI_CHANNEL_CONTROL) { struct sk_buff *cmd; /* Send event to monitor */ cmd = create_monitor_ctrl_command(sk, index, opcode, len, skb->data + sizeof(*hdr)); if (cmd) { hci_send_to_channel(HCI_CHANNEL_MONITOR, cmd, HCI_SOCK_TRUSTED, NULL); kfree_skb(cmd); } } if (opcode >= chan->handler_count || chan->handlers[opcode].func == NULL) { BT_DBG("Unknown op %u", opcode); err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_UNKNOWN_COMMAND); goto done; } handler = &chan->handlers[opcode]; if (!hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) && !(handler->flags & HCI_MGMT_UNTRUSTED)) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_PERMISSION_DENIED); goto done; } if (index != MGMT_INDEX_NONE) { hdev = hci_dev_get(index); if (!hdev) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; } if (hci_dev_test_flag(hdev, HCI_SETUP) || hci_dev_test_flag(hdev, HCI_CONFIG) || hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !(handler->flags & HCI_MGMT_UNCONFIGURED)) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; } } if (!(handler->flags & HCI_MGMT_HDEV_OPTIONAL)) { no_hdev = (handler->flags & HCI_MGMT_NO_HDEV); if (no_hdev != !hdev) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; } } var_len = (handler->flags & HCI_MGMT_VAR_LEN); if ((var_len && len < handler->data_len) || (!var_len && len != handler->data_len)) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_PARAMS); goto done; } if (hdev && chan->hdev_init) chan->hdev_init(sk, hdev); cp = skb->data + sizeof(*hdr); err = handler->func(sk, hdev, cp, len); if (err < 0) goto done; err = skb->len; done: if (hdev) hci_dev_put(hdev); return err; } static int hci_logging_frame(struct sock *sk, struct sk_buff *skb, unsigned int flags) { struct hci_mon_hdr *hdr; struct hci_dev *hdev; u16 index; int err; /* The logging frame consists at minimum of the standard header, * the priority byte, the ident length byte and at least one string * terminator NUL byte. Anything shorter are invalid packets. */ if (skb->len < sizeof(*hdr) + 3) return -EINVAL; hdr = (void *)skb->data; if (__le16_to_cpu(hdr->len) != skb->len - sizeof(*hdr)) return -EINVAL; if (__le16_to_cpu(hdr->opcode) == 0x0000) { __u8 priority = skb->data[sizeof(*hdr)]; __u8 ident_len = skb->data[sizeof(*hdr) + 1]; /* Only the priorities 0-7 are valid and with that any other * value results in an invalid packet. * * The priority byte is followed by an ident length byte and * the NUL terminated ident string. Check that the ident * length is not overflowing the packet and also that the * ident string itself is NUL terminated. In case the ident * length is zero, the length value actually doubles as NUL * terminator identifier. * * The message follows the ident string (if present) and * must be NUL terminated. Otherwise it is not a valid packet. */ if (priority > 7 || skb->data[skb->len - 1] != 0x00 || ident_len > skb->len - sizeof(*hdr) - 3 || skb->data[sizeof(*hdr) + ident_len + 1] != 0x00) return -EINVAL; } else { return -EINVAL; } index = __le16_to_cpu(hdr->index); if (index != MGMT_INDEX_NONE) { hdev = hci_dev_get(index); if (!hdev) return -ENODEV; } else { hdev = NULL; } hdr->opcode = cpu_to_le16(HCI_MON_USER_LOGGING); hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); err = skb->len; if (hdev) hci_dev_put(hdev); return err; } static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct hci_mgmt_chan *chan; struct hci_dev *hdev; struct sk_buff *skb; int err; const unsigned int flags = msg->msg_flags; BT_DBG("sock %p sk %p", sock, sk); if (flags & MSG_OOB) return -EOPNOTSUPP; if (flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; if (len < 4 || len > hci_pi(sk)->mtu) return -EINVAL; skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0); if (IS_ERR(skb)) return PTR_ERR(skb); lock_sock(sk); switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: case HCI_CHANNEL_USER: break; case HCI_CHANNEL_MONITOR: err = -EOPNOTSUPP; goto drop; case HCI_CHANNEL_LOGGING: err = hci_logging_frame(sk, skb, flags); goto drop; default: mutex_lock(&mgmt_chan_list_lock); chan = __hci_mgmt_chan_find(hci_pi(sk)->channel); if (chan) err = hci_mgmt_cmd(chan, sk, skb); else err = -EINVAL; mutex_unlock(&mgmt_chan_list_lock); goto drop; } hdev = hci_hdev_from_sock(sk); if (IS_ERR(hdev)) { err = PTR_ERR(hdev); goto drop; } if (!test_bit(HCI_UP, &hdev->flags)) { err = -ENETDOWN; goto drop; } hci_skb_pkt_type(skb) = skb->data[0]; skb_pull(skb, 1); if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { /* No permission check is needed for user channel * since that gets enforced when binding the socket. * * However check that the packet type is valid. */ if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { err = -EINVAL; goto drop; } skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } else if (hci_skb_pkt_type(skb) == HCI_COMMAND_PKT) { u16 opcode = get_unaligned_le16(skb->data); u16 ogf = hci_opcode_ogf(opcode); u16 ocf = hci_opcode_ocf(opcode); if (((ogf > HCI_SFLT_MAX_OGF) || !hci_test_bit(ocf & HCI_FLT_OCF_BITS, &hci_sec_filter.ocf_mask[ogf])) && !capable(CAP_NET_RAW)) { err = -EPERM; goto drop; } /* Since the opcode has already been extracted here, store * a copy of the value for later use by the drivers. */ hci_skb_opcode(skb) = opcode; if (ogf == 0x3f) { skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } else { /* Stand-alone HCI commands must be flagged as * single-command requests. */ bt_cb(skb)->hci.req_flags |= HCI_REQ_START; skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); } } else { if (!capable(CAP_NET_RAW)) { err = -EPERM; goto drop; } if (hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { err = -EINVAL; goto drop; } skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } err = len; done: release_sock(sk); return err; drop: kfree_skb(skb); goto done; } static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct hci_ufilter uf = { .opcode = 0 }; struct sock *sk = sock->sk; int err = 0, opt = 0; BT_DBG("sk %p, opt %d", sk, optname); lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { err = -EBADFD; goto done; } switch (optname) { case HCI_DATA_DIR: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) hci_pi(sk)->cmsg_mask |= HCI_CMSG_DIR; else hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_DIR; break; case HCI_TIME_STAMP: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) hci_pi(sk)->cmsg_mask |= HCI_CMSG_TSTAMP; else hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_TSTAMP; break; case HCI_FILTER: { struct hci_filter *f = &hci_pi(sk)->filter; uf.type_mask = f->type_mask; uf.opcode = f->opcode; uf.event_mask[0] = *((u32 *) f->event_mask + 0); uf.event_mask[1] = *((u32 *) f->event_mask + 1); } err = copy_safe_from_sockptr(&uf, sizeof(uf), optval, optlen); if (err) break; if (!capable(CAP_NET_RAW)) { uf.type_mask &= hci_sec_filter.type_mask; uf.event_mask[0] &= *((u32 *) hci_sec_filter.event_mask + 0); uf.event_mask[1] &= *((u32 *) hci_sec_filter.event_mask + 1); } { struct hci_filter *f = &hci_pi(sk)->filter; f->type_mask = uf.type_mask; f->opcode = uf.opcode; *((u32 *) f->event_mask + 0) = uf.event_mask[0]; *((u32 *) f->event_mask + 1) = uf.event_mask[1]; } break; default: err = -ENOPROTOOPT; break; } done: release_sock(sk); return err; } static int hci_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0; u16 opt; BT_DBG("sk %p, opt %d", sk, optname); if (level == SOL_HCI) return hci_sock_setsockopt_old(sock, level, optname, optval, optlen); if (level != SOL_BLUETOOTH) return -ENOPROTOOPT; lock_sock(sk); switch (optname) { case BT_SNDMTU: case BT_RCVMTU: switch (hci_pi(sk)->channel) { /* Don't allow changing MTU for channels that are meant for HCI * traffic only. */ case HCI_CHANNEL_RAW: case HCI_CHANNEL_USER: err = -ENOPROTOOPT; goto done; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; hci_pi(sk)->mtu = opt; break; default: err = -ENOPROTOOPT; break; } done: release_sock(sk); return err; } static int hci_sock_getsockopt_old(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct hci_ufilter uf; struct sock *sk = sock->sk; int len, opt, err = 0; BT_DBG("sk %p, opt %d", sk, optname); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { err = -EBADFD; goto done; } switch (optname) { case HCI_DATA_DIR: if (hci_pi(sk)->cmsg_mask & HCI_CMSG_DIR) opt = 1; else opt = 0; if (put_user(opt, optval)) err = -EFAULT; break; case HCI_TIME_STAMP: if (hci_pi(sk)->cmsg_mask & HCI_CMSG_TSTAMP) opt = 1; else opt = 0; if (put_user(opt, optval)) err = -EFAULT; break; case HCI_FILTER: { struct hci_filter *f = &hci_pi(sk)->filter; memset(&uf, 0, sizeof(uf)); uf.type_mask = f->type_mask; uf.opcode = f->opcode; uf.event_mask[0] = *((u32 *) f->event_mask + 0); uf.event_mask[1] = *((u32 *) f->event_mask + 1); } len = min_t(unsigned int, len, sizeof(uf)); if (copy_to_user(optval, &uf, len)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } done: release_sock(sk); return err; } static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sk %p, opt %d", sk, optname); if (level == SOL_HCI) return hci_sock_getsockopt_old(sock, level, optname, optval, optlen); if (level != SOL_BLUETOOTH) return -ENOPROTOOPT; lock_sock(sk); switch (optname) { case BT_SNDMTU: case BT_RCVMTU: if (put_user(hci_pi(sk)->mtu, (u16 __user *)optval)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static void hci_sock_destruct(struct sock *sk) { mgmt_cleanup(sk); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } static const struct proto_ops hci_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hci_sock_release, .bind = hci_sock_bind, .getname = hci_sock_getname, .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, .ioctl = hci_sock_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = hci_sock_compat_ioctl, #endif .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = hci_sock_setsockopt, .getsockopt = hci_sock_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static struct proto hci_sk_proto = { .name = "HCI", .owner = THIS_MODULE, .obj_size = sizeof(struct hci_pinfo) }; static int hci_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; sock->ops = &hci_sock_ops; sk = bt_sock_alloc(net, sock, &hci_sk_proto, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; sock->state = SS_UNCONNECTED; sk->sk_destruct = hci_sock_destruct; bt_sock_link(&hci_sk_list, sk); return 0; } static const struct net_proto_family hci_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = hci_sock_create, }; int __init hci_sock_init(void) { int err; BUILD_BUG_ON(sizeof(struct sockaddr_hci) > sizeof(struct sockaddr)); err = proto_register(&hci_sk_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_HCI, &hci_sock_family_ops); if (err < 0) { BT_ERR("HCI socket registration failed"); goto error; } err = bt_procfs_init(&init_net, "hci", &hci_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create HCI proc file"); bt_sock_unregister(BTPROTO_HCI); goto error; } BT_INFO("HCI socket layer initialized"); return 0; error: proto_unregister(&hci_sk_proto); return err; } void hci_sock_cleanup(void) { bt_procfs_cleanup(&init_net, "hci"); bt_sock_unregister(BTPROTO_HCI); proto_unregister(&hci_sk_proto); }
3 3 2 3 5 3 3 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Matthias Schiffer */ #include "netlink.h" #include "main.h" #include <linux/array_size.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> #include <net/genetlink.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/sock.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "gateway_common.h" #include "hard-interface.h" #include "log.h" #include "multicast.h" #include "network-coding.h" #include "originator.h" #include "soft-interface.h" #include "tp_meter.h" #include "translation-table.h" struct genl_family batadv_netlink_family; /* multicast groups */ enum batadv_netlink_multicast_groups { BATADV_NL_MCGRP_CONFIG, BATADV_NL_MCGRP_TPMETER, }; /** * enum batadv_genl_ops_flags - flags for genl_ops's internal_flags */ enum batadv_genl_ops_flags { /** * @BATADV_FLAG_NEED_MESH: request requires valid soft interface in * attribute BATADV_ATTR_MESH_IFINDEX and expects a pointer to it to be * saved in info->user_ptr[0] */ BATADV_FLAG_NEED_MESH = BIT(0), /** * @BATADV_FLAG_NEED_HARDIF: request requires valid hard interface in * attribute BATADV_ATTR_HARD_IFINDEX and expects a pointer to it to be * saved in info->user_ptr[1] */ BATADV_FLAG_NEED_HARDIF = BIT(1), /** * @BATADV_FLAG_NEED_VLAN: request requires valid vlan in * attribute BATADV_ATTR_VLANID and expects a pointer to it to be * saved in info->user_ptr[1] */ BATADV_FLAG_NEED_VLAN = BIT(2), }; static const struct genl_multicast_group batadv_netlink_mcgrps[] = { [BATADV_NL_MCGRP_CONFIG] = { .name = BATADV_NL_MCAST_GROUP_CONFIG }, [BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER }, }; static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 }, [BATADV_ATTR_MESH_IFNAME] = { .type = NLA_STRING }, [BATADV_ATTR_MESH_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 }, [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING }, [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_ORIG_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_TPMETER_RESULT] = { .type = NLA_U8 }, [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG }, [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 }, [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 }, [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 }, [BATADV_ATTR_TT_VID] = { .type = NLA_U16 }, [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 }, [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG }, [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_TQ] = { .type = NLA_U8 }, [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 }, [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 }, [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 }, [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN }, [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG }, [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 }, [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN }, [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 }, [BATADV_ATTR_DAT_CACHE_IP4ADDRESS] = { .type = NLA_U32 }, [BATADV_ATTR_DAT_CACHE_HWADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_DAT_CACHE_VID] = { .type = NLA_U16 }, [BATADV_ATTR_MCAST_FLAGS] = { .type = NLA_U32 }, [BATADV_ATTR_MCAST_FLAGS_PRIV] = { .type = NLA_U32 }, [BATADV_ATTR_VLANID] = { .type = NLA_U16 }, [BATADV_ATTR_AGGREGATED_OGMS_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_AP_ISOLATION_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_ISOLATION_MARK] = { .type = NLA_U32 }, [BATADV_ATTR_ISOLATION_MASK] = { .type = NLA_U32 }, [BATADV_ATTR_BONDING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_FRAGMENTATION_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_GW_BANDWIDTH_DOWN] = { .type = NLA_U32 }, [BATADV_ATTR_GW_BANDWIDTH_UP] = { .type = NLA_U32 }, [BATADV_ATTR_GW_MODE] = { .type = NLA_U8 }, [BATADV_ATTR_GW_SEL_CLASS] = { .type = NLA_U32 }, [BATADV_ATTR_HOP_PENALTY] = { .type = NLA_U8 }, [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 }, [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_MULTICAST_FANOUT] = { .type = NLA_U32 }, [BATADV_ATTR_NETWORK_CODING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_ELP_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_THROUGHPUT_OVERRIDE] = { .type = NLA_U32 }, }; /** * batadv_netlink_get_ifindex() - Extract an interface index from a message * @nlh: Message header * @attrtype: Attribute which holds an interface index * * Return: interface index, or 0. */ static int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) { struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype); return (attr && nla_len(attr) == sizeof(u32)) ? nla_get_u32(attr) : 0; } /** * batadv_netlink_mesh_fill_ap_isolation() - Add ap_isolation softif attribute * @msg: Netlink message to dump into * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_mesh_fill_ap_isolation(struct sk_buff *msg, struct batadv_priv *bat_priv) { struct batadv_softif_vlan *vlan; u8 ap_isolation; vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (!vlan) return 0; ap_isolation = atomic_read(&vlan->ap_isolation); batadv_softif_vlan_put(vlan); return nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED, !!ap_isolation); } /** * batadv_netlink_set_mesh_ap_isolation() - Set ap_isolation from genl msg * @attr: parsed BATADV_ATTR_AP_ISOLATION_ENABLED attribute * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_set_mesh_ap_isolation(struct nlattr *attr, struct batadv_priv *bat_priv) { struct batadv_softif_vlan *vlan; vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (!vlan) return -ENOENT; atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr)); batadv_softif_vlan_put(vlan); return 0; } /** * batadv_netlink_mesh_fill() - Fill message with mesh attributes * @msg: Netlink message to dump into * @bat_priv: the bat priv with all the soft interface information * @cmd: type of message to generate * @portid: Port making netlink request * @seq: sequence number for message * @flags: Additional flags for message * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_mesh_fill(struct sk_buff *msg, struct batadv_priv *bat_priv, enum batadv_nl_commands cmd, u32 portid, u32 seq, int flags) { struct net_device *soft_iface = bat_priv->soft_iface; struct batadv_hard_iface *primary_if = NULL; struct net_device *hard_iface; void *hdr; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_string(msg, BATADV_ATTR_VERSION, BATADV_SOURCE_VERSION) || nla_put_string(msg, BATADV_ATTR_ALGO_NAME, bat_priv->algo_ops->name) || nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, soft_iface->ifindex) || nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, soft_iface->name) || nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN, soft_iface->dev_addr) || nla_put_u8(msg, BATADV_ATTR_TT_TTVN, (u8)atomic_read(&bat_priv->tt.vn))) goto nla_put_failure; #ifdef CONFIG_BATMAN_ADV_BLA if (nla_put_u16(msg, BATADV_ATTR_BLA_CRC, ntohs(bat_priv->bla.claim_dest.group))) goto nla_put_failure; #endif if (batadv_mcast_mesh_info_put(msg, bat_priv)) goto nla_put_failure; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) { hard_iface = primary_if->net_dev; if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hard_iface->ifindex) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, hard_iface->name) || nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN, hard_iface->dev_addr)) goto nla_put_failure; } if (nla_put_u8(msg, BATADV_ATTR_AGGREGATED_OGMS_ENABLED, !!atomic_read(&bat_priv->aggregated_ogms))) goto nla_put_failure; if (batadv_netlink_mesh_fill_ap_isolation(msg, bat_priv)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MARK, bat_priv->isolation_mark)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MASK, bat_priv->isolation_mark_mask)) goto nla_put_failure; if (nla_put_u8(msg, BATADV_ATTR_BONDING_ENABLED, !!atomic_read(&bat_priv->bonding))) goto nla_put_failure; #ifdef CONFIG_BATMAN_ADV_BLA if (nla_put_u8(msg, BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED, !!atomic_read(&bat_priv->bridge_loop_avoidance))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_BLA */ #ifdef CONFIG_BATMAN_ADV_DAT if (nla_put_u8(msg, BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED, !!atomic_read(&bat_priv->distributed_arp_table))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_DAT */ if (nla_put_u8(msg, BATADV_ATTR_FRAGMENTATION_ENABLED, !!atomic_read(&bat_priv->fragmentation))) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_DOWN, atomic_read(&bat_priv->gw.bandwidth_down))) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_UP, atomic_read(&bat_priv->gw.bandwidth_up))) goto nla_put_failure; if (nla_put_u8(msg, BATADV_ATTR_GW_MODE, atomic_read(&bat_priv->gw.mode))) goto nla_put_failure; if (bat_priv->algo_ops->gw.get_best_gw_node && bat_priv->algo_ops->gw.is_eligible) { /* GW selection class is not available if the routing algorithm * in use does not implement the GW API */ if (nla_put_u32(msg, BATADV_ATTR_GW_SEL_CLASS, atomic_read(&bat_priv->gw.sel_class))) goto nla_put_failure; } if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY, atomic_read(&bat_priv->hop_penalty))) goto nla_put_failure; #ifdef CONFIG_BATMAN_ADV_DEBUG if (nla_put_u32(msg, BATADV_ATTR_LOG_LEVEL, atomic_read(&bat_priv->log_level))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_DEBUG */ #ifdef CONFIG_BATMAN_ADV_MCAST if (nla_put_u8(msg, BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED, !atomic_read(&bat_priv->multicast_mode))) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_MULTICAST_FANOUT, atomic_read(&bat_priv->multicast_fanout))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_MCAST */ #ifdef CONFIG_BATMAN_ADV_NC if (nla_put_u8(msg, BATADV_ATTR_NETWORK_CODING_ENABLED, !!atomic_read(&bat_priv->network_coding))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_NC */ if (nla_put_u32(msg, BATADV_ATTR_ORIG_INTERVAL, atomic_read(&bat_priv->orig_interval))) goto nla_put_failure; batadv_hardif_put(primary_if); genlmsg_end(msg, hdr); return 0; nla_put_failure: batadv_hardif_put(primary_if); genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_netlink_notify_mesh() - send softif attributes to listener * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success, < 0 on error */ static int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_SET_MESH, 0, 0, 0); if (ret < 0) { nlmsg_free(msg); return ret; } genlmsg_multicast_netns(&batadv_netlink_family, dev_net(bat_priv->soft_iface), msg, 0, BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); return 0; } /** * batadv_netlink_get_mesh() - Get softif attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_get_mesh(struct sk_buff *skb, struct genl_info *info) { struct batadv_priv *bat_priv = info->user_ptr[0]; struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_GET_MESH, info->snd_portid, info->snd_seq, 0); if (ret < 0) { nlmsg_free(msg); return ret; } ret = genlmsg_reply(msg, info); return ret; } /** * batadv_netlink_set_mesh() - Set softif attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) { struct batadv_priv *bat_priv = info->user_ptr[0]; struct nlattr *attr; if (info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED]) { attr = info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED]; atomic_set(&bat_priv->aggregated_ogms, !!nla_get_u8(attr)); } if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) { attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]; batadv_netlink_set_mesh_ap_isolation(attr, bat_priv); } if (info->attrs[BATADV_ATTR_ISOLATION_MARK]) { attr = info->attrs[BATADV_ATTR_ISOLATION_MARK]; bat_priv->isolation_mark = nla_get_u32(attr); } if (info->attrs[BATADV_ATTR_ISOLATION_MASK]) { attr = info->attrs[BATADV_ATTR_ISOLATION_MASK]; bat_priv->isolation_mark_mask = nla_get_u32(attr); } if (info->attrs[BATADV_ATTR_BONDING_ENABLED]) { attr = info->attrs[BATADV_ATTR_BONDING_ENABLED]; atomic_set(&bat_priv->bonding, !!nla_get_u8(attr)); } #ifdef CONFIG_BATMAN_ADV_BLA if (info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]) { attr = info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]; atomic_set(&bat_priv->bridge_loop_avoidance, !!nla_get_u8(attr)); batadv_bla_status_update(bat_priv->soft_iface); } #endif /* CONFIG_BATMAN_ADV_BLA */ #ifdef CONFIG_BATMAN_ADV_DAT if (info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]) { attr = info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]; atomic_set(&bat_priv->distributed_arp_table, !!nla_get_u8(attr)); batadv_dat_status_update(bat_priv->soft_iface); } #endif /* CONFIG_BATMAN_ADV_DAT */ if (info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED]) { attr = info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED]; atomic_set(&bat_priv->fragmentation, !!nla_get_u8(attr)); rtnl_lock(); batadv_update_min_mtu(bat_priv->soft_iface); rtnl_unlock(); } if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]) { attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]; atomic_set(&bat_priv->gw.bandwidth_down, nla_get_u32(attr)); batadv_gw_tvlv_container_update(bat_priv); } if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP]) { attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP]; atomic_set(&bat_priv->gw.bandwidth_up, nla_get_u32(attr)); batadv_gw_tvlv_container_update(bat_priv); } if (info->attrs[BATADV_ATTR_GW_MODE]) { u8 gw_mode; attr = info->attrs[BATADV_ATTR_GW_MODE]; gw_mode = nla_get_u8(attr); if (gw_mode <= BATADV_GW_MODE_SERVER) { /* Invoking batadv_gw_reselect() is not enough to really * de-select the current GW. It will only instruct the * gateway client code to perform a re-election the next * time that this is needed. * * When gw client mode is being switched off the current * GW must be de-selected explicitly otherwise no GW_ADD * uevent is thrown on client mode re-activation. This * is operation is performed in * batadv_gw_check_client_stop(). */ batadv_gw_reselect(bat_priv); /* always call batadv_gw_check_client_stop() before * changing the gateway state */ batadv_gw_check_client_stop(bat_priv); atomic_set(&bat_priv->gw.mode, gw_mode); batadv_gw_tvlv_container_update(bat_priv); } } if (info->attrs[BATADV_ATTR_GW_SEL_CLASS] && bat_priv->algo_ops->gw.get_best_gw_node && bat_priv->algo_ops->gw.is_eligible) { /* setting the GW selection class is allowed only if the routing * algorithm in use implements the GW API */ u32 sel_class_max = bat_priv->algo_ops->gw.sel_class_max; u32 sel_class; attr = info->attrs[BATADV_ATTR_GW_SEL_CLASS]; sel_class = nla_get_u32(attr); if (sel_class >= 1 && sel_class <= sel_class_max) { atomic_set(&bat_priv->gw.sel_class, sel_class); batadv_gw_reselect(bat_priv); } } if (info->attrs[BATADV_ATTR_HOP_PENALTY]) { attr = info->attrs[BATADV_ATTR_HOP_PENALTY]; atomic_set(&bat_priv->hop_penalty, nla_get_u8(attr)); } #ifdef CONFIG_BATMAN_ADV_DEBUG if (info->attrs[BATADV_ATTR_LOG_LEVEL]) { attr = info->attrs[BATADV_ATTR_LOG_LEVEL]; atomic_set(&bat_priv->log_level, nla_get_u32(attr) & BATADV_DBG_ALL); } #endif /* CONFIG_BATMAN_ADV_DEBUG */ #ifdef CONFIG_BATMAN_ADV_MCAST if (info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]) { attr = info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]; atomic_set(&bat_priv->multicast_mode, !nla_get_u8(attr)); } if (info->attrs[BATADV_ATTR_MULTICAST_FANOUT]) { attr = info->attrs[BATADV_ATTR_MULTICAST_FANOUT]; atomic_set(&bat_priv->multicast_fanout, nla_get_u32(attr)); } #endif /* CONFIG_BATMAN_ADV_MCAST */ #ifdef CONFIG_BATMAN_ADV_NC if (info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]) { attr = info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]; atomic_set(&bat_priv->network_coding, !!nla_get_u8(attr)); batadv_nc_status_update(bat_priv->soft_iface); } #endif /* CONFIG_BATMAN_ADV_NC */ if (info->attrs[BATADV_ATTR_ORIG_INTERVAL]) { u32 orig_interval; attr = info->attrs[BATADV_ATTR_ORIG_INTERVAL]; orig_interval = nla_get_u32(attr); orig_interval = min_t(u32, orig_interval, INT_MAX); orig_interval = max_t(u32, orig_interval, 2 * BATADV_JITTER); atomic_set(&bat_priv->orig_interval, orig_interval); } batadv_netlink_notify_mesh(bat_priv); return 0; } /** * batadv_netlink_tp_meter_put() - Fill information of started tp_meter session * @msg: netlink message to be sent back * @cookie: tp meter session cookie * * Return: 0 on success, < 0 on error */ static int batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie) { if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie)) return -ENOBUFS; return 0; } /** * batadv_netlink_tpmeter_notify() - send tp_meter result via netlink to client * @bat_priv: the bat priv with all the soft interface information * @dst: destination of tp_meter session * @result: reason for tp meter session stop * @test_time: total time of the tp_meter session * @total_bytes: bytes acked to the receiver * @cookie: cookie of tp_meter session * * Return: 0 on success, < 0 on error */ int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, u8 result, u32 test_time, u64 total_bytes, u32 cookie) { struct sk_buff *msg; void *hdr; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &batadv_netlink_family, 0, BATADV_CMD_TP_METER); if (!hdr) { ret = -ENOBUFS; goto err_genlmsg; } if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_TPMETER_TEST_TIME, test_time)) goto nla_put_failure; if (nla_put_u64_64bit(msg, BATADV_ATTR_TPMETER_BYTES, total_bytes, BATADV_ATTR_PAD)) goto nla_put_failure; if (nla_put_u8(msg, BATADV_ATTR_TPMETER_RESULT, result)) goto nla_put_failure; if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, dst)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast_netns(&batadv_netlink_family, dev_net(bat_priv->soft_iface), msg, 0, BATADV_NL_MCGRP_TPMETER, GFP_KERNEL); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); ret = -EMSGSIZE; err_genlmsg: nlmsg_free(msg); return ret; } /** * batadv_netlink_tp_meter_start() - Start a new tp_meter session * @skb: received netlink message * @info: receiver information * * Return: 0 on success, < 0 on error */ static int batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info) { struct batadv_priv *bat_priv = info->user_ptr[0]; struct sk_buff *msg = NULL; u32 test_length; void *msg_head; u32 cookie; u8 *dst; int ret; if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS]) return -EINVAL; if (!info->attrs[BATADV_ATTR_TPMETER_TEST_TIME]) return -EINVAL; dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]); test_length = nla_get_u32(info->attrs[BATADV_ATTR_TPMETER_TEST_TIME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto out; } msg_head = genlmsg_put(msg, info->snd_portid, info->snd_seq, &batadv_netlink_family, 0, BATADV_CMD_TP_METER); if (!msg_head) { ret = -ENOBUFS; goto out; } batadv_tp_start(bat_priv, dst, test_length, &cookie); ret = batadv_netlink_tp_meter_put(msg, cookie); out: if (ret) { if (msg) nlmsg_free(msg); return ret; } genlmsg_end(msg, msg_head); return genlmsg_reply(msg, info); } /** * batadv_netlink_tp_meter_cancel() - Cancel a running tp_meter session * @skb: received netlink message * @info: receiver information * * Return: 0 on success, < 0 on error */ static int batadv_netlink_tp_meter_cancel(struct sk_buff *skb, struct genl_info *info) { struct batadv_priv *bat_priv = info->user_ptr[0]; u8 *dst; int ret = 0; if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS]) return -EINVAL; dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]); batadv_tp_stop(bat_priv, dst, BATADV_TP_REASON_CANCEL); return ret; } /** * batadv_netlink_hardif_fill() - Fill message with hardif attributes * @msg: Netlink message to dump into * @bat_priv: the bat priv with all the soft interface information * @hard_iface: hard interface which was modified * @cmd: type of message to generate * @portid: Port making netlink request * @seq: sequence number for message * @flags: Additional flags for message * @cb: Control block containing additional options * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_hardif_fill(struct sk_buff *msg, struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface, enum batadv_nl_commands cmd, u32 portid, u32 seq, int flags, struct netlink_callback *cb) { struct net_device *net_dev = hard_iface->net_dev; void *hdr; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd); if (!hdr) return -ENOBUFS; if (cb) genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, bat_priv->soft_iface->ifindex)) goto nla_put_failure; if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, bat_priv->soft_iface->name)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, net_dev->ifindex) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, net_dev->name) || nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN, net_dev->dev_addr)) goto nla_put_failure; if (hard_iface->if_status == BATADV_IF_ACTIVE) { if (nla_put_flag(msg, BATADV_ATTR_ACTIVE)) goto nla_put_failure; } if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY, atomic_read(&hard_iface->hop_penalty))) goto nla_put_failure; #ifdef CONFIG_BATMAN_ADV_BATMAN_V if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL, atomic_read(&hard_iface->bat_v.elp_interval))) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT_OVERRIDE, atomic_read(&hard_iface->bat_v.throughput_override))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_BATMAN_V */ genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_netlink_notify_hardif() - send hardif attributes to listener * @bat_priv: the bat priv with all the soft interface information * @hard_iface: hard interface which was modified * * Return: 0 on success, < 0 on error */ static int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface, BATADV_CMD_SET_HARDIF, 0, 0, 0, NULL); if (ret < 0) { nlmsg_free(msg); return ret; } genlmsg_multicast_netns(&batadv_netlink_family, dev_net(bat_priv->soft_iface), msg, 0, BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); return 0; } /** * batadv_netlink_cmd_get_hardif() - Get hardif attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_cmd_get_hardif(struct sk_buff *skb, struct genl_info *info) { struct batadv_hard_iface *hard_iface = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface, BATADV_CMD_GET_HARDIF, info->snd_portid, info->snd_seq, 0, NULL); if (ret < 0) { nlmsg_free(msg); return ret; } ret = genlmsg_reply(msg, info); return ret; } /** * batadv_netlink_set_hardif() - Set hardif attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_set_hardif(struct sk_buff *skb, struct genl_info *info) { struct batadv_hard_iface *hard_iface = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct nlattr *attr; if (info->attrs[BATADV_ATTR_HOP_PENALTY]) { attr = info->attrs[BATADV_ATTR_HOP_PENALTY]; atomic_set(&hard_iface->hop_penalty, nla_get_u8(attr)); } #ifdef CONFIG_BATMAN_ADV_BATMAN_V if (info->attrs[BATADV_ATTR_ELP_INTERVAL]) { attr = info->attrs[BATADV_ATTR_ELP_INTERVAL]; atomic_set(&hard_iface->bat_v.elp_interval, nla_get_u32(attr)); } if (info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE]) { attr = info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE]; atomic_set(&hard_iface->bat_v.throughput_override, nla_get_u32(attr)); } #endif /* CONFIG_BATMAN_ADV_BATMAN_V */ batadv_netlink_notify_hardif(bat_priv, hard_iface); return 0; } /** * batadv_netlink_dump_hardif() - Dump all hard interface into a messages * @msg: Netlink message to dump into * @cb: Parameters from query * * Return: error code, or length of reply message on success */ static int batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb) { struct net_device *soft_iface; struct batadv_hard_iface *hard_iface; struct batadv_priv *bat_priv; int portid = NETLINK_CB(cb->skb).portid; int skip = cb->args[0]; int i = 0; soft_iface = batadv_netlink_get_softif(cb); if (IS_ERR(soft_iface)) return PTR_ERR(soft_iface); bat_priv = netdev_priv(soft_iface); rtnl_lock(); cb->seq = batadv_hardif_generation << 1 | 1; list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != soft_iface) continue; if (i++ < skip) continue; if (batadv_netlink_hardif_fill(msg, bat_priv, hard_iface, BATADV_CMD_GET_HARDIF, portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb)) { i--; break; } } rtnl_unlock(); dev_put(soft_iface); cb->args[0] = i; return msg->len; } /** * batadv_netlink_vlan_fill() - Fill message with vlan attributes * @msg: Netlink message to dump into * @bat_priv: the bat priv with all the soft interface information * @vlan: vlan which was modified * @cmd: type of message to generate * @portid: Port making netlink request * @seq: sequence number for message * @flags: Additional flags for message * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_vlan_fill(struct sk_buff *msg, struct batadv_priv *bat_priv, struct batadv_softif_vlan *vlan, enum batadv_nl_commands cmd, u32 portid, u32 seq, int flags) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, bat_priv->soft_iface->ifindex)) goto nla_put_failure; if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, bat_priv->soft_iface->name)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_VLANID, vlan->vid & VLAN_VID_MASK)) goto nla_put_failure; if (nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED, !!atomic_read(&vlan->ap_isolation))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_netlink_notify_vlan() - send vlan attributes to listener * @bat_priv: the bat priv with all the soft interface information * @vlan: vlan which was modified * * Return: 0 on success, < 0 on error */ static int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv, struct batadv_softif_vlan *vlan) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan, BATADV_CMD_SET_VLAN, 0, 0, 0); if (ret < 0) { nlmsg_free(msg); return ret; } genlmsg_multicast_netns(&batadv_netlink_family, dev_net(bat_priv->soft_iface), msg, 0, BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); return 0; } /** * batadv_netlink_get_vlan() - Get vlan attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_get_vlan(struct sk_buff *skb, struct genl_info *info) { struct batadv_softif_vlan *vlan = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan, BATADV_CMD_GET_VLAN, info->snd_portid, info->snd_seq, 0); if (ret < 0) { nlmsg_free(msg); return ret; } ret = genlmsg_reply(msg, info); return ret; } /** * batadv_netlink_set_vlan() - Get vlan attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_set_vlan(struct sk_buff *skb, struct genl_info *info) { struct batadv_softif_vlan *vlan = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct nlattr *attr; if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) { attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]; atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr)); } batadv_netlink_notify_vlan(bat_priv, vlan); return 0; } /** * batadv_netlink_get_softif_from_ifindex() - Get soft-iface from ifindex * @net: the applicable net namespace * @ifindex: index of the soft interface * * Return: Pointer to soft interface (with increased refcnt) on success, error * pointer on error */ static struct net_device * batadv_netlink_get_softif_from_ifindex(struct net *net, int ifindex) { struct net_device *soft_iface; soft_iface = dev_get_by_index(net, ifindex); if (!soft_iface) return ERR_PTR(-ENODEV); if (!batadv_softif_is_valid(soft_iface)) goto err_put_softif; return soft_iface; err_put_softif: dev_put(soft_iface); return ERR_PTR(-EINVAL); } /** * batadv_netlink_get_softif_from_info() - Get soft-iface from genl attributes * @net: the applicable net namespace * @info: receiver information * * Return: Pointer to soft interface (with increased refcnt) on success, error * pointer on error */ static struct net_device * batadv_netlink_get_softif_from_info(struct net *net, struct genl_info *info) { int ifindex; if (!info->attrs[BATADV_ATTR_MESH_IFINDEX]) return ERR_PTR(-EINVAL); ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]); return batadv_netlink_get_softif_from_ifindex(net, ifindex); } /** * batadv_netlink_get_softif() - Retrieve soft interface from netlink callback * @cb: callback structure containing arguments * * Return: Pointer to soft interface (with increased refcnt) on success, error * pointer on error */ struct net_device *batadv_netlink_get_softif(struct netlink_callback *cb) { int ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); if (!ifindex) return ERR_PTR(-ENONET); return batadv_netlink_get_softif_from_ifindex(sock_net(cb->skb->sk), ifindex); } /** * batadv_netlink_get_hardif_from_ifindex() - Get hard-iface from ifindex * @bat_priv: the bat priv with all the soft interface information * @net: the applicable net namespace * @ifindex: index of the hard interface * * Return: Pointer to hard interface (with increased refcnt) on success, error * pointer on error */ static struct batadv_hard_iface * batadv_netlink_get_hardif_from_ifindex(struct batadv_priv *bat_priv, struct net *net, int ifindex) { struct batadv_hard_iface *hard_iface; struct net_device *hard_dev; hard_dev = dev_get_by_index(net, ifindex); if (!hard_dev) return ERR_PTR(-ENODEV); hard_iface = batadv_hardif_get_by_netdev(hard_dev); if (!hard_iface) goto err_put_harddev; if (hard_iface->soft_iface != bat_priv->soft_iface) goto err_put_hardif; /* hard_dev is referenced by hard_iface and not needed here */ dev_put(hard_dev); return hard_iface; err_put_hardif: batadv_hardif_put(hard_iface); err_put_harddev: dev_put(hard_dev); return ERR_PTR(-EINVAL); } /** * batadv_netlink_get_hardif_from_info() - Get hard-iface from genl attributes * @bat_priv: the bat priv with all the soft interface information * @net: the applicable net namespace * @info: receiver information * * Return: Pointer to hard interface (with increased refcnt) on success, error * pointer on error */ static struct batadv_hard_iface * batadv_netlink_get_hardif_from_info(struct batadv_priv *bat_priv, struct net *net, struct genl_info *info) { int ifindex; if (!info->attrs[BATADV_ATTR_HARD_IFINDEX]) return ERR_PTR(-EINVAL); ifindex = nla_get_u32(info->attrs[BATADV_ATTR_HARD_IFINDEX]); return batadv_netlink_get_hardif_from_ifindex(bat_priv, net, ifindex); } /** * batadv_netlink_get_hardif() - Retrieve hard interface from netlink callback * @bat_priv: the bat priv with all the soft interface information * @cb: callback structure containing arguments * * Return: Pointer to hard interface (with increased refcnt) on success, error * pointer on error */ struct batadv_hard_iface * batadv_netlink_get_hardif(struct batadv_priv *bat_priv, struct netlink_callback *cb) { int ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_HARD_IFINDEX); if (!ifindex) return ERR_PTR(-ENONET); return batadv_netlink_get_hardif_from_ifindex(bat_priv, sock_net(cb->skb->sk), ifindex); } /** * batadv_get_vlan_from_info() - Retrieve vlan from genl attributes * @bat_priv: the bat priv with all the soft interface information * @net: the applicable net namespace * @info: receiver information * * Return: Pointer to vlan on success (with increased refcnt), error pointer * on error */ static struct batadv_softif_vlan * batadv_get_vlan_from_info(struct batadv_priv *bat_priv, struct net *net, struct genl_info *info) { struct batadv_softif_vlan *vlan; u16 vid; if (!info->attrs[BATADV_ATTR_VLANID]) return ERR_PTR(-EINVAL); vid = nla_get_u16(info->attrs[BATADV_ATTR_VLANID]); vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); if (!vlan) return ERR_PTR(-ENOENT); return vlan; } /** * batadv_pre_doit() - Prepare batman-adv genl doit request * @ops: requested netlink operation * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct batadv_hard_iface *hard_iface; struct batadv_priv *bat_priv = NULL; struct batadv_softif_vlan *vlan; struct net_device *soft_iface; u8 user_ptr1_flags; u8 mesh_dep_flags; int ret; user_ptr1_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN; if (WARN_ON(hweight8(ops->internal_flags & user_ptr1_flags) > 1)) return -EINVAL; mesh_dep_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN; if (WARN_ON((ops->internal_flags & mesh_dep_flags) && (~ops->internal_flags & BATADV_FLAG_NEED_MESH))) return -EINVAL; if (ops->internal_flags & BATADV_FLAG_NEED_MESH) { soft_iface = batadv_netlink_get_softif_from_info(net, info); if (IS_ERR(soft_iface)) return PTR_ERR(soft_iface); bat_priv = netdev_priv(soft_iface); info->user_ptr[0] = bat_priv; } if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF) { hard_iface = batadv_netlink_get_hardif_from_info(bat_priv, net, info); if (IS_ERR(hard_iface)) { ret = PTR_ERR(hard_iface); goto err_put_softif; } info->user_ptr[1] = hard_iface; } if (ops->internal_flags & BATADV_FLAG_NEED_VLAN) { vlan = batadv_get_vlan_from_info(bat_priv, net, info); if (IS_ERR(vlan)) { ret = PTR_ERR(vlan); goto err_put_softif; } info->user_ptr[1] = vlan; } return 0; err_put_softif: if (bat_priv) dev_put(bat_priv->soft_iface); return ret; } /** * batadv_post_doit() - End batman-adv genl doit request * @ops: requested netlink operation * @skb: Netlink message with request data * @info: receiver information */ static void batadv_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct batadv_hard_iface *hard_iface; struct batadv_softif_vlan *vlan; struct batadv_priv *bat_priv; if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF && info->user_ptr[1]) { hard_iface = info->user_ptr[1]; batadv_hardif_put(hard_iface); } if (ops->internal_flags & BATADV_FLAG_NEED_VLAN && info->user_ptr[1]) { vlan = info->user_ptr[1]; batadv_softif_vlan_put(vlan); } if (ops->internal_flags & BATADV_FLAG_NEED_MESH && info->user_ptr[0]) { bat_priv = info->user_ptr[0]; dev_put(bat_priv->soft_iface); } } static const struct genl_small_ops batadv_netlink_ops[] = { { .cmd = BATADV_CMD_GET_MESH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .doit = batadv_netlink_get_mesh, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_TP_METER, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_tp_meter_start, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_TP_METER_CANCEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_tp_meter_cancel, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_GET_ROUTING_ALGOS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_algo_dump, }, { .cmd = BATADV_CMD_GET_HARDIF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .dumpit = batadv_netlink_dump_hardif, .doit = batadv_netlink_cmd_get_hardif, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_HARDIF, }, { .cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_tt_local_dump, }, { .cmd = BATADV_CMD_GET_TRANSTABLE_GLOBAL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_tt_global_dump, }, { .cmd = BATADV_CMD_GET_ORIGINATORS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_orig_dump, }, { .cmd = BATADV_CMD_GET_NEIGHBORS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_hardif_neigh_dump, }, { .cmd = BATADV_CMD_GET_GATEWAYS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_gw_dump, }, { .cmd = BATADV_CMD_GET_BLA_CLAIM, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_bla_claim_dump, }, { .cmd = BATADV_CMD_GET_BLA_BACKBONE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_bla_backbone_dump, }, { .cmd = BATADV_CMD_GET_DAT_CACHE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_dat_cache_dump, }, { .cmd = BATADV_CMD_GET_MCAST_FLAGS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_mcast_flags_dump, }, { .cmd = BATADV_CMD_SET_MESH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_set_mesh, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_SET_HARDIF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_set_hardif, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_HARDIF, }, { .cmd = BATADV_CMD_GET_VLAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .doit = batadv_netlink_get_vlan, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_VLAN, }, { .cmd = BATADV_CMD_SET_VLAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_set_vlan, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_VLAN, }, }; struct genl_family batadv_netlink_family __ro_after_init = { .hdrsize = 0, .name = BATADV_NL_NAME, .version = 1, .maxattr = BATADV_ATTR_MAX, .policy = batadv_netlink_policy, .netnsok = true, .pre_doit = batadv_pre_doit, .post_doit = batadv_post_doit, .module = THIS_MODULE, .small_ops = batadv_netlink_ops, .n_small_ops = ARRAY_SIZE(batadv_netlink_ops), .resv_start_op = BATADV_CMD_SET_VLAN + 1, .mcgrps = batadv_netlink_mcgrps, .n_mcgrps = ARRAY_SIZE(batadv_netlink_mcgrps), }; /** * batadv_netlink_register() - register batadv genl netlink family */ void __init batadv_netlink_register(void) { int ret; ret = genl_register_family(&batadv_netlink_family); if (ret) pr_warn("unable to register netlink family"); } /** * batadv_netlink_unregister() - unregister batadv genl netlink family */ void batadv_netlink_unregister(void) { genl_unregister_family(&batadv_netlink_family); }
12 12 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "peerlookup.h" #include "peer.h" #include "noise.h" static struct hlist_head *pubkey_bucket(struct pubkey_hashtable *table, const u8 pubkey[NOISE_PUBLIC_KEY_LEN]) { /* siphash gives us a secure 64bit number based on a random key. Since * the bits are uniformly distributed, we can then mask off to get the * bits we need. */ const u64 hash = siphash(pubkey, NOISE_PUBLIC_KEY_LEN, &table->key); return &table->hashtable[hash & (HASH_SIZE(table->hashtable) - 1)]; } struct pubkey_hashtable *wg_pubkey_hashtable_alloc(void) { struct pubkey_hashtable *table = kvmalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; get_random_bytes(&table->key, sizeof(table->key)); hash_init(table->hashtable); mutex_init(&table->lock); return table; } void wg_pubkey_hashtable_add(struct pubkey_hashtable *table, struct wg_peer *peer) { mutex_lock(&table->lock); hlist_add_head_rcu(&peer->pubkey_hash, pubkey_bucket(table, peer->handshake.remote_static)); mutex_unlock(&table->lock); } void wg_pubkey_hashtable_remove(struct pubkey_hashtable *table, struct wg_peer *peer) { mutex_lock(&table->lock); hlist_del_init_rcu(&peer->pubkey_hash); mutex_unlock(&table->lock); } /* Returns a strong reference to a peer */ struct wg_peer * wg_pubkey_hashtable_lookup(struct pubkey_hashtable *table, const u8 pubkey[NOISE_PUBLIC_KEY_LEN]) { struct wg_peer *iter_peer, *peer = NULL; rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(iter_peer, pubkey_bucket(table, pubkey), pubkey_hash) { if (!memcmp(pubkey, iter_peer->handshake.remote_static, NOISE_PUBLIC_KEY_LEN)) { peer = iter_peer; break; } } peer = wg_peer_get_maybe_zero(peer); rcu_read_unlock_bh(); return peer; } static struct hlist_head *index_bucket(struct index_hashtable *table, const __le32 index) { /* Since the indices are random and thus all bits are uniformly * distributed, we can find its bucket simply by masking. */ return &table->hashtable[(__force u32)index & (HASH_SIZE(table->hashtable) - 1)]; } struct index_hashtable *wg_index_hashtable_alloc(void) { struct index_hashtable *table = kvmalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; hash_init(table->hashtable); spin_lock_init(&table->lock); return table; } /* At the moment, we limit ourselves to 2^20 total peers, which generally might * amount to 2^20*3 items in this hashtable. The algorithm below works by * picking a random number and testing it. We can see that these limits mean we * usually succeed pretty quickly: * * >>> def calculation(tries, size): * ... return (size / 2**32)**(tries - 1) * (1 - (size / 2**32)) * ... * >>> calculation(1, 2**20 * 3) * 0.999267578125 * >>> calculation(2, 2**20 * 3) * 0.0007318854331970215 * >>> calculation(3, 2**20 * 3) * 5.360489012673497e-07 * >>> calculation(4, 2**20 * 3) * 3.9261394135792216e-10 * * At the moment, we don't do any masking, so this algorithm isn't exactly * constant time in either the random guessing or in the hash list lookup. We * could require a minimum of 3 tries, which would successfully mask the * guessing. this would not, however, help with the growing hash lengths, which * is another thing to consider moving forward. */ __le32 wg_index_hashtable_insert(struct index_hashtable *table, struct index_hashtable_entry *entry) { struct index_hashtable_entry *existing_entry; spin_lock_bh(&table->lock); hlist_del_init_rcu(&entry->index_hash); spin_unlock_bh(&table->lock); rcu_read_lock_bh(); search_unused_slot: /* First we try to find an unused slot, randomly, while unlocked. */ entry->index = (__force __le32)get_random_u32(); hlist_for_each_entry_rcu_bh(existing_entry, index_bucket(table, entry->index), index_hash) { if (existing_entry->index == entry->index) /* If it's already in use, we continue searching. */ goto search_unused_slot; } /* Once we've found an unused slot, we lock it, and then double-check * that nobody else stole it from us. */ spin_lock_bh(&table->lock); hlist_for_each_entry_rcu_bh(existing_entry, index_bucket(table, entry->index), index_hash) { if (existing_entry->index == entry->index) { spin_unlock_bh(&table->lock); /* If it was stolen, we start over. */ goto search_unused_slot; } } /* Otherwise, we know we have it exclusively (since we're locked), * so we insert. */ hlist_add_head_rcu(&entry->index_hash, index_bucket(table, entry->index)); spin_unlock_bh(&table->lock); rcu_read_unlock_bh(); return entry->index; } bool wg_index_hashtable_replace(struct index_hashtable *table, struct index_hashtable_entry *old, struct index_hashtable_entry *new) { bool ret; spin_lock_bh(&table->lock); ret = !hlist_unhashed(&old->index_hash); if (unlikely(!ret)) goto out; new->index = old->index; hlist_replace_rcu(&old->index_hash, &new->index_hash); /* Calling init here NULLs out index_hash, and in fact after this * function returns, it's theoretically possible for this to get * reinserted elsewhere. That means the RCU lookup below might either * terminate early or jump between buckets, in which case the packet * simply gets dropped, which isn't terrible. */ INIT_HLIST_NODE(&old->index_hash); out: spin_unlock_bh(&table->lock); return ret; } void wg_index_hashtable_remove(struct index_hashtable *table, struct index_hashtable_entry *entry) { spin_lock_bh(&table->lock); hlist_del_init_rcu(&entry->index_hash); spin_unlock_bh(&table->lock); } /* Returns a strong reference to a entry->peer */ struct index_hashtable_entry * wg_index_hashtable_lookup(struct index_hashtable *table, const enum index_hashtable_type type_mask, const __le32 index, struct wg_peer **peer) { struct index_hashtable_entry *iter_entry, *entry = NULL; rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(iter_entry, index_bucket(table, index), index_hash) { if (iter_entry->index == index) { if (likely(iter_entry->type & type_mask)) entry = iter_entry; break; } } if (likely(entry)) { entry->peer = wg_peer_get_maybe_zero(entry->peer); if (likely(entry->peer)) *peer = entry->peer; else entry = NULL; } rcu_read_unlock_bh(); return entry; }
4 4 4 4 4 4 1 1 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool_netlink.h> #include <net/udp_tunnel.h> #include <net/vxlan.h> #include "bitset.h" #include "common.h" #include "netlink.h" const struct nla_policy ethnl_tunnel_info_get_policy[] = { [ETHTOOL_A_TUNNEL_INFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN == ilog2(UDP_TUNNEL_TYPE_VXLAN)); static_assert(ETHTOOL_UDP_TUNNEL_TYPE_GENEVE == ilog2(UDP_TUNNEL_TYPE_GENEVE)); static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE == ilog2(UDP_TUNNEL_TYPE_VXLAN_GPE)); static ssize_t ethnl_udp_table_reply_size(unsigned int types, bool compact) { ssize_t size; size = ethnl_bitset32_size(&types, NULL, __ETHTOOL_UDP_TUNNEL_TYPE_CNT, udp_tunnel_type_names, compact); if (size < 0) return size; return size + nla_total_size(0) + /* _UDP_TABLE */ nla_total_size(sizeof(u32)); /* _UDP_TABLE_SIZE */ } static ssize_t ethnl_tunnel_info_reply_size(const struct ethnl_req_info *req_base, struct netlink_ext_ack *extack) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct udp_tunnel_nic_info *info; unsigned int i; ssize_t ret; size_t size; info = req_base->dev->udp_tunnel_nic_info; if (!info) { NL_SET_ERR_MSG(extack, "device does not report tunnel offload info"); return -EOPNOTSUPP; } size = nla_total_size(0); /* _INFO_UDP_PORTS */ for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { if (!info->tables[i].n_entries) break; ret = ethnl_udp_table_reply_size(info->tables[i].tunnel_types, compact); if (ret < 0) return ret; size += ret; size += udp_tunnel_nic_dump_size(req_base->dev, i); } if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN) { ret = ethnl_udp_table_reply_size(0, compact); if (ret < 0) return ret; size += ret; size += nla_total_size(0) + /* _TABLE_ENTRY */ nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ } return size; } static int ethnl_tunnel_info_fill_reply(const struct ethnl_req_info *req_base, struct sk_buff *skb) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct udp_tunnel_nic_info *info; struct nlattr *ports, *table, *entry; unsigned int i; info = req_base->dev->udp_tunnel_nic_info; if (!info) return -EOPNOTSUPP; ports = nla_nest_start(skb, ETHTOOL_A_TUNNEL_INFO_UDP_PORTS); if (!ports) return -EMSGSIZE; for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { if (!info->tables[i].n_entries) break; table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE); if (!table) goto err_cancel_ports; if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, info->tables[i].n_entries)) goto err_cancel_table; if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, &info->tables[i].tunnel_types, NULL, __ETHTOOL_UDP_TUNNEL_TYPE_CNT, udp_tunnel_type_names, compact)) goto err_cancel_table; if (udp_tunnel_nic_dump_write(req_base->dev, i, skb)) goto err_cancel_table; nla_nest_end(skb, table); } if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN) { u32 zero = 0; table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE); if (!table) goto err_cancel_ports; if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, 1)) goto err_cancel_table; if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, &zero, NULL, __ETHTOOL_UDP_TUNNEL_TYPE_CNT, udp_tunnel_type_names, compact)) goto err_cancel_table; entry = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); if (!entry) goto err_cancel_entry; if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, htons(IANA_VXLAN_UDP_PORT)) || nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, ilog2(UDP_TUNNEL_TYPE_VXLAN))) goto err_cancel_entry; nla_nest_end(skb, entry); nla_nest_end(skb, table); } nla_nest_end(skb, ports); return 0; err_cancel_entry: nla_nest_cancel(skb, entry); err_cancel_table: nla_nest_cancel(skb, table); err_cancel_ports: nla_nest_cancel(skb, ports); return -EMSGSIZE; } int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info) { struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; struct sk_buff *rskb; void *reply_payload; int reply_len; int ret; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_TUNNEL_INFO_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; rtnl_lock(); ret = ethnl_tunnel_info_reply_size(&req_info, info->extack); if (ret < 0) goto err_unlock_rtnl; reply_len = ret + ethnl_reply_header_size(); rskb = ethnl_reply_init(reply_len, req_info.dev, ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, ETHTOOL_A_TUNNEL_INFO_HEADER, info, &reply_payload); if (!rskb) { ret = -ENOMEM; goto err_unlock_rtnl; } ret = ethnl_tunnel_info_fill_reply(&req_info, rskb); if (ret) goto err_free_msg; rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); genlmsg_end(rskb, reply_payload); return genlmsg_reply(rskb, info); err_free_msg: nlmsg_free(rskb); err_unlock_rtnl: rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); return ret; } struct ethnl_tunnel_info_dump_ctx { struct ethnl_req_info req_info; unsigned long ifindex; }; int ethnl_tunnel_info_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; struct nlattr **tb = info->info.attrs; int ret; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); memset(ctx, 0, sizeof(*ctx)); ret = ethnl_parse_header_dev_get(&ctx->req_info, tb[ETHTOOL_A_TUNNEL_INFO_HEADER], sock_net(cb->skb->sk), cb->extack, false); if (ctx->req_info.dev) { ethnl_parse_header_dev_put(&ctx->req_info); ctx->req_info.dev = NULL; } return ret; } int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); struct net_device *dev; int ret = 0; void *ehdr; rtnl_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { ehdr = ethnl_dump_put(skb, cb, ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY); if (!ehdr) { ret = -EMSGSIZE; break; } ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TUNNEL_INFO_HEADER); if (ret < 0) { genlmsg_cancel(skb, ehdr); break; } ctx->req_info.dev = dev; ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); ctx->req_info.dev = NULL; if (ret < 0) { genlmsg_cancel(skb, ehdr); if (ret == -EOPNOTSUPP) continue; break; } genlmsg_end(skb, ehdr); } rtnl_unlock(); if (ret == -EMSGSIZE && skb->len) return skb->len; return ret; }
29 24 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/syscalls.h> #include <linux/time_namespace.h> #include "futex.h" /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. * * Implementation: user-space maintains a per-thread list of locks it * is holding. Upon do_exit(), the kernel carefully walks this list, * and marks all locks that are owned by this thread with the * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is * always manipulated with the lock held, so the list is private and * per-thread. Userspace also maintains a per-thread 'list_op_pending' * field, to allow the kernel to clean up if the thread dies after * acquiring the lock, but just before it could have added itself to * the list. There can only be one such pending lock. */ /** * sys_set_robust_list() - Set the robust-futex list head of a task * @head: pointer to the list-head * @len: length of the list-head, as userspace expects */ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len) { /* * The kernel knows only one size for now: */ if (unlikely(len != sizeof(*head))) return -EINVAL; current->robust_list = head; return 0; } /** * sys_get_robust_list() - Get the robust-futex list head of a task * @pid: pid of the process [zero for current task] * @head_ptr: pointer to a list-head pointer, the kernel fills it in * @len_ptr: pointer to a length field, the kernel fills in the header size */ SYSCALL_DEFINE3(get_robust_list, int, pid, struct robust_list_head __user * __user *, head_ptr, size_t __user *, len_ptr) { struct robust_list_head __user *head; unsigned long ret; struct task_struct *p; rcu_read_lock(); ret = -ESRCH; if (!pid) p = current; else { p = find_task_by_vpid(pid); if (!p) goto err_unlock; } ret = -EPERM; if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->robust_list; rcu_read_unlock(); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(head, head_ptr); err_unlock: rcu_read_unlock(); return ret; } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { unsigned int flags = futex_to_flags(op); int cmd = op & FUTEX_CMD_MASK; if (flags & FLAGS_CLOCKRT) { if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && cmd != FUTEX_LOCK_PI2) return -ENOSYS; } switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; case FUTEX_WAIT_BITSET: return futex_wait(uaddr, flags, val, timeout, val3); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0); case FUTEX_CMP_REQUEUE: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0); case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: flags |= FLAGS_CLOCKRT; fallthrough; case FUTEX_LOCK_PI2: return futex_lock_pi(uaddr, flags, timeout, 0); case FUTEX_UNLOCK_PI: return futex_unlock_pi(uaddr, flags); case FUTEX_TRYLOCK_PI: return futex_lock_pi(uaddr, flags, NULL, 1); case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, uaddr2); case FUTEX_CMP_REQUEUE_PI: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1); } return -ENOSYS; } static __always_inline bool futex_cmd_has_timeout(u32 cmd) { switch (cmd) { case FUTEX_WAIT: case FUTEX_LOCK_PI: case FUTEX_LOCK_PI2: case FUTEX_WAIT_BITSET: case FUTEX_WAIT_REQUEUE_PI: return true; } return false; } static __always_inline int futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) { if (!timespec64_valid(ts)) return -EINVAL; *t = timespec64_to_ktime(*ts); if (cmd == FUTEX_WAIT) *t = ktime_add_safe(ktime_get(), *t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); return 0; } SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) { int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; struct timespec64 ts; if (utime && futex_cmd_has_timeout(cmd)) { if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) return -EFAULT; if (get_timespec64(&ts, utime)) return -EFAULT; ret = futex_init_timeout(cmd, op, &ts, &t); if (ret) return ret; tp = &t; } return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } /** * futex_parse_waitv - Parse a waitv array from userspace * @futexv: Kernel side list of waiters to be filled * @uwaitv: Userspace list to be parsed * @nr_futexes: Length of futexv * @wake: Wake to call when futex is woken * @wake_data: Data for the wake handler * * Return: Error code on failure, 0 on success */ int futex_parse_waitv(struct futex_vector *futexv, struct futex_waitv __user *uwaitv, unsigned int nr_futexes, futex_wake_fn *wake, void *wake_data) { struct futex_waitv aux; unsigned int i; for (i = 0; i < nr_futexes; i++) { unsigned int flags; if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) return -EFAULT; if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved) return -EINVAL; flags = futex2_to_flags(aux.flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, aux.val)) return -EINVAL; futexv[i].w.flags = flags; futexv[i].w.val = aux.val; futexv[i].w.uaddr = aux.uaddr; futexv[i].q = futex_q_init; futexv[i].q.wake = wake; futexv[i].q.wake_data = wake_data; } return 0; } static int futex2_setup_timeout(struct __kernel_timespec __user *timeout, clockid_t clockid, struct hrtimer_sleeper *to) { int flag_clkid = 0, flag_init = 0; struct timespec64 ts; ktime_t time; int ret; if (!timeout) return 0; if (clockid == CLOCK_REALTIME) { flag_clkid = FLAGS_CLOCKRT; flag_init = FUTEX_CLOCK_REALTIME; } if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) return -EINVAL; if (get_timespec64(&ts, timeout)) return -EFAULT; /* * Since there's no opcode for futex_waitv, use * FUTEX_WAIT_BITSET that uses absolute timeout as well */ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); if (ret) return ret; futex_setup_timer(&time, to, flag_clkid, 0); return 0; } static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to) { hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); } /** * sys_futex_waitv - Wait on a list of futexes * @waiters: List of futexes to wait on * @nr_futexes: Length of futexv * @flags: Flag for timeout (monotonic/realtime) * @timeout: Optional absolute timeout. * @clockid: Clock to be used for the timeout, realtime or monotonic. * * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes * if a futex_wake() is performed at any uaddr. The syscall returns immediately * if any waiter has *uaddr != val. *timeout is an optional timeout value for * the operation. Each waiter has individual flags. The `flags` argument for * the syscall should be used solely for specifying the timeout as realtime, if * needed. Flags for private futexes, sizes, etc. should be used on the * individual flags of each waiter. * * Returns the array index of one of the woken futexes. No further information * is provided: any number of other futexes may also have been woken by the * same event, and if more than one futex was woken, the retrned index may * refer to any one of them. (It is not necessaryily the futex with the * smallest index, nor the one most recently woken, nor...) */ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, unsigned int, nr_futexes, unsigned int, flags, struct __kernel_timespec __user *, timeout, clockid_t, clockid) { struct hrtimer_sleeper to; struct futex_vector *futexv; int ret; /* This syscall supports no flags for now */ if (flags) return -EINVAL; if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) return -EINVAL; if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) return ret; futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); if (!futexv) { ret = -ENOMEM; goto destroy_timer; } ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark, NULL); if (!ret) ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL); kfree(futexv); destroy_timer: if (timeout) futex2_destroy_timeout(&to); return ret; } /* * sys_futex_wake - Wake a number of futexes * @uaddr: Address of the futex(es) to wake * @mask: bitmask * @nr: Number of the futexes to wake * @flags: FUTEX2 flags * * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the * futex2 family of calls. */ SYSCALL_DEFINE4(futex_wake, void __user *, uaddr, unsigned long, mask, int, nr, unsigned int, flags) { if (flags & ~FUTEX2_VALID_MASK) return -EINVAL; flags = futex2_to_flags(flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, mask)) return -EINVAL; return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask); } /* * sys_futex_wait - Wait on a futex * @uaddr: Address of the futex to wait on * @val: Value of @uaddr * @mask: bitmask * @flags: FUTEX2 flags * @timeout: Optional absolute timeout * @clockid: Clock to be used for the timeout, realtime or monotonic * * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the * futex2 familiy of calls. */ SYSCALL_DEFINE6(futex_wait, void __user *, uaddr, unsigned long, val, unsigned long, mask, unsigned int, flags, struct __kernel_timespec __user *, timeout, clockid_t, clockid) { struct hrtimer_sleeper to; int ret; if (flags & ~FUTEX2_VALID_MASK) return -EINVAL; flags = futex2_to_flags(flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, val) || !futex_validate_input(flags, mask)) return -EINVAL; if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) return ret; ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask); if (timeout) futex2_destroy_timeout(&to); return ret; } /* * sys_futex_requeue - Requeue a waiter from one futex to another * @waiters: array describing the source and destination futex * @flags: unused * @nr_wake: number of futexes to wake * @nr_requeue: number of futexes to requeue * * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the * futex2 family of calls. */ SYSCALL_DEFINE4(futex_requeue, struct futex_waitv __user *, waiters, unsigned int, flags, int, nr_wake, int, nr_requeue) { struct futex_vector futexes[2]; u32 cmpval; int ret; if (flags) return -EINVAL; if (!waiters) return -EINVAL; ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL); if (ret) return ret; cmpval = futexes[0].w.val; return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags, u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags, nr_wake, nr_requeue, &cmpval, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head, compat_size_t, len) { if (unlikely(len != sizeof(*head))) return -EINVAL; current->compat_robust_list = head; return 0; } COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, compat_uptr_t __user *, head_ptr, compat_size_t __user *, len_ptr) { struct compat_robust_list_head __user *head; unsigned long ret; struct task_struct *p; rcu_read_lock(); ret = -ESRCH; if (!pid) p = current; else { p = find_task_by_vpid(pid); if (!p) goto err_unlock; } ret = -EPERM; if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->compat_robust_list; rcu_read_unlock(); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(ptr_to_compat(head), head_ptr); err_unlock: rcu_read_unlock(); return ret; } #endif /* CONFIG_COMPAT */ #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; struct timespec64 ts; if (utime && futex_cmd_has_timeout(cmd)) { if (get_old_timespec32(&ts, utime)) return -EFAULT; ret = futex_init_timeout(cmd, op, &ts, &t); if (ret) return ret; tp = &t; } return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */
6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 6 6 6 7 7 7 7 7 7 7 7 7 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 // SPDX-License-Identifier: GPL-2.0-only /* * Scanning implementation * * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright 2016-2017 Intel Deutschland GmbH * Copyright (C) 2018-2024 Intel Corporation */ #include <linux/if_arp.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/sch_generic.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/random.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "mesh.h" #define IEEE80211_PROBE_DELAY (HZ / 33) #define IEEE80211_CHANNEL_TIME (HZ / 33) #define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 9) void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss) { if (!bss) return; cfg80211_put_bss(local->hw.wiphy, container_of((void *)bss, struct cfg80211_bss, priv)); } static bool is_uapsd_supported(struct ieee802_11_elems *elems) { u8 qos_info; if (elems->wmm_info && elems->wmm_info_len == 7 && elems->wmm_info[5] == 1) qos_info = elems->wmm_info[6]; else if (elems->wmm_param && elems->wmm_param_len == 24 && elems->wmm_param[5] == 1) qos_info = elems->wmm_param[6]; else /* no valid wmm information or parameter element found */ return false; return qos_info & IEEE80211_WMM_IE_AP_QOSINFO_UAPSD; } struct inform_bss_update_data { struct ieee80211_rx_status *rx_status; bool beacon; }; void ieee80211_inform_bss(struct wiphy *wiphy, struct cfg80211_bss *cbss, const struct cfg80211_bss_ies *ies, void *data) { struct ieee80211_local *local = wiphy_priv(wiphy); struct inform_bss_update_data *update_data = data; struct ieee80211_bss *bss = (void *)cbss->priv; struct ieee80211_rx_status *rx_status; struct ieee802_11_elems *elems; int clen, srlen; /* This happens while joining an IBSS */ if (!update_data) return; elems = ieee802_11_parse_elems(ies->data, ies->len, false, NULL); if (!elems) return; rx_status = update_data->rx_status; if (update_data->beacon) bss->device_ts_beacon = rx_status->device_timestamp; else bss->device_ts_presp = rx_status->device_timestamp; if (elems->parse_error) { if (update_data->beacon) bss->corrupt_data |= IEEE80211_BSS_CORRUPT_BEACON; else bss->corrupt_data |= IEEE80211_BSS_CORRUPT_PROBE_RESP; } else { if (update_data->beacon) bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_BEACON; else bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_PROBE_RESP; } /* save the ERP value so that it is available at association time */ if (elems->erp_info && (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_ERP))) { bss->erp_value = elems->erp_info[0]; bss->has_erp_value = true; if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_ERP; } /* replace old supported rates if we get new values */ if (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_RATES)) { srlen = 0; if (elems->supp_rates) { clen = IEEE80211_MAX_SUPP_RATES; if (clen > elems->supp_rates_len) clen = elems->supp_rates_len; memcpy(bss->supp_rates, elems->supp_rates, clen); srlen += clen; } if (elems->ext_supp_rates) { clen = IEEE80211_MAX_SUPP_RATES - srlen; if (clen > elems->ext_supp_rates_len) clen = elems->ext_supp_rates_len; memcpy(bss->supp_rates + srlen, elems->ext_supp_rates, clen); srlen += clen; } if (srlen) { bss->supp_rates_len = srlen; if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_RATES; } } if (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_WMM)) { bss->wmm_used = elems->wmm_param || elems->wmm_info; bss->uapsd_supported = is_uapsd_supported(elems); if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_WMM; } if (update_data->beacon) { struct ieee80211_supported_band *sband = local->hw.wiphy->bands[rx_status->band]; if (!(rx_status->encoding == RX_ENC_HT) && !(rx_status->encoding == RX_ENC_VHT)) bss->beacon_rate = &sband->bitrates[rx_status->rate_idx]; } if (elems->vht_cap_elem) bss->vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); else bss->vht_cap_info = 0; kfree(elems); } struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_channel *channel) { bool beacon = ieee80211_is_beacon(mgmt->frame_control) || ieee80211_is_s1g_beacon(mgmt->frame_control); struct cfg80211_bss *cbss; struct inform_bss_update_data update_data = { .rx_status = rx_status, .beacon = beacon, }; struct cfg80211_inform_bss bss_meta = { .boottime_ns = rx_status->boottime_ns, .drv_data = (void *)&update_data, }; bool signal_valid; struct ieee80211_sub_if_data *scan_sdata; if (rx_status->flag & RX_FLAG_NO_SIGNAL_VAL) bss_meta.signal = 0; /* invalid signal indication */ else if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) bss_meta.signal = rx_status->signal * 100; else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal; bss_meta.chan = channel; rcu_read_lock(); scan_sdata = rcu_dereference(local->scan_sdata); if (scan_sdata && scan_sdata->vif.type == NL80211_IFTYPE_STATION && scan_sdata->vif.cfg.assoc && ieee80211_have_rx_timestamp(rx_status)) { struct ieee80211_bss_conf *link_conf = NULL; /* for an MLO connection, set the TSF data only in case we have * an indication on which of the links the frame was received */ if (ieee80211_vif_is_mld(&scan_sdata->vif)) { if (rx_status->link_valid) { s8 link_id = rx_status->link_id; link_conf = rcu_dereference(scan_sdata->vif.link_conf[link_id]); } } else { link_conf = &scan_sdata->vif.bss_conf; } if (link_conf) { bss_meta.parent_tsf = ieee80211_calculate_rx_timestamp(local, rx_status, len + FCS_LEN, 24); ether_addr_copy(bss_meta.parent_bssid, link_conf->bssid); } } rcu_read_unlock(); cbss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt, len, GFP_ATOMIC); if (!cbss) return NULL; /* In case the signal is invalid update the status */ signal_valid = channel == cbss->channel; if (!signal_valid) rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL; return (void *)cbss->priv; } static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, u32 scan_flags, const u8 *da) { if (!sdata) return false; /* accept broadcast on 6 GHz and for OCE */ if (is_broadcast_ether_addr(da) && (channel->band == NL80211_BAND_6GHZ || scan_flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP)) return true; if (scan_flags & NL80211_SCAN_FLAG_RANDOM_ADDR) return true; return ether_addr_equal(da, sdata->vif.addr); } void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_bss *bss; struct ieee80211_channel *channel; size_t min_hdr_len = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); if (!ieee80211_is_probe_resp(mgmt->frame_control) && !ieee80211_is_beacon(mgmt->frame_control) && !ieee80211_is_s1g_beacon(mgmt->frame_control)) return; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_short_beacon.variable); else min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon); } if (skb->len < min_hdr_len) return; if (test_and_clear_bit(SCAN_BEACON_WAIT, &local->scanning)) { /* * we were passive scanning because of radar/no-IR, but * the beacon/proberesp rx gives us an opportunity to upgrade * to active scan */ set_bit(SCAN_BEACON_DONE, &local->scanning); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); } channel = ieee80211_get_channel_khz(local->hw.wiphy, ieee80211_rx_status_to_khz(rx_status)); if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; if (ieee80211_is_probe_resp(mgmt->frame_control)) { struct ieee80211_sub_if_data *sdata1, *sdata2; struct cfg80211_scan_request *scan_req; struct cfg80211_sched_scan_request *sched_scan_req; u32 scan_req_flags = 0, sched_scan_req_flags = 0; sdata1 = rcu_dereference(local->scan_sdata); sdata2 = rcu_dereference(local->sched_scan_sdata); if (likely(!sdata1 && !sdata2)) return; scan_req = rcu_dereference(local->scan_req); sched_scan_req = rcu_dereference(local->sched_scan_req); if (scan_req) scan_req_flags = scan_req->flags; if (sched_scan_req) sched_scan_req_flags = sched_scan_req->flags; /* ignore ProbeResp to foreign address or non-bcast (OCE) * unless scanning with randomised address */ if (!ieee80211_scan_accept_presp(sdata1, channel, scan_req_flags, mgmt->da) && !ieee80211_scan_accept_presp(sdata2, channel, sched_scan_req_flags, mgmt->da)) return; } else { /* Beacons are expected only with broadcast address */ if (!is_broadcast_ether_addr(mgmt->da)) return; } /* Do not update the BSS table in case of only monitor interfaces */ if (local->open_count == local->monitors) return; bss = ieee80211_bss_info_update(local, rx_status, mgmt, skb->len, channel); if (bss) ieee80211_rx_bss_put(local, bss); } static void ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef) { memset(chandef, 0, sizeof(*chandef)); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; } /* return false if no more work */ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct cfg80211_scan_request *req; struct cfg80211_chan_def chandef; u8 bands_used = 0; int i, ielen; u32 *n_chans; u32 flags = 0; req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); if (test_bit(SCAN_HW_CANCELLED, &local->scanning)) return false; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { local->hw_scan_req->req.n_channels = req->n_channels; for (i = 0; i < req->n_channels; i++) { local->hw_scan_req->req.channels[i] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); } } else { do { if (local->hw_scan_band == NUM_NL80211_BANDS) return false; n_chans = &local->hw_scan_req->req.n_channels; *n_chans = 0; for (i = 0; i < req->n_channels; i++) { if (req->channels[i]->band != local->hw_scan_band) continue; local->hw_scan_req->req.channels[(*n_chans)++] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); } local->hw_scan_band++; } while (!*n_chans); } ieee80211_prepare_scan_chandef(&chandef); if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; ielen = ieee80211_build_preq_ies(sdata, (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, req->ie, req->ie_len, bands_used, req->rates, &chandef, flags); if (ielen < 0) return false; local->hw_scan_req->req.ie_len = ielen; local->hw_scan_req->req.no_cck = req->no_cck; ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr); ether_addr_copy(local->hw_scan_req->req.mac_addr_mask, req->mac_addr_mask); ether_addr_copy(local->hw_scan_req->req.bssid, req->bssid); return true; } static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) { struct ieee80211_local *local = hw_to_local(hw); bool hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning); bool was_scanning = local->scanning; struct cfg80211_scan_request *scan_req; struct ieee80211_sub_if_data *scan_sdata; struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(local->hw.wiphy); /* * It's ok to abort a not-yet-running scan (that * we have one at all will be verified by checking * local->scan_req next), but not to complete it * successfully. */ if (WARN_ON(!local->scanning && !aborted)) aborted = true; if (WARN_ON(!local->scan_req)) return; scan_sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); if (hw_scan && !aborted && !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) && ieee80211_prep_hw_scan(scan_sdata)) { int rc; rc = drv_hw_scan(local, rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)), local->hw_scan_req); if (rc == 0) return; /* HW scan failed and is going to be reported as aborted, * so clear old scan info. */ memset(&local->scan_info, 0, sizeof(local->scan_info)); aborted = true; } kfree(local->hw_scan_req); local->hw_scan_req = NULL; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); local->scanning = 0; local->scan_chandef.chan = NULL; synchronize_rcu(); if (scan_req != local->int_scan_req) { local->scan_info.aborted = aborted; cfg80211_scan_done(scan_req, &local->scan_info); } /* Set power back to normal operating levels. */ ieee80211_hw_conf_chan(local); if (!hw_scan && was_scanning) { ieee80211_configure_filter(local); drv_sw_scan_complete(local, scan_sdata); ieee80211_offchannel_return(local); } ieee80211_recalc_idle(local); ieee80211_mlme_notify_scan_completed(local); ieee80211_ibss_notify_scan_completed(local); /* Requeue all the work that might have been ignored while * the scan was in progress; if there was none this will * just be a no-op for the particular interface. */ list_for_each_entry(sdata, &local->interfaces, list) { if (ieee80211_sdata_running(sdata)) wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } if (was_scanning) ieee80211_start_next_roc(local); } void ieee80211_scan_completed(struct ieee80211_hw *hw, struct cfg80211_scan_info *info) { struct ieee80211_local *local = hw_to_local(hw); trace_api_scan_completed(local, info->aborted); set_bit(SCAN_COMPLETED, &local->scanning); if (info->aborted) set_bit(SCAN_ABORTED, &local->scanning); memcpy(&local->scan_info, info, sizeof(*info)); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); } EXPORT_SYMBOL(ieee80211_scan_completed); static int ieee80211_start_sw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { /* Software scan is not supported in multi-channel cases */ if (!local->emulate_chanctx) return -EOPNOTSUPP; /* * Hardware/driver doesn't support hw_scan, so use software * scanning instead. First send a nullfunc frame with power save * bit on so that AP will buffer the frames for us while we are not * listening, then send probe requests to each channel and wait for * the responses. After all channels are scanned, tune back to the * original channel and send a nullfunc frame with power save bit * off to trigger the AP to send us all the buffered frames. * * Note that while local->sw_scanning is true everything else but * nullfunc frames and probe requests will be dropped in * ieee80211_tx_h_check_assoc(). */ drv_sw_scan_start(local, sdata, local->scan_addr); local->leave_oper_channel_time = jiffies; local->next_scan_state = SCAN_DECISION; local->scan_channel_idx = 0; ieee80211_offchannel_stop_vifs(local); /* ensure nullfunc is transmitted before leaving operating channel */ ieee80211_flush_queues(local, NULL, false); ieee80211_configure_filter(local); /* We need to set power level at maximum rate for scanning. */ ieee80211_hw_conf_chan(local); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); return 0; } static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *sdata_iter; unsigned int link_id; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_is_radar_required(local)) return true; if (!regulatory_pre_cac_allowed(local->hw.wiphy)) return false; list_for_each_entry(sdata_iter, &local->interfaces, list) { for_each_valid_link(&sdata_iter->wdev, link_id) if (sdata_iter->wdev.links[link_id].cac_started) return false; } return true; } static bool ieee80211_can_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { if (!__ieee80211_can_leave_ch(sdata)) return false; if (!list_empty(&local->roc_list)) return false; if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.flags & IEEE80211_STA_CONNECTION_POLL) return false; return true; } void ieee80211_run_deferred_scan(struct ieee80211_local *local) { lockdep_assert_wiphy(local->hw.wiphy); if (!local->scan_req || local->scanning) return; if (!ieee80211_can_scan(local, rcu_dereference_protected( local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)))) return; wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, round_jiffies_relative(0)); } static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 ratemask, u32 flags, u32 tx_flags, struct ieee80211_channel *channel) { struct sk_buff *skb; skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, ssid, ssid_len, ie, ie_len, flags); if (skb) { if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) { struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 sn = get_random_u16(); info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; hdr->seq_ctrl = cpu_to_le16(IEEE80211_SN_TO_SEQ(sn)); } IEEE80211_SKB_CB(skb)->flags |= tx_flags; IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_CTRL_DONT_USE_RATE_MASK; ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); } } static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, unsigned long *next_delay) { int i; struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; enum nl80211_band band = local->hw.conf.chandef.chan->band; u32 flags = 0, tx_flags; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (scan_req->no_cck) tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE; if (scan_req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; if (scan_req->flags & NL80211_SCAN_FLAG_RANDOM_SN) flags |= IEEE80211_PROBE_FLAG_RANDOM_SN; sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); for (i = 0; i < scan_req->n_ssids; i++) ieee80211_send_scan_probe_req( sdata, local->scan_addr, scan_req->bssid, scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len, scan_req->ie, scan_req->ie_len, scan_req->rates[band], flags, tx_flags, local->hw.conf.chandef.chan); /* * After sending probe requests, wait for probe responses * on the channel. */ *next_delay = msecs_to_jiffies(scan_req->duration) > IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME ? msecs_to_jiffies(scan_req->duration) - IEEE80211_PROBE_DELAY : IEEE80211_CHANNEL_TIME; local->next_scan_state = SCAN_DECISION; } static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { struct ieee80211_local *local = sdata->local; bool hw_scan = local->ops->hw_scan; int rc; lockdep_assert_wiphy(local->hw.wiphy); if (local->scan_req) return -EBUSY; /* For an MLO connection, if a link ID was specified, validate that it * is indeed active. */ if (ieee80211_vif_is_mld(&sdata->vif) && req->tsf_report_link_id >= 0 && !(sdata->vif.active_links & BIT(req->tsf_report_link_id))) return -EINVAL; if (!__ieee80211_can_leave_ch(sdata)) return -EBUSY; if (!ieee80211_can_scan(local, sdata)) { /* wait for the work to finish/time out */ rcu_assign_pointer(local->scan_req, req); rcu_assign_pointer(local->scan_sdata, sdata); return 0; } again: if (hw_scan) { u8 *ies; local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { int i, n_bands = 0; u8 bands_counted = 0; for (i = 0; i < req->n_channels; i++) { if (bands_counted & BIT(req->channels[i]->band)) continue; bands_counted |= BIT(req->channels[i]->band); n_bands++; } local->hw_scan_ies_bufsize *= n_bands; } local->hw_scan_req = kmalloc(struct_size(local->hw_scan_req, req.channels, req->n_channels) + local->hw_scan_ies_bufsize, GFP_KERNEL); if (!local->hw_scan_req) return -ENOMEM; local->hw_scan_req->req.ssids = req->ssids; local->hw_scan_req->req.n_ssids = req->n_ssids; /* None of the channels are actually set * up but let UBSAN know the boundaries. */ local->hw_scan_req->req.n_channels = req->n_channels; ies = (u8 *)local->hw_scan_req + sizeof(*local->hw_scan_req) + req->n_channels * sizeof(req->channels[0]); local->hw_scan_req->req.ie = ies; local->hw_scan_req->req.flags = req->flags; eth_broadcast_addr(local->hw_scan_req->req.bssid); local->hw_scan_req->req.duration = req->duration; local->hw_scan_req->req.duration_mandatory = req->duration_mandatory; local->hw_scan_req->req.tsf_report_link_id = req->tsf_report_link_id; local->hw_scan_band = 0; local->hw_scan_req->req.n_6ghz_params = req->n_6ghz_params; local->hw_scan_req->req.scan_6ghz_params = req->scan_6ghz_params; local->hw_scan_req->req.scan_6ghz = req->scan_6ghz; /* * After allocating local->hw_scan_req, we must * go through until ieee80211_prep_hw_scan(), so * anything that might be changed here and leave * this function early must not go after this * allocation. */ } rcu_assign_pointer(local->scan_req, req); rcu_assign_pointer(local->scan_sdata, sdata); if (req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) get_random_mask_addr(local->scan_addr, req->mac_addr, req->mac_addr_mask); else memcpy(local->scan_addr, sdata->vif.addr, ETH_ALEN); if (hw_scan) { __set_bit(SCAN_HW_SCANNING, &local->scanning); } else if ((req->n_channels == 1) && (req->channels[0] == local->hw.conf.chandef.chan)) { /* * If we are scanning only on the operating channel * then we do not need to stop normal activities */ unsigned long next_delay; __set_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning); ieee80211_recalc_idle(local); /* Notify driver scan is starting, keep order of operations * same as normal software scan, in case that matters. */ drv_sw_scan_start(local, sdata, local->scan_addr); ieee80211_configure_filter(local); /* accept probe-responses */ /* We need to ensure power level is at max for scanning. */ ieee80211_hw_conf_chan(local); if ((req->channels[0]->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !req->n_ssids) { next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; if (req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); } else { ieee80211_scan_state_send_probe(local, &next_delay); next_delay = IEEE80211_CHANNEL_TIME; } /* Now, just wait a bit and we are all done! */ wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, next_delay); return 0; } else { /* Do normal software scan */ __set_bit(SCAN_SW_SCANNING, &local->scanning); } ieee80211_recalc_idle(local); if (hw_scan) { WARN_ON(!ieee80211_prep_hw_scan(sdata)); rc = drv_hw_scan(local, sdata, local->hw_scan_req); } else { rc = ieee80211_start_sw_scan(local, sdata); } if (rc) { kfree(local->hw_scan_req); local->hw_scan_req = NULL; local->scanning = 0; ieee80211_recalc_idle(local); local->scan_req = NULL; RCU_INIT_POINTER(local->scan_sdata, NULL); } if (hw_scan && rc == 1) { /* * we can't fall back to software for P2P-GO * as it must update NoA etc. */ if (ieee80211_vif_type_p2p(&sdata->vif) == NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; hw_scan = false; goto again; } return rc; } static unsigned long ieee80211_scan_get_channel_time(struct ieee80211_channel *chan) { /* * TODO: channel switching also consumes quite some time, * add that delay as well to get a better estimation */ if (chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) return IEEE80211_PASSIVE_CHANNEL_TIME; return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME; } static void ieee80211_scan_state_decision(struct ieee80211_local *local, unsigned long *next_delay) { bool associated = false; bool tx_empty = true; bool bad_latency; struct ieee80211_sub_if_data *sdata; struct ieee80211_channel *next_chan; enum mac80211_scan_state next_scan_state; struct cfg80211_scan_request *scan_req; lockdep_assert_wiphy(local->hw.wiphy); /* * check if at least one STA interface is associated, * check if at least one STA interface has pending tx frames * and grab the lowest used beacon interval */ list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) { if (sdata->u.mgd.associated) { associated = true; if (!qdisc_all_tx_empty(sdata->dev)) { tx_empty = false; break; } } } } scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); next_chan = scan_req->channels[local->scan_channel_idx]; /* * we're currently scanning a different channel, let's * see if we can scan another channel without interfering * with the current traffic situation. * * Keep good latency, do not stay off-channel more than 125 ms. */ bad_latency = time_after(jiffies + ieee80211_scan_get_channel_time(next_chan), local->leave_oper_channel_time + HZ / 8); if (associated && !tx_empty) { if (scan_req->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) next_scan_state = SCAN_ABORT; else next_scan_state = SCAN_SUSPEND; } else if (associated && bad_latency) { next_scan_state = SCAN_SUSPEND; } else { next_scan_state = SCAN_SET_CHANNEL; } local->next_scan_state = next_scan_state; *next_delay = 0; } static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, unsigned long *next_delay) { int skip; struct ieee80211_channel *chan; struct cfg80211_scan_request *scan_req; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); skip = 0; chan = scan_req->channels[local->scan_channel_idx]; local->scan_chandef.chan = chan; local->scan_chandef.center_freq1 = chan->center_freq; local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; /* For scanning on the S1G band, detect the channel width according to * the channel being scanned. */ if (chan->band == NL80211_BAND_S1GHZ) { local->scan_chandef.width = ieee80211_s1g_channel_width(chan); goto set_channel; } /* If scanning on oper channel, use whatever channel-type * is currently in use. */ if (chan == local->hw.conf.chandef.chan) local->scan_chandef = local->hw.conf.chandef; else local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT; set_channel: if (ieee80211_hw_conf_chan(local)) skip = 1; /* advance state machine to next channel/band */ local->scan_channel_idx++; if (skip) { /* if we skip this channel return to the decision state */ local->next_scan_state = SCAN_DECISION; return; } /* * Probe delay is used to update the NAV, cf. 11.1.3.2.2 * (which unfortunately doesn't say _why_ step a) is done, * but it waits for the probe delay or until a frame is * received - and the received frame would update the NAV). * For now, we do not support waiting until a frame is * received. * * In any case, it is not necessary for a passive scan. */ if ((chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !scan_req->n_ssids) { *next_delay = max(msecs_to_jiffies(scan_req->duration), IEEE80211_PASSIVE_CHANNEL_TIME); local->next_scan_state = SCAN_DECISION; if (scan_req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); return; } /* active scan, send probes */ *next_delay = IEEE80211_PROBE_DELAY; local->next_scan_state = SCAN_SEND_PROBE; } static void ieee80211_scan_state_suspend(struct ieee80211_local *local, unsigned long *next_delay) { /* switch back to the operating channel */ local->scan_chandef.chan = NULL; ieee80211_hw_conf_chan(local); /* disable PS */ ieee80211_offchannel_return(local); *next_delay = HZ / 5; /* afterwards, resume scan & go to next channel */ local->next_scan_state = SCAN_RESUME; } static void ieee80211_scan_state_resume(struct ieee80211_local *local, unsigned long *next_delay) { ieee80211_offchannel_stop_vifs(local); if (local->ops->flush) { ieee80211_flush_queues(local, NULL, false); *next_delay = 0; } else *next_delay = HZ / 10; /* remember when we left the operating channel */ local->leave_oper_channel_time = jiffies; /* advance to the next channel to be scanned */ local->next_scan_state = SCAN_SET_CHANNEL; } void ieee80211_scan_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, scan_work.work); struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; unsigned long next_delay = 0; bool aborted; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_can_run_worker(local)) { aborted = true; goto out_complete; } sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); /* When scanning on-channel, the first-callback means completed. */ if (test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning)) { aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); goto out_complete; } if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) { aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); goto out_complete; } if (!sdata || !scan_req) return; if (!local->scanning) { int rc; RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); rc = __ieee80211_start_scan(sdata, scan_req); if (!rc) return; /* need to complete scan in cfg80211 */ rcu_assign_pointer(local->scan_req, scan_req); aborted = true; goto out_complete; } clear_bit(SCAN_BEACON_WAIT, &local->scanning); /* * as long as no delay is required advance immediately * without scheduling a new work */ do { if (!ieee80211_sdata_running(sdata)) { aborted = true; goto out_complete; } if (test_and_clear_bit(SCAN_BEACON_DONE, &local->scanning) && local->next_scan_state == SCAN_DECISION) local->next_scan_state = SCAN_SEND_PROBE; switch (local->next_scan_state) { case SCAN_DECISION: /* if no more bands/channels left, complete scan */ if (local->scan_channel_idx >= scan_req->n_channels) { aborted = false; goto out_complete; } ieee80211_scan_state_decision(local, &next_delay); break; case SCAN_SET_CHANNEL: ieee80211_scan_state_set_channel(local, &next_delay); break; case SCAN_SEND_PROBE: ieee80211_scan_state_send_probe(local, &next_delay); break; case SCAN_SUSPEND: ieee80211_scan_state_suspend(local, &next_delay); break; case SCAN_RESUME: ieee80211_scan_state_resume(local, &next_delay); break; case SCAN_ABORT: aborted = true; goto out_complete; } } while (next_delay == 0); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, next_delay); return; out_complete: __ieee80211_scan_completed(&local->hw, aborted); } int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { lockdep_assert_wiphy(sdata->local->hw.wiphy); return __ieee80211_start_scan(sdata, req); } int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, const u8 *ssid, u8 ssid_len, struct ieee80211_channel **channels, unsigned int n_channels) { struct ieee80211_local *local = sdata->local; int i, n_ch = 0; enum nl80211_band band; lockdep_assert_wiphy(local->hw.wiphy); /* busy scanning */ if (local->scan_req) return -EBUSY; /* fill internal scan request */ if (!channels) { int max_n; for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band] || band == NL80211_BAND_6GHZ) continue; max_n = local->hw.wiphy->bands[band]->n_channels; for (i = 0; i < max_n; i++) { struct ieee80211_channel *tmp_ch = &local->hw.wiphy->bands[band]->channels[i]; if (tmp_ch->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_DISABLED) || !cfg80211_wdev_channel_allowed(&sdata->wdev, tmp_ch)) continue; local->int_scan_req->channels[n_ch] = tmp_ch; n_ch++; } } if (WARN_ON_ONCE(n_ch == 0)) return -EINVAL; local->int_scan_req->n_channels = n_ch; } else { for (i = 0; i < n_channels; i++) { if (channels[i]->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_DISABLED) || !cfg80211_wdev_channel_allowed(&sdata->wdev, channels[i])) continue; local->int_scan_req->channels[n_ch] = channels[i]; n_ch++; } if (n_ch == 0) return -EINVAL; local->int_scan_req->n_channels = n_ch; } local->int_scan_req->ssids = &local->scan_ssid; local->int_scan_req->n_ssids = 1; memcpy(local->int_scan_req->ssids[0].ssid, ssid, IEEE80211_MAX_SSID_LEN); local->int_scan_req->ssids[0].ssid_len = ssid_len; return __ieee80211_start_scan(sdata, sdata->local->int_scan_req); } void ieee80211_scan_cancel(struct ieee80211_local *local) { /* ensure a new scan cannot be queued */ lockdep_assert_wiphy(local->hw.wiphy); /* * We are canceling software scan, or deferred scan that was not * yet really started (see __ieee80211_start_scan ). * * Regarding hardware scan: * - we can not call __ieee80211_scan_completed() as when * SCAN_HW_SCANNING bit is set this function change * local->hw_scan_req to operate on 5G band, what race with * driver which can use local->hw_scan_req * * - we can not cancel scan_work since driver can schedule it * by ieee80211_scan_completed(..., true) to finish scan * * Hence we only call the cancel_hw_scan() callback, but the low-level * driver is still responsible for calling ieee80211_scan_completed() * after the scan was completed/aborted. */ if (!local->scan_req) return; /* * We have a scan running and the driver already reported completion, * but the worker hasn't run yet or is stuck on the mutex - mark it as * cancelled. */ if (test_bit(SCAN_HW_SCANNING, &local->scanning) && test_bit(SCAN_COMPLETED, &local->scanning)) { set_bit(SCAN_HW_CANCELLED, &local->scanning); return; } if (test_bit(SCAN_HW_SCANNING, &local->scanning)) { /* * Make sure that __ieee80211_scan_completed doesn't trigger a * scan on another band. */ set_bit(SCAN_HW_CANCELLED, &local->scanning); if (local->ops->cancel_hw_scan) drv_cancel_hw_scan(local, rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx))); return; } wiphy_delayed_work_cancel(local->hw.wiphy, &local->scan_work); /* and clean up */ memset(&local->scan_info, 0, sizeof(local->scan_info)); __ieee80211_scan_completed(&local->hw, true); } int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_scan_ies sched_scan_ies = {}; struct cfg80211_chan_def chandef; int ret, i, iebufsz, num_bands = 0; u32 rate_masks[NUM_NL80211_BANDS] = {}; u8 bands_used = 0; u8 *ie; u32 flags = 0; lockdep_assert_wiphy(local->hw.wiphy); iebufsz = local->scan_ies_len + req->ie_len; if (!local->ops->sched_scan_start) return -EOPNOTSUPP; for (i = 0; i < NUM_NL80211_BANDS; i++) { if (local->hw.wiphy->bands[i]) { bands_used |= BIT(i); rate_masks[i] = (u32) -1; num_bands++; } } if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; ie = kcalloc(iebufsz, num_bands, GFP_KERNEL); if (!ie) { ret = -ENOMEM; goto out; } ieee80211_prepare_scan_chandef(&chandef); ret = ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, req->ie_len, bands_used, rate_masks, &chandef, flags); if (ret < 0) goto error; ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { rcu_assign_pointer(local->sched_scan_sdata, sdata); rcu_assign_pointer(local->sched_scan_req, req); } error: kfree(ie); out: if (ret) { /* Clean in case of failure after HW restart or upon resume. */ RCU_INIT_POINTER(local->sched_scan_sdata, NULL); RCU_INIT_POINTER(local->sched_scan_req, NULL); } return ret; } int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (rcu_access_pointer(local->sched_scan_sdata)) return -EBUSY; return __ieee80211_request_sched_scan_start(sdata, req); } int ieee80211_request_sched_scan_stop(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sched_scan_sdata; int ret = -ENOENT; lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->sched_scan_stop) return -EOPNOTSUPP; /* We don't want to restart sched scan anymore. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); if (sched_scan_sdata) { ret = drv_sched_scan_stop(local, sched_scan_sdata); if (!ret) RCU_INIT_POINTER(local->sched_scan_sdata, NULL); } return ret; } void ieee80211_sched_scan_results(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_sched_scan_results(local); cfg80211_sched_scan_results(hw->wiphy, 0); } EXPORT_SYMBOL(ieee80211_sched_scan_results); void ieee80211_sched_scan_end(struct ieee80211_local *local) { lockdep_assert_wiphy(local->hw.wiphy); if (!rcu_access_pointer(local->sched_scan_sdata)) return; RCU_INIT_POINTER(local->sched_scan_sdata, NULL); /* If sched scan was aborted by the driver. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); } void ieee80211_sched_scan_stopped_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, sched_scan_stopped_work); ieee80211_sched_scan_end(local); } void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_sched_scan_stopped(local); /* * this shouldn't really happen, so for simplicity * simply ignore it, and let mac80211 reconfigure * the sched scan later on. */ if (local->in_reconfig) return; wiphy_work_queue(hw->wiphy, &local->sched_scan_stopped_work); } EXPORT_SYMBOL(ieee80211_sched_scan_stopped);
26 26 31 31 31 31 5 31 30 31 36 36 36 36 36 36 36 36 36 36 41 41 34 7 7 37 37 37 37 37 36 36 37 34 7 37 37 37 12 37 13 28 41 41 34 29 29 1 34 34 33 34 33 34 34 33 56 56 56 56 56 55 56 56 56 56 56 13 7 9 9 9 9 12 13 13 13 13 13 9 7 7 7 13 23 32 32 32 32 32 31 32 32 22 22 13 13 13 13 13 13 13 13 13 13 13 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH * Copyright (C) 2018-2024 Intel Corporation * * utilities for mac80211 */ #include <net/mac80211.h> #include <linux/netdevice.h> #include <linux/export.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/bitmap.h> #include <linux/crc32.h> #include <net/net_namespace.h> #include <net/cfg80211.h> #include <net/rtnetlink.h> #include <kunit/visibility.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "mesh.h" #include "wme.h" #include "led.h" #include "wep.h" /* privid for wiphys to determine whether they belong to us or not */ const void *const mac80211_wiphy_privid = &mac80211_wiphy_privid; struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy) { struct ieee80211_local *local; local = wiphy_priv(wiphy); return &local->hw; } EXPORT_SYMBOL(wiphy_to_ieee80211_hw); const struct ieee80211_conn_settings ieee80211_conn_settings_unlimited = { .mode = IEEE80211_CONN_MODE_EHT, .bw_limit = IEEE80211_CONN_BW_LIMIT_320, }; u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, enum nl80211_iftype type) { __le16 fc = hdr->frame_control; if (ieee80211_is_data(fc)) { if (len < 24) /* drop incorrect hdr len (data) */ return NULL; if (ieee80211_has_a4(fc)) return NULL; if (ieee80211_has_tods(fc)) return hdr->addr1; if (ieee80211_has_fromds(fc)) return hdr->addr2; return hdr->addr3; } if (ieee80211_is_s1g_beacon(fc)) { struct ieee80211_ext *ext = (void *) hdr; return ext->u.s1g_beacon.sa; } if (ieee80211_is_mgmt(fc)) { if (len < 24) /* drop incorrect hdr len (mgmt) */ return NULL; return hdr->addr3; } if (ieee80211_is_ctl(fc)) { if (ieee80211_is_pspoll(fc)) return hdr->addr1; if (ieee80211_is_back_req(fc)) { switch (type) { case NL80211_IFTYPE_STATION: return hdr->addr2; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: return hdr->addr1; default: break; /* fall through to the return */ } } } return NULL; } EXPORT_SYMBOL(ieee80211_get_bssid); void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx) { struct sk_buff *skb; struct ieee80211_hdr *hdr; skb_queue_walk(&tx->skbs, skb) { hdr = (struct ieee80211_hdr *) skb->data; hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); } } int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble) { int dur; /* calculate duration (in microseconds, rounded up to next higher * integer if it includes a fractional microsecond) to send frame of * len bytes (does not include FCS) at the given rate. Duration will * also include SIFS. * * rate is in 100 kbps, so divident is multiplied by 10 in the * DIV_ROUND_UP() operations. */ if (band == NL80211_BAND_5GHZ || erp) { /* * OFDM: * * N_DBPS = DATARATE x 4 * N_SYM = Ceiling((16+8xLENGTH+6) / N_DBPS) * (16 = SIGNAL time, 6 = tail bits) * TXTIME = T_PREAMBLE + T_SIGNAL + T_SYM x N_SYM + Signal Ext * * T_SYM = 4 usec * 802.11a - 18.5.2: aSIFSTime = 16 usec * 802.11g - 19.8.4: aSIFSTime = 10 usec + * signal ext = 6 usec */ dur = 16; /* SIFS + signal ext */ dur += 16; /* IEEE 802.11-2012 18.3.2.4: T_PREAMBLE = 16 usec */ dur += 4; /* IEEE 802.11-2012 18.3.2.4: T_SIGNAL = 4 usec */ /* rates should already consider the channel bandwidth, * don't apply divisor again. */ dur += 4 * DIV_ROUND_UP((16 + 8 * (len + 4) + 6) * 10, 4 * rate); /* T_SYM x N_SYM */ } else { /* * 802.11b or 802.11g with 802.11b compatibility: * 18.3.4: TXTIME = PreambleLength + PLCPHeaderTime + * Ceiling(((LENGTH+PBCC)x8)/DATARATE). PBCC=0. * * 802.11 (DS): 15.3.3, 802.11b: 18.3.4 * aSIFSTime = 10 usec * aPreambleLength = 144 usec or 72 usec with short preamble * aPLCPHeaderLength = 48 usec or 24 usec with short preamble */ dur = 10; /* aSIFSTime = 10 usec */ dur += short_preamble ? (72 + 24) : (144 + 48); dur += DIV_ROUND_UP(8 * (len + 4) * 10, rate); } return dur; } /* Exported duration function for driver use */ __le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum nl80211_band band, size_t frame_len, struct ieee80211_rate *rate) { struct ieee80211_sub_if_data *sdata; u16 dur; int erp; bool short_preamble = false; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->deflink.operating_11g_mode) erp = rate->flags & IEEE80211_RATE_ERP_G; } dur = ieee80211_frame_duration(band, frame_len, rate->bitrate, erp, short_preamble); return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_generic_frame_duration); __le16 ieee80211_rts_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate; struct ieee80211_sub_if_data *sdata; bool short_preamble; int erp, bitrate; u16 dur; struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[frame_txctl->band]; short_preamble = false; rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx]; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->deflink.operating_11g_mode) erp = rate->flags & IEEE80211_RATE_ERP_G; } bitrate = rate->bitrate; /* CTS duration */ dur = ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble); /* Data frame duration */ dur += ieee80211_frame_duration(sband->band, frame_len, bitrate, erp, short_preamble); /* ACK duration */ dur += ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble); return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_rts_duration); __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate; struct ieee80211_sub_if_data *sdata; bool short_preamble; int erp, bitrate; u16 dur; struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[frame_txctl->band]; short_preamble = false; rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx]; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->deflink.operating_11g_mode) erp = rate->flags & IEEE80211_RATE_ERP_G; } bitrate = rate->bitrate; /* Data frame duration */ dur = ieee80211_frame_duration(sband->band, frame_len, bitrate, erp, short_preamble); if (!(frame_txctl->flags & IEEE80211_TX_CTL_NO_ACK)) { /* ACK duration */ dur += ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble); } return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_ctstoself_duration); static void wake_tx_push_queue(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_txq *queue) { struct ieee80211_tx_control control = { .sta = queue->sta, }; struct sk_buff *skb; while (1) { skb = ieee80211_tx_dequeue(&local->hw, queue); if (!skb) break; drv_tx(local, &control, skb); } } /* wake_tx_queue handler for driver not implementing a custom one*/ void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif); struct ieee80211_txq *queue; spin_lock(&local->handle_wake_tx_queue_lock); /* Use ieee80211_next_txq() for airtime fairness accounting */ ieee80211_txq_schedule_start(hw, txq->ac); while ((queue = ieee80211_next_txq(hw, txq->ac))) { wake_tx_push_queue(local, sdata, queue); ieee80211_return_txq(hw, queue, false); } ieee80211_txq_schedule_end(hw, txq->ac); spin_unlock(&local->handle_wake_tx_queue_lock); } EXPORT_SYMBOL(ieee80211_handle_wake_tx_queue); static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) { struct ieee80211_local *local = sdata->local; struct ieee80211_vif *vif = &sdata->vif; struct fq *fq = &local->fq; struct ps_data *ps = NULL; struct txq_info *txqi; struct sta_info *sta; int i; local_bh_disable(); spin_lock(&fq->lock); if (!test_bit(SDATA_STATE_RUNNING, &sdata->state)) goto out; if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct ieee80211_txq *txq = sta->sta.txq[i]; if (!txq) continue; txqi = to_txq_info(txq); if (ac != txq->ac) continue; if (!test_and_clear_bit(IEEE80211_TXQ_DIRTY, &txqi->flags)) continue; spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); spin_lock(&fq->lock); } } if (!vif->txq) goto out; txqi = to_txq_info(vif->txq); if (!test_and_clear_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) || (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac) goto out; spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); local_bh_enable(); return; out: spin_unlock(&fq->lock); local_bh_enable(); } static void __releases(&local->queue_stop_reason_lock) __acquires(&local->queue_stop_reason_lock) _ieee80211_wake_txqs(struct ieee80211_local *local, unsigned long *flags) { struct ieee80211_sub_if_data *sdata; int n_acs = IEEE80211_NUM_ACS; int i; rcu_read_lock(); if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; for (i = 0; i < local->hw.queues; i++) { if (local->queue_stop_reasons[i]) continue; spin_unlock_irqrestore(&local->queue_stop_reason_lock, *flags); list_for_each_entry_rcu(sdata, &local->interfaces, list) { int ac; for (ac = 0; ac < n_acs; ac++) { int ac_queue = sdata->vif.hw_queue[ac]; if (ac_queue == i || sdata->vif.cab_queue == i) __ieee80211_wake_txqs(sdata, ac); } } spin_lock_irqsave(&local->queue_stop_reason_lock, *flags); } rcu_read_unlock(); } void ieee80211_wake_txqs(struct tasklet_struct *t) { struct ieee80211_local *local = from_tasklet(local, t, wake_txqs_tasklet); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); _ieee80211_wake_txqs(local, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted, unsigned long *flags) { struct ieee80211_local *local = hw_to_local(hw); if (WARN_ON(queue >= hw->queues)) return; if (!test_bit(reason, &local->queue_stop_reasons[queue])) return; if (!refcounted) { local->q_stop_reasons[queue][reason] = 0; } else { local->q_stop_reasons[queue][reason]--; if (WARN_ON(local->q_stop_reasons[queue][reason] < 0)) local->q_stop_reasons[queue][reason] = 0; } if (local->q_stop_reasons[queue][reason] == 0) __clear_bit(reason, &local->queue_stop_reasons[queue]); trace_wake_queue(local, queue, reason, local->q_stop_reasons[queue][reason]); if (local->queue_stop_reasons[queue] != 0) /* someone still has this queue stopped */ return; if (!skb_queue_empty(&local->pending[queue])) tasklet_schedule(&local->tx_pending_tasklet); /* * Calling _ieee80211_wake_txqs here can be a problem because it may * release queue_stop_reason_lock which has been taken by * __ieee80211_wake_queue's caller. It is certainly not very nice to * release someone's lock, but it is fine because all the callers of * __ieee80211_wake_queue call it right before releasing the lock. */ if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER) tasklet_schedule(&local->wake_txqs_tasklet); else _ieee80211_wake_txqs(local, flags); } void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_wake_queue(hw, queue, reason, refcounted, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue) { ieee80211_wake_queue_by_reason(hw, queue, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_wake_queue); static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); if (WARN_ON(queue >= hw->queues)) return; if (!refcounted) local->q_stop_reasons[queue][reason] = 1; else local->q_stop_reasons[queue][reason]++; trace_stop_queue(local, queue, reason, local->q_stop_reasons[queue][reason]); set_bit(reason, &local->queue_stop_reasons[queue]); } void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_stop_queue(hw, queue, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue) { ieee80211_stop_queue_by_reason(hw, queue, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_stop_queue); void ieee80211_add_pending_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_hw *hw = &local->hw; unsigned long flags; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int queue = info->hw_queue; if (WARN_ON(!info->control.vif)) { ieee80211_free_txskb(&local->hw, skb); return; } spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false); __skb_queue_tail(&local->pending[queue], skb); __ieee80211_wake_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_add_pending_skbs(struct ieee80211_local *local, struct sk_buff_head *skbs) { struct ieee80211_hw *hw = &local->hw; struct sk_buff *skb; unsigned long flags; int queue, i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); while ((skb = skb_dequeue(skbs))) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); if (WARN_ON(!info->control.vif)) { ieee80211_free_txskb(&local->hw, skb); continue; } queue = info->hw_queue; __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false); __skb_queue_tail(&local->pending[queue], skb); } for (i = 0; i < hw->queues; i++) __ieee80211_wake_queue(hw, i, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for_each_set_bit(i, &queues, hw->queues) __ieee80211_stop_queue(hw, i, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queues(struct ieee80211_hw *hw) { ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_stop_queues); int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int ret; if (WARN_ON(queue >= hw->queues)) return true; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); ret = test_bit(IEEE80211_QUEUE_STOP_REASON_DRIVER, &local->queue_stop_reasons[queue]); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return ret; } EXPORT_SYMBOL(ieee80211_queue_stopped); void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for_each_set_bit(i, &queues, hw->queues) __ieee80211_wake_queue(hw, i, reason, refcounted, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_wake_queues(struct ieee80211_hw *hw) { ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_wake_queues); unsigned int ieee80211_get_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { unsigned int queues; if (sdata && ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { int ac; queues = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) if (sdata->vif.hw_queue[ac] != IEEE80211_INVAL_HW_QUEUE) queues |= BIT(sdata->vif.hw_queue[ac]); if (sdata->vif.cab_queue != IEEE80211_INVAL_HW_QUEUE) queues |= BIT(sdata->vif.cab_queue); } else { /* all queues */ queues = BIT(local->hw.queues) - 1; } return queues; } void __ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int queues, bool drop) { if (!local->ops->flush) return; /* * If no queue was set, or if the HW doesn't support * IEEE80211_HW_QUEUE_CONTROL - flush all queues */ if (!queues || !ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) queues = ieee80211_get_vif_queues(local, sdata); ieee80211_stop_queues_by_reason(&local->hw, queues, IEEE80211_QUEUE_STOP_REASON_FLUSH, false); if (drop) { struct sta_info *sta; /* Purge the queues, so the frames on them won't be * sent during __ieee80211_wake_queue() */ list_for_each_entry(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; ieee80211_purge_sta_txqs(sta); } } drv_flush(local, sdata, queues, drop); ieee80211_wake_queues_by_reason(&local->hw, queues, IEEE80211_QUEUE_STOP_REASON_FLUSH, false); } void ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool drop) { __ieee80211_flush_queues(local, sdata, 0, drop); } static void __iterate_interfaces(struct ieee80211_local *local, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_sub_if_data *sdata; bool active_only = iter_flags & IEEE80211_IFACE_ITER_ACTIVE; list_for_each_entry_rcu(sdata, &local->interfaces, list, lockdep_is_held(&local->iflist_mtx) || lockdep_is_held(&local->hw.wiphy->mtx)) { switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; break; case NL80211_IFTYPE_AP_VLAN: continue; default: break; } if (!(iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL) && active_only && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) continue; if ((iter_flags & IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER) && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) continue; if (ieee80211_sdata_running(sdata) || !active_only) iterator(data, sdata->vif.addr, &sdata->vif); } sdata = rcu_dereference_check(local->monitor_sdata, lockdep_is_held(&local->iflist_mtx) || lockdep_is_held(&local->hw.wiphy->mtx)); if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only || sdata->flags & IEEE80211_SDATA_IN_DRIVER)) iterator(data, sdata->vif.addr, &sdata->vif); } void ieee80211_iterate_interfaces( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); mutex_lock(&local->iflist_mtx); __iterate_interfaces(local, iter_flags, iterator, data); mutex_unlock(&local->iflist_mtx); } EXPORT_SYMBOL_GPL(ieee80211_iterate_interfaces); void ieee80211_iterate_active_interfaces_atomic( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); rcu_read_lock(); __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic); void ieee80211_iterate_active_interfaces_mtx( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); lockdep_assert_wiphy(hw->wiphy); __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_mtx); static void __iterate_stations(struct ieee80211_local *local, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data) { struct sta_info *sta; list_for_each_entry_rcu(sta, &local->sta_list, list, lockdep_is_held(&local->hw.wiphy->mtx)) { if (!sta->uploaded) continue; iterator(data, &sta->sta); } } void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data) { struct ieee80211_local *local = hw_to_local(hw); rcu_read_lock(); __iterate_stations(local, iterator, data); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_atomic); void ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data) { struct ieee80211_local *local = hw_to_local(hw); lockdep_assert_wiphy(local->hw.wiphy); __iterate_stations(local, iterator, data); } EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_mtx); struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (!ieee80211_sdata_running(sdata) || !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) return NULL; return &sdata->vif; } EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif); struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif) { if (!vif) return NULL; return &vif_to_sdata(vif)->wdev; } EXPORT_SYMBOL_GPL(ieee80211_vif_to_wdev); /* * Nothing should have been stuffed into the workqueue during * the suspend->resume cycle. Since we can't check each caller * of this function if we are already quiescing / suspended, * check here and don't WARN since this can actually happen when * the rx path (for example) is racing against __ieee80211_suspend * and suspending / quiescing was set after the rx path checked * them. */ static bool ieee80211_can_queue_work(struct ieee80211_local *local) { if (local->quiescing || (local->suspended && !local->resuming)) { pr_warn("queueing ieee80211 work while going to suspend\n"); return false; } return true; } void ieee80211_queue_work(struct ieee80211_hw *hw, struct work_struct *work) { struct ieee80211_local *local = hw_to_local(hw); if (!ieee80211_can_queue_work(local)) return; queue_work(local->workqueue, work); } EXPORT_SYMBOL(ieee80211_queue_work); void ieee80211_queue_delayed_work(struct ieee80211_hw *hw, struct delayed_work *dwork, unsigned long delay) { struct ieee80211_local *local = hw_to_local(hw); if (!ieee80211_can_queue_work(local)) return; queue_delayed_work(local->workqueue, dwork, delay); } EXPORT_SYMBOL(ieee80211_queue_delayed_work); void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_queue_params *qparam, int ac) { struct ieee80211_chanctx_conf *chanctx_conf; const struct ieee80211_reg_rule *rrule; const struct ieee80211_wmm_ac *wmm_ac; u16 center_freq = 0; if (sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_STATION) return; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (chanctx_conf) center_freq = chanctx_conf->def.chan->center_freq; if (!center_freq) { rcu_read_unlock(); return; } rrule = freq_reg_info(sdata->wdev.wiphy, MHZ_TO_KHZ(center_freq)); if (IS_ERR_OR_NULL(rrule) || !rrule->has_wmm) { rcu_read_unlock(); return; } if (sdata->vif.type == NL80211_IFTYPE_AP) wmm_ac = &rrule->wmm_rule.ap[ac]; else wmm_ac = &rrule->wmm_rule.client[ac]; qparam->cw_min = max_t(u16, qparam->cw_min, wmm_ac->cw_min); qparam->cw_max = max_t(u16, qparam->cw_max, wmm_ac->cw_max); qparam->aifs = max_t(u8, qparam->aifs, wmm_ac->aifsn); qparam->txop = min_t(u16, qparam->txop, wmm_ac->cot / 32); rcu_read_unlock(); } void ieee80211_set_wmm_default(struct ieee80211_link_data *link, bool bss_notify, bool enable_qos) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_tx_queue_params qparam; struct ieee80211_chanctx_conf *chanctx_conf; int ac; bool use_11b; bool is_ocb; /* Use another EDCA parameters if dot11OCBActivated=true */ int aCWmin, aCWmax; if (!local->ops->conf_tx) return; if (local->hw.queues < IEEE80211_NUM_ACS) return; memset(&qparam, 0, sizeof(qparam)); rcu_read_lock(); chanctx_conf = rcu_dereference(link->conf->chanctx_conf); use_11b = (chanctx_conf && chanctx_conf->def.chan->band == NL80211_BAND_2GHZ) && !link->operating_11g_mode; rcu_read_unlock(); is_ocb = (sdata->vif.type == NL80211_IFTYPE_OCB); /* Set defaults according to 802.11-2007 Table 7-37 */ aCWmax = 1023; if (use_11b) aCWmin = 31; else aCWmin = 15; /* Configure old 802.11b/g medium access rules. */ qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; qparam.aifs = 2; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { /* Update if QoS is enabled. */ if (enable_qos) { switch (ac) { case IEEE80211_AC_BK: qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; if (is_ocb) qparam.aifs = 9; else qparam.aifs = 7; break; /* never happens but let's not leave undefined */ default: case IEEE80211_AC_BE: qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; if (is_ocb) qparam.aifs = 6; else qparam.aifs = 3; break; case IEEE80211_AC_VI: qparam.cw_max = aCWmin; qparam.cw_min = (aCWmin + 1) / 2 - 1; if (is_ocb) qparam.txop = 0; else if (use_11b) qparam.txop = 6016/32; else qparam.txop = 3008/32; if (is_ocb) qparam.aifs = 3; else qparam.aifs = 2; break; case IEEE80211_AC_VO: qparam.cw_max = (aCWmin + 1) / 2 - 1; qparam.cw_min = (aCWmin + 1) / 4 - 1; if (is_ocb) qparam.txop = 0; else if (use_11b) qparam.txop = 3264/32; else qparam.txop = 1504/32; qparam.aifs = 2; break; } } ieee80211_regulatory_limit_wmm_params(sdata, &qparam, ac); qparam.uapsd = false; link->tx_conf[ac] = qparam; drv_conf_tx(local, link, ac, &qparam); } if (sdata->vif.type != NL80211_IFTYPE_MONITOR && sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_NAN) { link->conf->qos = enable_qos; if (bss_notify) ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS); } } void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *da, const u8 *bssid, const u8 *key, u8 key_len, u8 key_idx, u32 tx_flags) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; bool multi_link = ieee80211_vif_is_mld(&sdata->vif); struct { u8 id; u8 len; u8 ext_id; struct ieee80211_multi_link_elem ml; struct ieee80211_mle_basic_common_info basic; } __packed mle = { .id = WLAN_EID_EXTENSION, .len = sizeof(mle) - 2, .ext_id = WLAN_EID_EXT_EHT_MULTI_LINK, .ml.control = cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_BASIC), .basic.len = sizeof(mle.basic), }; int err; memcpy(mle.basic.mld_mac_addr, sdata->vif.addr, ETH_ALEN); /* 24 + 6 = header + auth_algo + auth_transaction + status_code */ skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN + 24 + 6 + extra_len + IEEE80211_WEP_ICV_LEN + multi_link * sizeof(mle)); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN); mgmt = skb_put_zero(skb, 24 + 6); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_AUTH); memcpy(mgmt->da, da, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, bssid, ETH_ALEN); mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg); mgmt->u.auth.auth_transaction = cpu_to_le16(transaction); mgmt->u.auth.status_code = cpu_to_le16(status); if (extra) skb_put_data(skb, extra, extra_len); if (multi_link) skb_put_data(skb, &mle, sizeof(mle)); if (auth_alg == WLAN_AUTH_SHARED_KEY && transaction == 3) { mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); err = ieee80211_wep_encrypt(local, skb, key, key_len, key_idx); if (WARN_ON(err)) { kfree_skb(skb); return; } } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | tx_flags; ieee80211_tx_skb(sdata, skb); } void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *da, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt = (void *)frame_buf; /* build frame */ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype); mgmt->duration = 0; /* initialize only */ mgmt->seq_ctrl = 0; /* initialize only */ memcpy(mgmt->da, da, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, bssid, ETH_ALEN); /* u.deauth.reason_code == u.disassoc.reason_code */ mgmt->u.deauth.reason_code = cpu_to_le16(reason); if (send_frame) { skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_DEAUTH_FRAME_LEN); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); /* copy in frame */ skb_put_data(skb, mgmt, IEEE80211_DEAUTH_FRAME_LEN); if (sdata->vif.type != NL80211_IFTYPE_STATION || !(sdata->u.mgd.flags & IEEE80211_STA_MFP_ENABLED)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; ieee80211_tx_skb(sdata, skb); } } static int ieee80211_put_s1g_cap(struct sk_buff *skb, struct ieee80211_sta_s1g_cap *s1g_cap) { if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_s1g_cap)) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_S1G_CAPABILITIES); skb_put_u8(skb, sizeof(struct ieee80211_s1g_cap)); skb_put_data(skb, &s1g_cap->cap, sizeof(s1g_cap->cap)); skb_put_data(skb, &s1g_cap->nss_mcs, sizeof(s1g_cap->nss_mcs)); return 0; } static int ieee80211_put_preq_ies_band(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const u8 *ie, size_t ie_len, size_t *offset, enum nl80211_band band, u32 rate_mask, struct cfg80211_chan_def *chandef, u32 flags) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; int i, err; size_t noffset; u32 rate_flags; bool have_80mhz = false; *offset = 0; sband = local->hw.wiphy->bands[band]; if (WARN_ON_ONCE(!sband)) return 0; rate_flags = ieee80211_chandef_rate_flags(chandef); /* For direct scan add S1G IE and consider its override bits */ if (band == NL80211_BAND_S1GHZ) return ieee80211_put_s1g_cap(skb, &sband->s1g_cap); err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags, ~rate_mask, WLAN_EID_SUPP_RATES); if (err) return err; /* insert "request information" if in custom IEs */ if (ie && ie_len) { static const u8 before_extrates[] = { WLAN_EID_SSID, WLAN_EID_SUPP_RATES, WLAN_EID_REQUEST, }; noffset = ieee80211_ie_split(ie, ie_len, before_extrates, ARRAY_SIZE(before_extrates), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags, ~rate_mask, WLAN_EID_EXT_SUPP_RATES); if (err) return err; if (chandef->chan && sband->band == NL80211_BAND_2GHZ) { if (skb_tailroom(skb) < 3) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_DS_PARAMS); skb_put_u8(skb, 1); skb_put_u8(skb, ieee80211_frequency_to_channel(chandef->chan->center_freq)); } if (flags & IEEE80211_PROBE_FLAG_MIN_CONTENT) return 0; /* insert custom IEs that go before HT */ if (ie && ie_len) { static const u8 before_ht[] = { /* * no need to list the ones split off already * (or generated here) */ WLAN_EID_DS_PARAMS, WLAN_EID_SUPPORTED_REGULATORY_CLASSES, }; noffset = ieee80211_ie_split(ie, ie_len, before_ht, ARRAY_SIZE(before_ht), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } if (sband->ht_cap.ht_supported) { u8 *pos; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_cap)) return -ENOBUFS; pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_cap)); ieee80211_ie_build_ht_cap(pos, &sband->ht_cap, sband->ht_cap.cap); } /* insert custom IEs that go before VHT */ if (ie && ie_len) { static const u8 before_vht[] = { /* * no need to list the ones split off already * (or generated here) */ WLAN_EID_BSS_COEX_2040, WLAN_EID_EXT_CAPABILITY, WLAN_EID_SSID_LIST, WLAN_EID_CHANNEL_USAGE, WLAN_EID_INTERWORKING, WLAN_EID_MESH_ID, /* 60 GHz (Multi-band, DMG, MMS) can't happen */ }; noffset = ieee80211_ie_split(ie, ie_len, before_vht, ARRAY_SIZE(before_vht), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } /* Check if any channel in this sband supports at least 80 MHz */ for (i = 0; i < sband->n_channels; i++) { if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_NO_80MHZ)) continue; have_80mhz = true; break; } if (sband->vht_cap.vht_supported && have_80mhz) { u8 *pos; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_cap)) return -ENOBUFS; pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_cap)); ieee80211_ie_build_vht_cap(pos, &sband->vht_cap, sband->vht_cap.cap); } /* insert custom IEs that go before HE */ if (ie && ie_len) { static const u8 before_he[] = { /* * no need to list the ones split off before VHT * or generated here */ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS, WLAN_EID_AP_CSN, /* TODO: add 11ah/11aj/11ak elements */ }; noffset = ieee80211_ie_split(ie, ie_len, before_he, ARRAY_SIZE(before_he), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), IEEE80211_CHAN_NO_HE)) { err = ieee80211_put_he_cap(skb, sdata, sband, NULL); if (err) return err; } if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), IEEE80211_CHAN_NO_HE | IEEE80211_CHAN_NO_EHT)) { err = ieee80211_put_eht_cap(skb, sdata, sband, NULL); if (err) return err; } err = ieee80211_put_he_6ghz_cap(skb, sdata, IEEE80211_SMPS_OFF); if (err) return err; /* * If adding more here, adjust code in main.c * that calculates local->scan_ies_len. */ return 0; } static int ieee80211_put_preq_ies(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, struct cfg80211_chan_def *chandef, u32 flags) { size_t custom_ie_offset = 0; int i, err; memset(ie_desc, 0, sizeof(*ie_desc)); for (i = 0; i < NUM_NL80211_BANDS; i++) { if (bands_used & BIT(i)) { ie_desc->ies[i] = skb_tail_pointer(skb); err = ieee80211_put_preq_ies_band(skb, sdata, ie, ie_len, &custom_ie_offset, i, rate_masks[i], chandef, flags); if (err) return err; ie_desc->len[i] = skb_tail_pointer(skb) - ie_desc->ies[i]; } } /* add any remaining custom IEs */ if (ie && ie_len) { if (WARN_ONCE(skb_tailroom(skb) < ie_len - custom_ie_offset, "not enough space for preq custom IEs\n")) return -ENOBUFS; ie_desc->common_ies = skb_tail_pointer(skb); skb_put_data(skb, ie + custom_ie_offset, ie_len - custom_ie_offset); ie_desc->common_ie_len = skb_tail_pointer(skb) - ie_desc->common_ies; } return 0; }; int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, struct cfg80211_chan_def *chandef, u32 flags) { struct sk_buff *skb = alloc_skb(buffer_len, GFP_KERNEL); uintptr_t offs; int ret, i; u8 *start; if (!skb) return -ENOMEM; start = skb_tail_pointer(skb); memset(start, 0, skb_tailroom(skb)); ret = ieee80211_put_preq_ies(skb, sdata, ie_desc, ie, ie_len, bands_used, rate_masks, chandef, flags); if (ret < 0) { goto out; } if (skb->len > buffer_len) { ret = -ENOBUFS; goto out; } memcpy(buffer, start, skb->len); /* adjust ie_desc for copy */ for (i = 0; i < NUM_NL80211_BANDS; i++) { offs = ie_desc->ies[i] - start; ie_desc->ies[i] = buffer + offs; } offs = ie_desc->common_ies - start; ie_desc->common_ies = buffer + offs; ret = skb->len; out: consume_skb(skb); return ret; } struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, u32 ratemask, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 flags) { struct ieee80211_local *local = sdata->local; struct cfg80211_chan_def chandef; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u32 rate_masks[NUM_NL80211_BANDS] = {}; struct ieee80211_scan_ies dummy_ie_desc; /* * Do not send DS Channel parameter for directed probe requests * in order to maximize the chance that we get a response. Some * badly-behaved APs don't respond when this parameter is included. */ chandef.width = sdata->vif.bss_conf.chanreq.oper.width; if (flags & IEEE80211_PROBE_FLAG_DIRECTED) chandef.chan = NULL; else chandef.chan = chan; skb = ieee80211_probereq_get(&local->hw, src, ssid, ssid_len, local->scan_ies_len + ie_len); if (!skb) return NULL; rate_masks[chan->band] = ratemask; ieee80211_put_preq_ies(skb, sdata, &dummy_ie_desc, ie, ie_len, BIT(chan->band), rate_masks, &chandef, flags); if (dst) { mgmt = (struct ieee80211_mgmt *) skb->data; memcpy(mgmt->da, dst, ETH_ALEN); memcpy(mgmt->bssid, dst, ETH_ALEN); } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; return skb; } u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates) { struct ieee80211_supported_band *sband; size_t num_rates; u32 supp_rates, rate_flags; int i, j; sband = sdata->local->hw.wiphy->bands[band]; if (WARN_ON(!sband)) return 1; rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); num_rates = sband->n_bitrates; supp_rates = 0; for (i = 0; i < elems->supp_rates_len + elems->ext_supp_rates_len; i++) { u8 rate = 0; int own_rate; bool is_basic; if (i < elems->supp_rates_len) rate = elems->supp_rates[i]; else if (elems->ext_supp_rates) rate = elems->ext_supp_rates [i - elems->supp_rates_len]; own_rate = 5 * (rate & 0x7f); is_basic = !!(rate & 0x80); if (is_basic && (rate & 0x7f) == BSS_MEMBERSHIP_SELECTOR_HT_PHY) continue; for (j = 0; j < num_rates; j++) { int brate; if ((rate_flags & sband->bitrates[j].flags) != rate_flags) continue; brate = sband->bitrates[j].bitrate; if (brate == own_rate) { supp_rates |= BIT(j); if (basic_rates && is_basic) *basic_rates |= BIT(j); } } } return supp_rates; } void ieee80211_stop_device(struct ieee80211_local *local, bool suspend) { local_bh_disable(); ieee80211_handle_queued_frames(local); local_bh_enable(); ieee80211_led_radio(local, false); ieee80211_mod_tpt_led_trig(local, 0, IEEE80211_TPT_LEDTRIG_FL_RADIO); wiphy_work_cancel(local->hw.wiphy, &local->reconfig_filter); flush_workqueue(local->workqueue); wiphy_work_flush(local->hw.wiphy, NULL); drv_stop(local, suspend); } static void ieee80211_flush_completed_scan(struct ieee80211_local *local, bool aborted) { /* It's possible that we don't handle the scan completion in * time during suspend, so if it's still marked as completed * here, queue the work and flush it to clean things up. * Instead of calling the worker function directly here, we * really queue it to avoid potential races with other flows * scheduling the same work. */ if (test_bit(SCAN_COMPLETED, &local->scanning)) { /* If coming from reconfiguration failure, abort the scan so * we don't attempt to continue a partial HW scan - which is * possible otherwise if (e.g.) the 2.4 GHz portion was the * completed scan, and a 5 GHz portion is still pending. */ if (aborted) set_bit(SCAN_ABORTED, &local->scanning); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); wiphy_delayed_work_flush(local->hw.wiphy, &local->scan_work); } } static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; struct ieee80211_chanctx *ctx; lockdep_assert_wiphy(local->hw.wiphy); /* * We get here if during resume the device can't be restarted properly. * We might also get here if this happens during HW reset, which is a * slightly different situation and we need to drop all connections in * the latter case. * * Ask cfg80211 to turn off all interfaces, this will result in more * warnings but at least we'll then get into a clean stopped state. */ local->resuming = false; local->suspended = false; local->in_reconfig = false; local->reconfig_failure = true; ieee80211_flush_completed_scan(local, true); /* scheduled scan clearly can't be running any more, but tell * cfg80211 and clear local state */ ieee80211_sched_scan_end(local); list_for_each_entry(sdata, &local->interfaces, list) sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; /* Mark channel contexts as not being in the driver any more to avoid * removing them from the driver during the shutdown process... */ list_for_each_entry(ctx, &local->chanctx_list, list) ctx->driver_present = false; } static void ieee80211_assign_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link) { struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; lockdep_assert_wiphy(local->hw.wiphy); conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); if (conf) { ctx = container_of(conf, struct ieee80211_chanctx, conf); drv_assign_vif_chanctx(local, sdata, link->conf, ctx); } } static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; lockdep_assert_wiphy(local->hw.wiphy); /* add STAs back */ list_for_each_entry(sta, &local->sta_list, list) { enum ieee80211_sta_state state; if (!sta->uploaded || sta->sdata != sdata) continue; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) WARN_ON(drv_sta_state(local, sta->sdata, sta, state, state + 1)); } } static int ieee80211_reconfig_nan(struct ieee80211_sub_if_data *sdata) { struct cfg80211_nan_func *func, **funcs; int res, id, i = 0; res = drv_start_nan(sdata->local, sdata, &sdata->u.nan.conf); if (WARN_ON(res)) return res; funcs = kcalloc(sdata->local->hw.max_nan_de_entries + 1, sizeof(*funcs), GFP_KERNEL); if (!funcs) return -ENOMEM; /* Add all the functions: * This is a little bit ugly. We need to call a potentially sleeping * callback for each NAN function, so we can't hold the spinlock. */ spin_lock_bh(&sdata->u.nan.func_lock); idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) funcs[i++] = func; spin_unlock_bh(&sdata->u.nan.func_lock); for (i = 0; funcs[i]; i++) { res = drv_add_nan_func(sdata->local, sdata, funcs[i]); if (WARN_ON(res)) ieee80211_nan_func_terminated(&sdata->vif, funcs[i]->instance_id, NL80211_NAN_FUNC_TERM_REASON_ERROR, GFP_KERNEL); } kfree(funcs); return 0; } static void ieee80211_reconfig_ap_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 changed) { int link_id; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; if (!(sdata->vif.active_links & BIT(link_id))) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; if (rcu_access_pointer(link->u.ap.beacon)) drv_start_ap(local, sdata, link->conf); if (!link->conf->enable_beacon) continue; changed |= BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED; ieee80211_link_info_change_notify(sdata, link, changed); } } int ieee80211_reconfig(struct ieee80211_local *local) { struct ieee80211_hw *hw = &local->hw; struct ieee80211_sub_if_data *sdata; struct ieee80211_chanctx *ctx; struct sta_info *sta; int res, i; bool reconfig_due_to_wowlan = false; struct ieee80211_sub_if_data *sched_scan_sdata; struct cfg80211_sched_scan_request *sched_scan_req; bool sched_scan_stopped = false; bool suspended = local->suspended; bool in_reconfig = false; lockdep_assert_wiphy(local->hw.wiphy); /* nothing to do if HW shouldn't run */ if (!local->open_count) goto wake_up; #ifdef CONFIG_PM if (suspended) local->resuming = true; if (local->wowlan) { /* * In the wowlan case, both mac80211 and the device * are functional when the resume op is called, so * clear local->suspended so the device could operate * normally (e.g. pass rx frames). */ local->suspended = false; res = drv_resume(local); local->wowlan = false; if (res < 0) { local->resuming = false; return res; } if (res == 0) goto wake_up; WARN_ON(res > 1); /* * res is 1, which means the driver requested * to go through a regular reset on wakeup. * restore local->suspended in this case. */ reconfig_due_to_wowlan = true; local->suspended = true; } #endif /* * In case of hw_restart during suspend (without wowlan), * cancel restart work, as we are reconfiguring the device * anyway. * Note that restart_work is scheduled on a frozen workqueue, * so we can't deadlock in this case. */ if (suspended && local->in_reconfig && !reconfig_due_to_wowlan) cancel_work_sync(&local->restart_work); local->started = false; /* * Upon resume hardware can sometimes be goofy due to * various platform / driver / bus issues, so restarting * the device may at times not work immediately. Propagate * the error. */ res = drv_start(local); if (res) { if (suspended) WARN(1, "Hardware became unavailable upon resume. This could be a software issue prior to suspend or a hardware issue.\n"); else WARN(1, "Hardware became unavailable during restart.\n"); ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND, false); ieee80211_handle_reconfig_failure(local); return res; } /* setup fragmentation threshold */ drv_set_frag_threshold(local, hw->wiphy->frag_threshold); /* setup RTS threshold */ drv_set_rts_threshold(local, hw->wiphy->rts_threshold); /* reset coverage class */ drv_set_coverage_class(local, hw->wiphy->coverage_class); ieee80211_led_radio(local, true); ieee80211_mod_tpt_led_trig(local, IEEE80211_TPT_LEDTRIG_FL_RADIO, 0); /* add interfaces */ sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { /* in HW restart it exists already */ WARN_ON(local->resuming); res = drv_add_interface(local, sdata); if (WARN_ON(res)) { RCU_INIT_POINTER(local->monitor_sdata, NULL); synchronize_net(); kfree(sdata); } } list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && ieee80211_sdata_running(sdata)) { res = drv_add_interface(local, sdata); if (WARN_ON(res)) break; } } /* If adding any of the interfaces failed above, roll back and * report failure. */ if (res) { list_for_each_entry_continue_reverse(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && ieee80211_sdata_running(sdata)) drv_remove_interface(local, sdata); } ieee80211_handle_reconfig_failure(local); return res; } /* add channel contexts */ list_for_each_entry(ctx, &local->chanctx_list, list) if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) WARN_ON(drv_add_chanctx(local, ctx)); sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata && ieee80211_sdata_running(sdata)) ieee80211_assign_chanctx(local, sdata, &sdata->deflink); /* reconfigure hardware */ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL | IEEE80211_CONF_CHANGE_MONITOR | IEEE80211_CONF_CHANGE_PS | IEEE80211_CONF_CHANGE_RETRY_LIMITS | IEEE80211_CONF_CHANGE_IDLE); ieee80211_configure_filter(local); /* Finally also reconfigure all the BSS information */ list_for_each_entry(sdata, &local->interfaces, list) { /* common change flags for all interface types - link only */ u64 changed = BSS_CHANGED_ERP_CTS_PROT | BSS_CHANGED_ERP_PREAMBLE | BSS_CHANGED_ERP_SLOT | BSS_CHANGED_HT | BSS_CHANGED_BASIC_RATES | BSS_CHANGED_BEACON_INT | BSS_CHANGED_BSSID | BSS_CHANGED_CQM | BSS_CHANGED_QOS | BSS_CHANGED_TXPOWER | BSS_CHANGED_MCAST_RATE; struct ieee80211_link_data *link = NULL; unsigned int link_id; u32 active_links = 0; if (!ieee80211_sdata_running(sdata)) continue; if (ieee80211_vif_is_mld(&sdata->vif)) { struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS] = { [0] = &sdata->vif.bss_conf, }; if (sdata->vif.type == NL80211_IFTYPE_STATION) { /* start with a single active link */ active_links = sdata->vif.active_links; link_id = ffs(active_links) - 1; sdata->vif.active_links = BIT(link_id); } drv_change_vif_links(local, sdata, 0, sdata->vif.active_links, old); } sdata->restart_active_links = active_links; for (link_id = 0; link_id < ARRAY_SIZE(sdata->vif.link_conf); link_id++) { if (!ieee80211_vif_link_active(&sdata->vif, link_id)) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; ieee80211_assign_chanctx(local, sdata, link); } switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: break; case NL80211_IFTYPE_ADHOC: if (sdata->vif.cfg.ibss_joined) WARN_ON(drv_join_ibss(local, sdata)); fallthrough; default: ieee80211_reconfig_stations(sdata); fallthrough; case NL80211_IFTYPE_AP: /* AP stations are handled later */ for (i = 0; i < IEEE80211_NUM_ACS; i++) drv_conf_tx(local, &sdata->deflink, i, &sdata->deflink.tx_conf[i]); break; } if (sdata->vif.bss_conf.mu_mimo_owner) changed |= BSS_CHANGED_MU_GROUPS; if (!ieee80211_vif_is_mld(&sdata->vif)) changed |= BSS_CHANGED_IDLE; switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: if (!ieee80211_vif_is_mld(&sdata->vif)) { changed |= BSS_CHANGED_ASSOC | BSS_CHANGED_ARP_FILTER | BSS_CHANGED_PS; /* Re-send beacon info report to the driver */ if (sdata->deflink.u.mgd.have_beacon) changed |= BSS_CHANGED_BEACON_INFO; if (sdata->vif.bss_conf.max_idle_period || sdata->vif.bss_conf.protected_keep_alive) changed |= BSS_CHANGED_KEEP_ALIVE; ieee80211_bss_info_change_notify(sdata, changed); } else if (!WARN_ON(!link)) { ieee80211_link_info_change_notify(sdata, link, changed); changed = BSS_CHANGED_ASSOC | BSS_CHANGED_IDLE | BSS_CHANGED_PS | BSS_CHANGED_ARP_FILTER; ieee80211_vif_cfg_change_notify(sdata, changed); } break; case NL80211_IFTYPE_OCB: changed |= BSS_CHANGED_OCB; ieee80211_bss_info_change_notify(sdata, changed); break; case NL80211_IFTYPE_ADHOC: changed |= BSS_CHANGED_IBSS; fallthrough; case NL80211_IFTYPE_AP: changed |= BSS_CHANGED_P2P_PS; if (ieee80211_vif_is_mld(&sdata->vif)) ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_SSID); else changed |= BSS_CHANGED_SSID; if (sdata->vif.bss_conf.ftm_responder == 1 && wiphy_ext_feature_isset(sdata->local->hw.wiphy, NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER)) changed |= BSS_CHANGED_FTM_RESPONDER; if (sdata->vif.type == NL80211_IFTYPE_AP) { changed |= BSS_CHANGED_AP_PROBE_RESP; if (ieee80211_vif_is_mld(&sdata->vif)) { ieee80211_reconfig_ap_links(local, sdata, changed); break; } if (rcu_access_pointer(sdata->deflink.u.ap.beacon)) drv_start_ap(local, sdata, sdata->deflink.conf); } fallthrough; case NL80211_IFTYPE_MESH_POINT: if (sdata->vif.bss_conf.enable_beacon) { changed |= BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED; ieee80211_bss_info_change_notify(sdata, changed); } break; case NL80211_IFTYPE_NAN: res = ieee80211_reconfig_nan(sdata); if (res < 0) { ieee80211_handle_reconfig_failure(local); return res; } break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: /* nothing to do */ break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_WDS: WARN_ON(1); break; } } ieee80211_recalc_ps(local); /* * The sta might be in psm against the ap (e.g. because * this was the state before a hw restart), so we * explicitly send a null packet in order to make sure * it'll sync against the ap (and get out of psm). */ if (!(local->hw.conf.flags & IEEE80211_CONF_PS)) { list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type != NL80211_IFTYPE_STATION) continue; if (!sdata->u.mgd.associated) continue; ieee80211_send_nullfunc(local, sdata, false); } } /* APs are now beaconing, add back stations */ list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_AP: ieee80211_reconfig_stations(sdata); break; default: break; } } /* add back keys */ list_for_each_entry(sdata, &local->interfaces, list) ieee80211_reenable_keys(sdata); /* re-enable multi-link for client interfaces */ list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->restart_active_links) ieee80211_set_active_links(&sdata->vif, sdata->restart_active_links); /* * If a link switch was scheduled before the restart, and ran * before reconfig, it will do nothing, so re-schedule. */ if (sdata->desired_active_links) wiphy_work_queue(sdata->local->hw.wiphy, &sdata->activate_links_work); } /* Reconfigure sched scan if it was interrupted by FW restart */ sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); sched_scan_req = rcu_dereference_protected(local->sched_scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); if (sched_scan_sdata && sched_scan_req) /* * Sched scan stopped, but we don't want to report it. Instead, * we're trying to reschedule. However, if more than one scan * plan was set, we cannot reschedule since we don't know which * scan plan was currently running (and some scan plans may have * already finished). */ if (sched_scan_req->n_scan_plans > 1 || __ieee80211_request_sched_scan_start(sched_scan_sdata, sched_scan_req)) { RCU_INIT_POINTER(local->sched_scan_sdata, NULL); RCU_INIT_POINTER(local->sched_scan_req, NULL); sched_scan_stopped = true; } if (sched_scan_stopped) cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); wake_up: if (local->monitors == local->open_count && local->monitors > 0) ieee80211_add_virtual_monitor(local); /* * Clear the WLAN_STA_BLOCK_BA flag so new aggregation * sessions can be established after a resume. * * Also tear down aggregation sessions since reconfiguring * them in a hardware restart scenario is not easily done * right now, and the hardware will have lost information * about the sessions, but we and the AP still think they * are active. This is really a workaround though. */ if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { list_for_each_entry(sta, &local->sta_list, list) { if (!local->resuming) ieee80211_sta_tear_down_BA_sessions( sta, AGG_STOP_LOCAL_REQUEST); clear_sta_flag(sta, WLAN_STA_BLOCK_BA); } } /* * If this is for hw restart things are still running. * We may want to change that later, however. */ if (local->open_count && (!suspended || reconfig_due_to_wowlan)) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_RESTART); if (local->in_reconfig) { in_reconfig = local->in_reconfig; local->in_reconfig = false; barrier(); ieee80211_reconfig_roc(local); /* Requeue all works */ list_for_each_entry(sdata, &local->interfaces, list) wiphy_work_queue(local->hw.wiphy, &sdata->work); } ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND, false); if (in_reconfig) { list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_restart(sdata); } } if (!suspended) return 0; #ifdef CONFIG_PM /* first set suspended false, then resuming */ local->suspended = false; mb(); local->resuming = false; ieee80211_flush_completed_scan(local, false); if (local->open_count && !reconfig_due_to_wowlan) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_SUSPEND); list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_restart(sdata); } mod_timer(&local->sta_cleanup, jiffies + 1); #else WARN_ON(1); #endif return 0; } static void ieee80211_reconfig_disconnect(struct ieee80211_vif *vif, u8 flag) { struct ieee80211_sub_if_data *sdata; struct ieee80211_local *local; struct ieee80211_key *key; if (WARN_ON(!vif)) return; sdata = vif_to_sdata(vif); local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(flag & IEEE80211_SDATA_DISCONNECT_RESUME && !local->resuming)) return; if (WARN_ON(flag & IEEE80211_SDATA_DISCONNECT_HW_RESTART && !local->in_reconfig)) return; if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return; sdata->flags |= flag; list_for_each_entry(key, &sdata->key_list, list) key->flags |= KEY_FLAG_TAINTED; } void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif) { ieee80211_reconfig_disconnect(vif, IEEE80211_SDATA_DISCONNECT_HW_RESTART); } EXPORT_SYMBOL_GPL(ieee80211_hw_restart_disconnect); void ieee80211_resume_disconnect(struct ieee80211_vif *vif) { ieee80211_reconfig_disconnect(vif, IEEE80211_SDATA_DISCONNECT_RESUME); } EXPORT_SYMBOL_GPL(ieee80211_resume_disconnect); void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_chanctx *chanctx; lockdep_assert_wiphy(local->hw.wiphy); chanctx_conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); /* * This function can be called from a work, thus it may be possible * that the chanctx_conf is removed (due to a disconnection, for * example). * So nothing should be done in such case. */ if (!chanctx_conf) return; chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); ieee80211_recalc_smps_chanctx(local, chanctx); } void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata, int link_id) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_chanctx *chanctx; int i; lockdep_assert_wiphy(local->hw.wiphy); for (i = 0; i < ARRAY_SIZE(sdata->vif.link_conf); i++) { struct ieee80211_bss_conf *bss_conf; if (link_id >= 0 && link_id != i) continue; rcu_read_lock(); bss_conf = rcu_dereference(sdata->vif.link_conf[i]); if (!bss_conf) { rcu_read_unlock(); continue; } chanctx_conf = rcu_dereference_protected(bss_conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); /* * Since we hold the wiphy mutex (checked above) * we can take the chanctx_conf pointer out of the * RCU critical section, it cannot go away without * the mutex. Just the way we reached it could - in * theory - go away, but we don't really care and * it really shouldn't happen anyway. */ rcu_read_unlock(); if (!chanctx_conf) return; chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); ieee80211_recalc_chanctx_min_def(local, chanctx, NULL, false); } } size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset) { size_t pos = offset; while (pos < ielen && ies[pos] != WLAN_EID_VENDOR_SPECIFIC) pos += 2 + ies[pos + 1]; return pos; } u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, u16 cap) { __le16 tmp; *pos++ = WLAN_EID_HT_CAPABILITY; *pos++ = sizeof(struct ieee80211_ht_cap); memset(pos, 0, sizeof(struct ieee80211_ht_cap)); /* capability flags */ tmp = cpu_to_le16(cap); memcpy(pos, &tmp, sizeof(u16)); pos += sizeof(u16); /* AMPDU parameters */ *pos++ = ht_cap->ampdu_factor | (ht_cap->ampdu_density << IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT); /* MCS set */ memcpy(pos, &ht_cap->mcs, sizeof(ht_cap->mcs)); pos += sizeof(ht_cap->mcs); /* extended capabilities */ pos += sizeof(__le16); /* BF capabilities */ pos += sizeof(__le32); /* antenna selection */ pos += sizeof(u8); return pos; } u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap) { __le32 tmp; *pos++ = WLAN_EID_VHT_CAPABILITY; *pos++ = sizeof(struct ieee80211_vht_cap); memset(pos, 0, sizeof(struct ieee80211_vht_cap)); /* capability flags */ tmp = cpu_to_le32(cap); memcpy(pos, &tmp, sizeof(u32)); pos += sizeof(u32); /* VHT MCS set */ memcpy(pos, &vht_cap->vht_mcs, sizeof(vht_cap->vht_mcs)); pos += sizeof(vht_cap->vht_mcs); return pos; } /* this may return more than ieee80211_put_he_6ghz_cap() will need */ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata) { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_supported_band *sband; u8 n; sband = ieee80211_get_sband(sdata); if (!sband) return 0; he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (!he_cap) return 0; n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem); return 2 + 1 + sizeof(he_cap->he_cap_elem) + n + ieee80211_he_ppe_size(he_cap->ppe_thres[0], he_cap->he_cap_elem.phy_cap_info); } static void ieee80211_get_adjusted_he_cap(const struct ieee80211_conn_settings *conn, const struct ieee80211_sta_he_cap *he_cap, struct ieee80211_he_cap_elem *elem) { u8 ru_limit, max_ru; *elem = he_cap->he_cap_elem; switch (conn->bw_limit) { case IEEE80211_CONN_BW_LIMIT_20: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242; break; case IEEE80211_CONN_BW_LIMIT_40: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484; break; case IEEE80211_CONN_BW_LIMIT_80: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996; break; default: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996; break; } max_ru = elem->phy_cap_info[8] & IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK; max_ru = min(max_ru, ru_limit); elem->phy_cap_info[8] &= ~IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK; elem->phy_cap_info[8] |= max_ru; if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) { elem->phy_cap_info[0] &= ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G); elem->phy_cap_info[9] &= ~IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM; } if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160) { elem->phy_cap_info[0] &= ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G); elem->phy_cap_info[5] &= ~IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK; elem->phy_cap_info[7] &= ~(IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ | IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ); } } int ieee80211_put_he_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn) { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_he_cap_elem elem; u8 *len; u8 n; u8 ie_len; if (!conn) conn = &ieee80211_conn_settings_unlimited; he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (!he_cap) return 0; /* modify on stack first to calculate 'n' and 'ie_len' correctly */ ieee80211_get_adjusted_he_cap(conn, he_cap, &elem); n = ieee80211_he_mcs_nss_size(&elem); ie_len = 2 + 1 + sizeof(he_cap->he_cap_elem) + n + ieee80211_he_ppe_size(he_cap->ppe_thres[0], he_cap->he_cap_elem.phy_cap_info); if (skb_tailroom(skb) < ie_len) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_EXTENSION); len = skb_put(skb, 1); /* We'll set the size later below */ skb_put_u8(skb, WLAN_EID_EXT_HE_CAPABILITY); /* Fixed data */ skb_put_data(skb, &elem, sizeof(elem)); skb_put_data(skb, &he_cap->he_mcs_nss_supp, n); /* Check if PPE Threshold should be present */ if ((he_cap->he_cap_elem.phy_cap_info[6] & IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) goto end; /* * Calculate how many PPET16/PPET8 pairs are to come. Algorithm: * (NSS_M1 + 1) x (num of 1 bits in RU_INDEX_BITMASK) */ n = hweight8(he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); n *= (1 + ((he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) >> IEEE80211_PPE_THRES_NSS_POS)); /* * Each pair is 6 bits, and we need to add the 7 "header" bits to the * total size. */ n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; n = DIV_ROUND_UP(n, 8); /* Copy PPE Thresholds */ skb_put_data(skb, &he_cap->ppe_thres, n); end: *len = skb_tail_pointer(skb) - len - 1; return 0; } int ieee80211_put_he_6ghz_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps_mode) { struct ieee80211_supported_band *sband; const struct ieee80211_sband_iftype_data *iftd; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); __le16 cap; if (!cfg80211_any_usable_channels(sdata->local->hw.wiphy, BIT(NL80211_BAND_6GHZ), IEEE80211_CHAN_NO_HE)) return 0; sband = sdata->local->hw.wiphy->bands[NL80211_BAND_6GHZ]; iftd = ieee80211_get_sband_iftype_data(sband, iftype); if (!iftd) return 0; /* Check for device HE 6 GHz capability before adding element */ if (!iftd->he_6ghz_capa.capa) return 0; cap = iftd->he_6ghz_capa.capa; cap &= cpu_to_le16(~IEEE80211_HE_6GHZ_CAP_SM_PS); switch (smps_mode) { case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); fallthrough; case IEEE80211_SMPS_OFF: cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED, IEEE80211_HE_6GHZ_CAP_SM_PS); break; case IEEE80211_SMPS_STATIC: cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_STATIC, IEEE80211_HE_6GHZ_CAP_SM_PS); break; case IEEE80211_SMPS_DYNAMIC: cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_DYNAMIC, IEEE80211_HE_6GHZ_CAP_SM_PS); break; } if (skb_tailroom(skb) < 2 + 1 + sizeof(cap)) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_EXTENSION); skb_put_u8(skb, 1 + sizeof(cap)); skb_put_u8(skb, WLAN_EID_EXT_HE_6GHZ_CAPA); skb_put_data(skb, &cap, sizeof(cap)); return 0; } u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode) { struct ieee80211_ht_operation *ht_oper; /* Build HT Information */ *pos++ = WLAN_EID_HT_OPERATION; *pos++ = sizeof(struct ieee80211_ht_operation); ht_oper = (struct ieee80211_ht_operation *)pos; ht_oper->primary_chan = ieee80211_frequency_to_channel( chandef->chan->center_freq); switch (chandef->width) { case NL80211_CHAN_WIDTH_160: case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_80: case NL80211_CHAN_WIDTH_40: if (chandef->center_freq1 > chandef->chan->center_freq) ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_ABOVE; else ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_BELOW; break; case NL80211_CHAN_WIDTH_320: /* HT information element should not be included on 6GHz */ WARN_ON(1); return pos; default: ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_NONE; break; } if (ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 && chandef->width != NL80211_CHAN_WIDTH_20_NOHT && chandef->width != NL80211_CHAN_WIDTH_20) ht_oper->ht_param |= IEEE80211_HT_PARAM_CHAN_WIDTH_ANY; if (rifs_mode) ht_oper->ht_param |= IEEE80211_HT_PARAM_RIFS_MODE; ht_oper->operation_mode = cpu_to_le16(prot_mode); ht_oper->stbc_param = 0x0000; /* It seems that Basic MCS set and Supported MCS set are identical for the first 10 bytes */ memset(&ht_oper->basic_set, 0, 16); memcpy(&ht_oper->basic_set, &ht_cap->mcs, 10); return pos + sizeof(struct ieee80211_ht_operation); } void ieee80211_ie_build_wide_bw_cs(u8 *pos, const struct cfg80211_chan_def *chandef) { *pos++ = WLAN_EID_WIDE_BW_CHANNEL_SWITCH; /* EID */ *pos++ = 3; /* IE length */ /* New channel width */ switch (chandef->width) { case NL80211_CHAN_WIDTH_80: *pos++ = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_160: *pos++ = IEEE80211_VHT_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80P80: *pos++ = IEEE80211_VHT_CHANWIDTH_80P80MHZ; break; case NL80211_CHAN_WIDTH_320: /* The behavior is not defined for 320 MHz channels */ WARN_ON(1); fallthrough; default: *pos++ = IEEE80211_VHT_CHANWIDTH_USE_HT; } /* new center frequency segment 0 */ *pos++ = ieee80211_frequency_to_channel(chandef->center_freq1); /* new center frequency segment 1 */ if (chandef->center_freq2) *pos++ = ieee80211_frequency_to_channel(chandef->center_freq2); else *pos++ = 0; } u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef) { struct ieee80211_vht_operation *vht_oper; *pos++ = WLAN_EID_VHT_OPERATION; *pos++ = sizeof(struct ieee80211_vht_operation); vht_oper = (struct ieee80211_vht_operation *)pos; vht_oper->center_freq_seg0_idx = ieee80211_frequency_to_channel( chandef->center_freq1); if (chandef->center_freq2) vht_oper->center_freq_seg1_idx = ieee80211_frequency_to_channel(chandef->center_freq2); else vht_oper->center_freq_seg1_idx = 0x00; switch (chandef->width) { case NL80211_CHAN_WIDTH_160: /* * Convert 160 MHz channel width to new style as interop * workaround. */ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; vht_oper->center_freq_seg1_idx = vht_oper->center_freq_seg0_idx; if (chandef->chan->center_freq < chandef->center_freq1) vht_oper->center_freq_seg0_idx -= 8; else vht_oper->center_freq_seg0_idx += 8; break; case NL80211_CHAN_WIDTH_80P80: /* * Convert 80+80 MHz channel width to new style as interop * workaround. */ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_80: vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_320: /* VHT information element should not be included on 6GHz */ WARN_ON(1); return pos; default: vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_USE_HT; break; } /* don't require special VHT peer rates */ vht_oper->basic_mcs_set = cpu_to_le16(0xffff); return pos + sizeof(struct ieee80211_vht_operation); } u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef) { struct ieee80211_he_operation *he_oper; struct ieee80211_he_6ghz_oper *he_6ghz_op; struct cfg80211_chan_def he_chandef; u32 he_oper_params; u8 ie_len = 1 + sizeof(struct ieee80211_he_operation); if (chandef->chan->band == NL80211_BAND_6GHZ) ie_len += sizeof(struct ieee80211_he_6ghz_oper); *pos++ = WLAN_EID_EXTENSION; *pos++ = ie_len; *pos++ = WLAN_EID_EXT_HE_OPERATION; he_oper_params = 0; he_oper_params |= u32_encode_bits(1023, /* disabled */ IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_ER_SU_DISABLE); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); if (chandef->chan->band == NL80211_BAND_6GHZ) he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_6GHZ_OP_INFO); he_oper = (struct ieee80211_he_operation *)pos; he_oper->he_oper_params = cpu_to_le32(he_oper_params); /* don't require special HE peer rates */ he_oper->he_mcs_nss_set = cpu_to_le16(0xffff); pos += sizeof(struct ieee80211_he_operation); if (chandef->chan->band != NL80211_BAND_6GHZ) goto out; cfg80211_chandef_create(&he_chandef, chandef->chan, NL80211_CHAN_NO_HT); he_chandef.center_freq1 = chandef->center_freq1; he_chandef.center_freq2 = chandef->center_freq2; he_chandef.width = chandef->width; /* TODO add VHT operational */ he_6ghz_op = (struct ieee80211_he_6ghz_oper *)pos; he_6ghz_op->minrate = 6; /* 6 Mbps */ he_6ghz_op->primary = ieee80211_frequency_to_channel(he_chandef.chan->center_freq); he_6ghz_op->ccfs0 = ieee80211_frequency_to_channel(he_chandef.center_freq1); if (he_chandef.center_freq2) he_6ghz_op->ccfs1 = ieee80211_frequency_to_channel(he_chandef.center_freq2); else he_6ghz_op->ccfs1 = 0; switch (he_chandef.width) { case NL80211_CHAN_WIDTH_320: /* Downgrade EHT 320 MHz BW to 160 MHz for HE and set new * center_freq1 */ ieee80211_chandef_downgrade(&he_chandef, NULL); he_6ghz_op->ccfs0 = ieee80211_frequency_to_channel(he_chandef.center_freq1); fallthrough; case NL80211_CHAN_WIDTH_160: /* Convert 160 MHz channel width to new style as interop * workaround. */ he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; he_6ghz_op->ccfs1 = he_6ghz_op->ccfs0; if (he_chandef.chan->center_freq < he_chandef.center_freq1) he_6ghz_op->ccfs0 -= 8; else he_6ghz_op->ccfs0 += 8; fallthrough; case NL80211_CHAN_WIDTH_80P80: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_40: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; break; default: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; break; } pos += sizeof(struct ieee80211_he_6ghz_oper); out: return pos; } u8 *ieee80211_ie_build_eht_oper(u8 *pos, const struct cfg80211_chan_def *chandef, const struct ieee80211_sta_eht_cap *eht_cap) { const struct ieee80211_eht_mcs_nss_supp_20mhz_only *eht_mcs_nss = &eht_cap->eht_mcs_nss_supp.only_20mhz; struct ieee80211_eht_operation *eht_oper; struct ieee80211_eht_operation_info *eht_oper_info; u8 eht_oper_len = offsetof(struct ieee80211_eht_operation, optional); u8 eht_oper_info_len = offsetof(struct ieee80211_eht_operation_info, optional); u8 chan_width = 0; *pos++ = WLAN_EID_EXTENSION; *pos++ = 1 + eht_oper_len + eht_oper_info_len; *pos++ = WLAN_EID_EXT_EHT_OPERATION; eht_oper = (struct ieee80211_eht_operation *)pos; memcpy(&eht_oper->basic_mcs_nss, eht_mcs_nss, sizeof(*eht_mcs_nss)); eht_oper->params |= IEEE80211_EHT_OPER_INFO_PRESENT; pos += eht_oper_len; eht_oper_info = (struct ieee80211_eht_operation_info *)eht_oper->optional; eht_oper_info->ccfs0 = ieee80211_frequency_to_channel(chandef->center_freq1); if (chandef->center_freq2) eht_oper_info->ccfs1 = ieee80211_frequency_to_channel(chandef->center_freq2); else eht_oper_info->ccfs1 = 0; switch (chandef->width) { case NL80211_CHAN_WIDTH_320: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ; eht_oper_info->ccfs1 = eht_oper_info->ccfs0; if (chandef->chan->center_freq < chandef->center_freq1) eht_oper_info->ccfs0 -= 16; else eht_oper_info->ccfs0 += 16; break; case NL80211_CHAN_WIDTH_160: eht_oper_info->ccfs1 = eht_oper_info->ccfs0; if (chandef->chan->center_freq < chandef->center_freq1) eht_oper_info->ccfs0 -= 8; else eht_oper_info->ccfs0 += 8; fallthrough; case NL80211_CHAN_WIDTH_80P80: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_40: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ; break; default: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ; break; } eht_oper_info->control = chan_width; pos += eht_oper_info_len; /* TODO: eht_oper_info->optional */ return pos; } bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef) { enum nl80211_channel_type channel_type; if (!ht_oper) return false; switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { case IEEE80211_HT_PARAM_CHA_SEC_NONE: channel_type = NL80211_CHAN_HT20; break; case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: channel_type = NL80211_CHAN_HT40PLUS; break; case IEEE80211_HT_PARAM_CHA_SEC_BELOW: channel_type = NL80211_CHAN_HT40MINUS; break; default: return false; } cfg80211_chandef_create(chandef, chandef->chan, channel_type); return true; } bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef) { struct cfg80211_chan_def new = *chandef; int cf0, cf1; int ccfs0, ccfs1, ccfs2; int ccf0, ccf1; u32 vht_cap; bool support_80_80 = false; bool support_160 = false; u8 ext_nss_bw_supp = u32_get_bits(vht_cap_info, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); u8 supp_chwidth = u32_get_bits(vht_cap_info, IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); if (!oper || !htop) return false; vht_cap = hw->wiphy->bands[chandef->chan->band]->vht_cap.cap; support_160 = (vht_cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK | IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)); support_80_80 = ((vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) || (vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) || ((vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) >> IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT > 1)); ccfs0 = oper->center_freq_seg0_idx; ccfs1 = oper->center_freq_seg1_idx; ccfs2 = (le16_to_cpu(htop->operation_mode) & IEEE80211_HT_OP_MODE_CCFS2_MASK) >> IEEE80211_HT_OP_MODE_CCFS2_SHIFT; ccf0 = ccfs0; /* if not supported, parse as though we didn't understand it */ if (!ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) ext_nss_bw_supp = 0; /* * Cf. IEEE 802.11 Table 9-250 * * We really just consider that because it's inefficient to connect * at a higher bandwidth than we'll actually be able to use. */ switch ((supp_chwidth << 4) | ext_nss_bw_supp) { default: case 0x00: ccf1 = 0; support_160 = false; support_80_80 = false; break; case 0x01: support_80_80 = false; fallthrough; case 0x02: case 0x03: ccf1 = ccfs2; break; case 0x10: ccf1 = ccfs1; break; case 0x11: case 0x12: if (!ccfs1) ccf1 = ccfs2; else ccf1 = ccfs1; break; case 0x13: case 0x20: case 0x23: ccf1 = ccfs1; break; } cf0 = ieee80211_channel_to_frequency(ccf0, chandef->chan->band); cf1 = ieee80211_channel_to_frequency(ccf1, chandef->chan->band); switch (oper->chan_width) { case IEEE80211_VHT_CHANWIDTH_USE_HT: /* just use HT information directly */ break; case IEEE80211_VHT_CHANWIDTH_80MHZ: new.width = NL80211_CHAN_WIDTH_80; new.center_freq1 = cf0; /* If needed, adjust based on the newer interop workaround. */ if (ccf1) { unsigned int diff; diff = abs(ccf1 - ccf0); if ((diff == 8) && support_160) { new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf1; } else if ((diff > 8) && support_80_80) { new.width = NL80211_CHAN_WIDTH_80P80; new.center_freq2 = cf1; } } break; case IEEE80211_VHT_CHANWIDTH_160MHZ: /* deprecated encoding */ new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf0; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: /* deprecated encoding */ new.width = NL80211_CHAN_WIDTH_80P80; new.center_freq1 = cf0; new.center_freq2 = cf1; break; default: return false; } if (!cfg80211_chandef_valid(&new)) return false; *chandef = new; return true; } void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info, struct cfg80211_chan_def *chandef) { chandef->center_freq1 = ieee80211_channel_to_frequency(info->ccfs0, chandef->chan->band); switch (u8_get_bits(info->control, IEEE80211_EHT_OPER_CHAN_WIDTH)) { case IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ: chandef->width = NL80211_CHAN_WIDTH_20; break; case IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ: chandef->width = NL80211_CHAN_WIDTH_40; break; case IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ: chandef->width = NL80211_CHAN_WIDTH_80; break; case IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ: chandef->width = NL80211_CHAN_WIDTH_160; chandef->center_freq1 = ieee80211_channel_to_frequency(info->ccfs1, chandef->chan->band); break; case IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ: chandef->width = NL80211_CHAN_WIDTH_320; chandef->center_freq1 = ieee80211_channel_to_frequency(info->ccfs1, chandef->chan->band); break; } } bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, struct cfg80211_chan_def *chandef) { struct cfg80211_chan_def he_chandef = *chandef; const struct ieee80211_he_6ghz_oper *he_6ghz_oper; u32 freq; if (chandef->chan->band != NL80211_BAND_6GHZ) return true; if (!he_oper) return false; he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); if (!he_6ghz_oper) return false; /* * The EHT operation IE does not contain the primary channel so the * primary channel frequency should be taken from the 6 GHz operation * information. */ freq = ieee80211_channel_to_frequency(he_6ghz_oper->primary, NL80211_BAND_6GHZ); he_chandef.chan = ieee80211_get_channel(local->hw.wiphy, freq); if (!he_chandef.chan) return false; if (!eht_oper || !(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) { switch (u8_get_bits(he_6ghz_oper->control, IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH)) { case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ: he_chandef.width = NL80211_CHAN_WIDTH_20; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ: he_chandef.width = NL80211_CHAN_WIDTH_40; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ: he_chandef.width = NL80211_CHAN_WIDTH_80; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ: he_chandef.width = NL80211_CHAN_WIDTH_80; if (!he_6ghz_oper->ccfs1) break; if (abs(he_6ghz_oper->ccfs1 - he_6ghz_oper->ccfs0) == 8) he_chandef.width = NL80211_CHAN_WIDTH_160; else he_chandef.width = NL80211_CHAN_WIDTH_80P80; break; } if (he_chandef.width == NL80211_CHAN_WIDTH_160) { he_chandef.center_freq1 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, NL80211_BAND_6GHZ); } else { he_chandef.center_freq1 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs0, NL80211_BAND_6GHZ); he_chandef.center_freq2 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, NL80211_BAND_6GHZ); } } else { ieee80211_chandef_eht_oper((const void *)eht_oper->optional, &he_chandef); he_chandef.punctured = ieee80211_eht_oper_dis_subchan_bitmap(eht_oper); } if (!cfg80211_chandef_valid(&he_chandef)) return false; *chandef = he_chandef; return true; } bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef) { u32 oper_freq; if (!oper) return false; switch (FIELD_GET(S1G_OPER_CH_WIDTH_OPER, oper->ch_width)) { case IEEE80211_S1G_CHANWIDTH_1MHZ: chandef->width = NL80211_CHAN_WIDTH_1; break; case IEEE80211_S1G_CHANWIDTH_2MHZ: chandef->width = NL80211_CHAN_WIDTH_2; break; case IEEE80211_S1G_CHANWIDTH_4MHZ: chandef->width = NL80211_CHAN_WIDTH_4; break; case IEEE80211_S1G_CHANWIDTH_8MHZ: chandef->width = NL80211_CHAN_WIDTH_8; break; case IEEE80211_S1G_CHANWIDTH_16MHZ: chandef->width = NL80211_CHAN_WIDTH_16; break; default: return false; } oper_freq = ieee80211_channel_to_freq_khz(oper->oper_ch, NL80211_BAND_S1GHZ); chandef->center_freq1 = KHZ_TO_MHZ(oper_freq); chandef->freq1_offset = oper_freq % 1000; return true; } int ieee80211_put_srates_elem(struct sk_buff *skb, const struct ieee80211_supported_band *sband, u32 basic_rates, u32 rate_flags, u32 masked_rates, u8 element_id) { u8 i, rates, skip; rates = 0; for (i = 0; i < sband->n_bitrates; i++) { if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; if (masked_rates & BIT(i)) continue; rates++; } if (element_id == WLAN_EID_SUPP_RATES) { rates = min_t(u8, rates, 8); skip = 0; } else { skip = 8; if (rates <= skip) return 0; rates -= skip; } if (skb_tailroom(skb) < rates + 2) return -ENOBUFS; skb_put_u8(skb, element_id); skb_put_u8(skb, rates); for (i = 0; i < sband->n_bitrates && rates; i++) { int rate; u8 basic; if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; if (masked_rates & BIT(i)) continue; if (skip > 0) { skip--; continue; } basic = basic_rates & BIT(i) ? 0x80 : 0; rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5); skb_put_u8(skb, basic | (u8)rate); rates--; } WARN(rates > 0, "rates confused: rates:%d, element:%d\n", rates, element_id); return 0; } int ieee80211_ave_rssi(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION)) return 0; return -ewma_beacon_signal_read(&sdata->deflink.u.mgd.ave_beacon_signal); } EXPORT_SYMBOL_GPL(ieee80211_ave_rssi); u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs) { if (!mcs) return 1; /* TODO: consider rx_highest */ if (mcs->rx_mask[3]) return 4; if (mcs->rx_mask[2]) return 3; if (mcs->rx_mask[1]) return 2; return 1; } /** * ieee80211_calculate_rx_timestamp - calculate timestamp in frame * @local: mac80211 hw info struct * @status: RX status * @mpdu_len: total MPDU length (including FCS) * @mpdu_offset: offset into MPDU to calculate timestamp at * * This function calculates the RX timestamp at the given MPDU offset, taking * into account what the RX timestamp was. An offset of 0 will just normalize * the timestamp to TSF at beginning of MPDU reception. * * Returns: the calculated timestamp */ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, struct ieee80211_rx_status *status, unsigned int mpdu_len, unsigned int mpdu_offset) { u64 ts = status->mactime; bool mactime_plcp_start; struct rate_info ri; u16 rate; u8 n_ltf; if (WARN_ON(!ieee80211_have_rx_timestamp(status))) return 0; mactime_plcp_start = (status->flag & RX_FLAG_MACTIME) == RX_FLAG_MACTIME_PLCP_START; memset(&ri, 0, sizeof(ri)); ri.bw = status->bw; /* Fill cfg80211 rate info */ switch (status->encoding) { case RX_ENC_EHT: ri.flags |= RATE_INFO_FLAGS_EHT_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; ri.eht_ru_alloc = status->eht.ru; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* TODO/FIXME: is this right? handle other PPDUs */ if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; } break; case RX_ENC_HE: ri.flags |= RATE_INFO_FLAGS_HE_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; ri.he_ru_alloc = status->he_ru; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11ax_D6.0, section 27.3.4 for * VHT PPDU format. */ if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; /* * TODO: * For HE MU PPDU, add the HE-SIG-B. * For HE ER PPDU, add 8us for the HE-SIG-A. * For HE TB PPDU, add 4us for the HE-STF. * Add the HE-LTF durations - variable. */ } break; case RX_ENC_HT: ri.mcs = status->rate_idx; ri.flags |= RATE_INFO_FLAGS_MCS; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11REVmd_D3.0, section 19.3.2 for * HT PPDU format. */ if (mactime_plcp_start) { mpdu_offset += 2; if (status->enc_flags & RX_ENC_FLAG_HT_GF) ts += 24; else ts += 32; /* * Add Data HT-LTFs per streams * TODO: add Extension HT-LTFs, 4us per LTF */ n_ltf = ((ri.mcs >> 3) & 3) + 1; n_ltf = n_ltf == 3 ? 4 : n_ltf; ts += n_ltf * 4; } break; case RX_ENC_VHT: ri.flags |= RATE_INFO_FLAGS_VHT_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11REVmd_D3.0, section 21.3.2 for * VHT PPDU format. */ if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; /* * Add VHT-LTFs per streams */ n_ltf = (ri.nss != 1) && (ri.nss % 2) ? ri.nss + 1 : ri.nss; ts += 4 * n_ltf; } break; default: WARN_ON(1); fallthrough; case RX_ENC_LEGACY: { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[status->band]; ri.legacy = sband->bitrates[status->rate_idx].bitrate; if (mactime_plcp_start) { if (status->band == NL80211_BAND_5GHZ) { ts += 20; mpdu_offset += 2; } else if (status->enc_flags & RX_ENC_FLAG_SHORTPRE) { ts += 96; } else { ts += 192; } } break; } } rate = cfg80211_calculate_bitrate(&ri); if (WARN_ONCE(!rate, "Invalid bitrate: flags=0x%llx, idx=%d, vht_nss=%d\n", (unsigned long long)status->flag, status->rate_idx, status->nss)) return 0; /* rewind from end of MPDU */ if ((status->flag & RX_FLAG_MACTIME) == RX_FLAG_MACTIME_END) ts -= mpdu_len * 8 * 10 / rate; ts += mpdu_offset * 8 * 10 / rate; return ts; } /* Cancel CAC for the interfaces under the specified @local. If @ctx is * also provided, only the interfaces using that ctx will be canceled. */ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_sub_if_data *sdata; struct cfg80211_chan_def chandef; struct ieee80211_link_data *link; struct ieee80211_chanctx_conf *chanctx_conf; unsigned int link_id; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; chanctx_conf = sdata_dereference(link->conf->chanctx_conf, sdata); if (ctx && &ctx->conf != chanctx_conf) continue; wiphy_delayed_work_cancel(local->hw.wiphy, &link->dfs_cac_timer_work); if (!sdata->wdev.links[link_id].cac_started) continue; chandef = link->conf->chanreq.oper; ieee80211_link_release_channel(link); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, GFP_KERNEL, link_id); } } } void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, radar_detected_work); struct cfg80211_chan_def chandef; struct ieee80211_chanctx *ctx; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (!ctx->radar_detected) continue; ctx->radar_detected = false; chandef = ctx->conf.def; ieee80211_dfs_cac_cancel(local, ctx); cfg80211_radar_event(local->hw.wiphy, &chandef, GFP_KERNEL); } } static void ieee80211_radar_mark_chan_ctx_iterator(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf, void *data) { struct ieee80211_chanctx *ctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) return; if (data && data != chanctx_conf) return; ctx->radar_detected = true; } void ieee80211_radar_detected(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf) { struct ieee80211_local *local = hw_to_local(hw); trace_api_radar_detected(local); ieee80211_iter_chan_contexts_atomic(hw, ieee80211_radar_mark_chan_ctx_iterator, chanctx_conf); wiphy_work_queue(hw->wiphy, &local->radar_detected_work); } EXPORT_SYMBOL(ieee80211_radar_detected); void ieee80211_chandef_downgrade(struct cfg80211_chan_def *c, struct ieee80211_conn_settings *conn) { enum nl80211_chan_width new_primary_width; struct ieee80211_conn_settings _ignored = {}; /* allow passing NULL if caller doesn't care */ if (!conn) conn = &_ignored; again: /* no-HT indicates nothing to do */ new_primary_width = NL80211_CHAN_WIDTH_20_NOHT; switch (c->width) { default: case NL80211_CHAN_WIDTH_20_NOHT: WARN_ON_ONCE(1); fallthrough; case NL80211_CHAN_WIDTH_20: c->width = NL80211_CHAN_WIDTH_20_NOHT; conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; c->punctured = 0; break; case NL80211_CHAN_WIDTH_40: c->width = NL80211_CHAN_WIDTH_20; c->center_freq1 = c->chan->center_freq; if (conn->mode == IEEE80211_CONN_MODE_VHT) conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; c->punctured = 0; break; case NL80211_CHAN_WIDTH_80: new_primary_width = NL80211_CHAN_WIDTH_40; if (conn->mode == IEEE80211_CONN_MODE_VHT) conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_40; break; case NL80211_CHAN_WIDTH_80P80: c->center_freq2 = 0; c->width = NL80211_CHAN_WIDTH_80; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; break; case NL80211_CHAN_WIDTH_160: new_primary_width = NL80211_CHAN_WIDTH_80; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; break; case NL80211_CHAN_WIDTH_320: new_primary_width = NL80211_CHAN_WIDTH_160; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_160; break; case NL80211_CHAN_WIDTH_1: case NL80211_CHAN_WIDTH_2: case NL80211_CHAN_WIDTH_4: case NL80211_CHAN_WIDTH_8: case NL80211_CHAN_WIDTH_16: WARN_ON_ONCE(1); /* keep c->width */ conn->mode = IEEE80211_CONN_MODE_S1G; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: WARN_ON_ONCE(1); /* keep c->width */ conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; } if (new_primary_width != NL80211_CHAN_WIDTH_20_NOHT) { c->center_freq1 = cfg80211_chandef_primary(c, new_primary_width, &c->punctured); c->width = new_primary_width; } /* * With an 80 MHz channel, we might have the puncturing in the primary * 40 Mhz channel, but that's not valid when downgraded to 40 MHz width. * In that case, downgrade again. */ if (!cfg80211_chandef_valid(c) && c->punctured) goto again; WARN_ON_ONCE(!cfg80211_chandef_valid(c)); } int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings) { struct sk_buff *skb; struct ieee80211_mgmt *mgmt; struct ieee80211_local *local = sdata->local; int freq; int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.chan_switch); u8 *pos; if (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT) return -EOPNOTSUPP; skb = dev_alloc_skb(local->tx_headroom + hdr_len + 5 + /* channel switch announcement element */ 3 + /* secondary channel offset element */ 5 + /* wide bandwidth channel switch announcement */ 8); /* mesh channel switch parameters element */ if (!skb) return -ENOMEM; skb_reserve(skb, local->tx_headroom); mgmt = skb_put_zero(skb, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); eth_broadcast_addr(mgmt->da); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); if (ieee80211_vif_is_mesh(&sdata->vif)) { memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); } else { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN); } mgmt->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT; mgmt->u.action.u.chan_switch.action_code = WLAN_ACTION_SPCT_CHL_SWITCH; pos = skb_put(skb, 5); *pos++ = WLAN_EID_CHANNEL_SWITCH; /* EID */ *pos++ = 3; /* IE length */ *pos++ = csa_settings->block_tx ? 1 : 0; /* CSA mode */ freq = csa_settings->chandef.chan->center_freq; *pos++ = ieee80211_frequency_to_channel(freq); /* channel */ *pos++ = csa_settings->count; /* count */ if (csa_settings->chandef.width == NL80211_CHAN_WIDTH_40) { enum nl80211_channel_type ch_type; skb_put(skb, 3); *pos++ = WLAN_EID_SECONDARY_CHANNEL_OFFSET; /* EID */ *pos++ = 1; /* IE length */ ch_type = cfg80211_get_chandef_type(&csa_settings->chandef); if (ch_type == NL80211_CHAN_HT40PLUS) *pos++ = IEEE80211_HT_PARAM_CHA_SEC_ABOVE; else *pos++ = IEEE80211_HT_PARAM_CHA_SEC_BELOW; } if (ieee80211_vif_is_mesh(&sdata->vif)) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; skb_put(skb, 8); *pos++ = WLAN_EID_CHAN_SWITCH_PARAM; /* EID */ *pos++ = 6; /* IE length */ *pos++ = sdata->u.mesh.mshcfg.dot11MeshTTL; /* Mesh TTL */ *pos = 0x00; /* Mesh Flag: Tx Restrict, Initiator, Reason */ *pos |= WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR; *pos++ |= csa_settings->block_tx ? WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT : 0x00; put_unaligned_le16(WLAN_REASON_MESH_CHAN, pos); /* Reason Cd */ pos += 2; put_unaligned_le16(ifmsh->pre_value, pos);/* Precedence Value */ pos += 2; } if (csa_settings->chandef.width == NL80211_CHAN_WIDTH_80 || csa_settings->chandef.width == NL80211_CHAN_WIDTH_80P80 || csa_settings->chandef.width == NL80211_CHAN_WIDTH_160) { skb_put(skb, 5); ieee80211_ie_build_wide_bw_cs(pos, &csa_settings->chandef); } ieee80211_tx_skb(sdata, skb); return 0; } static bool ieee80211_extend_noa_desc(struct ieee80211_noa_data *data, u32 tsf, int i) { s32 end = data->desc[i].start + data->desc[i].duration - (tsf + 1); int skip; if (end > 0) return false; /* One shot NOA */ if (data->count[i] == 1) return false; if (data->desc[i].interval == 0) return false; /* End time is in the past, check for repetitions */ skip = DIV_ROUND_UP(-end, data->desc[i].interval); if (data->count[i] < 255) { if (data->count[i] <= skip) { data->count[i] = 0; return false; } data->count[i] -= skip; } data->desc[i].start += skip * data->desc[i].interval; return true; } static bool ieee80211_extend_absent_time(struct ieee80211_noa_data *data, u32 tsf, s32 *offset) { bool ret = false; int i; for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { s32 cur; if (!data->count[i]) continue; if (ieee80211_extend_noa_desc(data, tsf + *offset, i)) ret = true; cur = data->desc[i].start - tsf; if (cur > *offset) continue; cur = data->desc[i].start + data->desc[i].duration - tsf; if (cur > *offset) *offset = cur; } return ret; } static u32 ieee80211_get_noa_absent_time(struct ieee80211_noa_data *data, u32 tsf) { s32 offset = 0; int tries = 0; /* * arbitrary limit, used to avoid infinite loops when combined NoA * descriptors cover the full time period. */ int max_tries = 5; ieee80211_extend_absent_time(data, tsf, &offset); do { if (!ieee80211_extend_absent_time(data, tsf, &offset)) break; tries++; } while (tries < max_tries); return offset; } void ieee80211_update_p2p_noa(struct ieee80211_noa_data *data, u32 tsf) { u32 next_offset = BIT(31) - 1; int i; data->absent = 0; data->has_next_tsf = false; for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { s32 start; if (!data->count[i]) continue; ieee80211_extend_noa_desc(data, tsf, i); start = data->desc[i].start - tsf; if (start <= 0) data->absent |= BIT(i); if (next_offset > start) next_offset = start; data->has_next_tsf = true; } if (data->absent) next_offset = ieee80211_get_noa_absent_time(data, tsf); data->next_tsf = tsf + next_offset; } EXPORT_SYMBOL(ieee80211_update_p2p_noa); int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr, struct ieee80211_noa_data *data, u32 tsf) { int ret = 0; int i; memset(data, 0, sizeof(*data)); for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { const struct ieee80211_p2p_noa_desc *desc = &attr->desc[i]; if (!desc->count || !desc->duration) continue; data->count[i] = desc->count; data->desc[i].start = le32_to_cpu(desc->start_time); data->desc[i].duration = le32_to_cpu(desc->duration); data->desc[i].interval = le32_to_cpu(desc->interval); if (data->count[i] > 1 && data->desc[i].interval < data->desc[i].duration) continue; ieee80211_extend_noa_desc(data, tsf, i); ret++; } if (ret) ieee80211_update_p2p_noa(data, tsf); return ret; } EXPORT_SYMBOL(ieee80211_parse_p2p_noa); void ieee80211_recalc_dtim(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { u64 tsf = drv_get_tsf(local, sdata); u64 dtim_count = 0; u16 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; u8 dtim_period = sdata->vif.bss_conf.dtim_period; struct ps_data *ps; u8 bcns_from_dtim; if (tsf == -1ULL || !beacon_int || !dtim_period) return; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (!sdata->bss) return; ps = &sdata->bss->ps; } else if (ieee80211_vif_is_mesh(&sdata->vif)) { ps = &sdata->u.mesh.ps; } else { return; } /* * actually finds last dtim_count, mac80211 will update in * __beacon_add_tim(). * dtim_count = dtim_period - (tsf / bcn_int) % dtim_period */ do_div(tsf, beacon_int); bcns_from_dtim = do_div(tsf, dtim_period); /* just had a DTIM */ if (!bcns_from_dtim) dtim_count = 0; else dtim_count = dtim_period - bcns_from_dtim; ps->dtim_count = dtim_count; } static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_link_data *link; u8 radar_detect = 0; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)) return 0; list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) if (link->reserved_radar_required) radar_detect |= BIT(link->reserved.oper.width); /* * An in-place reservation context should not have any assigned vifs * until it replaces the other context. */ WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER && !list_empty(&ctx->assigned_links)); list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) { if (!link->radar_required) continue; radar_detect |= BIT(link->conf->chanreq.oper.width); } return radar_detect; } static u32 __ieee80211_get_radio_mask(struct ieee80211_sub_if_data *sdata) { struct ieee80211_bss_conf *link_conf; struct ieee80211_chanctx_conf *conf; unsigned int link_id; u32 mask = 0; for_each_vif_active_link(&sdata->vif, link_conf, link_id) { conf = sdata_dereference(link_conf->chanctx_conf, sdata); if (!conf || conf->radio_idx < 0) continue; mask |= BIT(conf->radio_idx); } return mask; } u32 ieee80211_get_radio_mask(struct wiphy *wiphy, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); return __ieee80211_get_radio_mask(sdata); } static bool ieee80211_sdata_uses_radio(struct ieee80211_sub_if_data *sdata, int radio_idx) { if (radio_idx < 0) return true; return __ieee80211_get_radio_mask(sdata) & BIT(radio_idx); } static int ieee80211_fill_ifcomb_params(struct ieee80211_local *local, struct iface_combination_params *params, const struct cfg80211_chan_def *chandef, struct ieee80211_sub_if_data *sdata) { struct ieee80211_sub_if_data *sdata_iter; struct ieee80211_chanctx *ctx; int total = !!sdata; list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) continue; if (params->radio_idx >= 0 && ctx->conf.radio_idx != params->radio_idx) continue; params->radar_detect |= ieee80211_chanctx_radar_detect(local, ctx); if (chandef && ctx->mode != IEEE80211_CHANCTX_EXCLUSIVE && cfg80211_chandef_compatible(chandef, &ctx->conf.def)) continue; params->num_different_channels++; } list_for_each_entry(sdata_iter, &local->interfaces, list) { struct wireless_dev *wdev_iter; wdev_iter = &sdata_iter->wdev; if (sdata_iter == sdata || !ieee80211_sdata_running(sdata_iter) || cfg80211_iftype_allowed(local->hw.wiphy, wdev_iter->iftype, 0, 1)) continue; if (!ieee80211_sdata_uses_radio(sdata_iter, params->radio_idx)) continue; params->iftype_num[wdev_iter->iftype]++; total++; } return total; } int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode chanmode, u8 radar_detect, int radio_idx) { bool shared = chanmode == IEEE80211_CHANCTX_SHARED; struct ieee80211_local *local = sdata->local; enum nl80211_iftype iftype = sdata->wdev.iftype; struct iface_combination_params params = { .radar_detect = radar_detect, .radio_idx = radio_idx, }; int total; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(hweight32(radar_detect) > 1)) return -EINVAL; if (WARN_ON(chandef && chanmode == IEEE80211_CHANCTX_SHARED && !chandef->chan)) return -EINVAL; if (WARN_ON(iftype >= NUM_NL80211_IFTYPES)) return -EINVAL; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_MESH_POINT) { /* * always passing this is harmless, since it'll be the * same value that cfg80211 finds if it finds the same * interface ... and that's always allowed */ params.new_beacon_int = sdata->vif.bss_conf.beacon_int; } /* Always allow software iftypes */ if (cfg80211_iftype_allowed(local->hw.wiphy, iftype, 0, 1)) { if (radar_detect) return -EINVAL; return 0; } if (chandef) params.num_different_channels = 1; if (iftype != NL80211_IFTYPE_UNSPECIFIED) params.iftype_num[iftype] = 1; total = ieee80211_fill_ifcomb_params(local, &params, shared ? chandef : NULL, sdata); if (total == 1 && !params.radar_detect) return 0; return cfg80211_check_combinations(local->hw.wiphy, &params); } static void ieee80211_iter_max_chans(const struct ieee80211_iface_combination *c, void *data) { u32 *max_num_different_channels = data; *max_num_different_channels = max(*max_num_different_channels, c->num_different_channels); } int ieee80211_max_num_channels(struct ieee80211_local *local, int radio_idx) { u32 max_num_different_channels = 1; int err; struct iface_combination_params params = { .radio_idx = radio_idx, }; lockdep_assert_wiphy(local->hw.wiphy); ieee80211_fill_ifcomb_params(local, &params, NULL, NULL); err = cfg80211_iter_combinations(local->hw.wiphy, &params, ieee80211_iter_max_chans, &max_num_different_channels); if (err < 0) return err; return max_num_different_channels; } void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_s1g_cap *caps, struct sk_buff *skb) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_s1g_cap s1g_capab; u8 *pos; int i; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; if (!caps->s1g) return; memcpy(s1g_capab.capab_info, caps->cap, sizeof(caps->cap)); memcpy(s1g_capab.supp_mcs_nss, caps->nss_mcs, sizeof(caps->nss_mcs)); /* override the capability info */ for (i = 0; i < sizeof(ifmgd->s1g_capa.capab_info); i++) { u8 mask = ifmgd->s1g_capa_mask.capab_info[i]; s1g_capab.capab_info[i] &= ~mask; s1g_capab.capab_info[i] |= ifmgd->s1g_capa.capab_info[i] & mask; } /* then MCS and NSS set */ for (i = 0; i < sizeof(ifmgd->s1g_capa.supp_mcs_nss); i++) { u8 mask = ifmgd->s1g_capa_mask.supp_mcs_nss[i]; s1g_capab.supp_mcs_nss[i] &= ~mask; s1g_capab.supp_mcs_nss[i] |= ifmgd->s1g_capa.supp_mcs_nss[i] & mask; } pos = skb_put(skb, 2 + sizeof(s1g_capab)); *pos++ = WLAN_EID_S1G_CAPABILITIES; *pos++ = sizeof(s1g_capab); memcpy(pos, &s1g_capab, sizeof(s1g_capab)); } void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { u8 *pos = skb_put(skb, 3); *pos++ = WLAN_EID_AID_REQUEST; *pos++ = 1; *pos++ = 0; } u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo) { *buf++ = WLAN_EID_VENDOR_SPECIFIC; *buf++ = 7; /* len */ *buf++ = 0x00; /* Microsoft OUI 00:50:F2 */ *buf++ = 0x50; *buf++ = 0xf2; *buf++ = 2; /* WME */ *buf++ = 0; /* WME info */ *buf++ = 1; /* WME ver */ *buf++ = qosinfo; /* U-APSD no in use */ return buf; } void ieee80211_txq_get_depth(struct ieee80211_txq *txq, unsigned long *frame_cnt, unsigned long *byte_cnt) { struct txq_info *txqi = to_txq_info(txq); u32 frag_cnt = 0, frag_bytes = 0; struct sk_buff *skb; skb_queue_walk(&txqi->frags, skb) { frag_cnt++; frag_bytes += skb->len; } if (frame_cnt) *frame_cnt = txqi->tin.backlog_packets + frag_cnt; if (byte_cnt) *byte_cnt = txqi->tin.backlog_bytes + frag_bytes; } EXPORT_SYMBOL(ieee80211_txq_get_depth); const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS] = { IEEE80211_WMM_IE_STA_QOSINFO_AC_VO, IEEE80211_WMM_IE_STA_QOSINFO_AC_VI, IEEE80211_WMM_IE_STA_QOSINFO_AC_BE, IEEE80211_WMM_IE_STA_QOSINFO_AC_BK }; u16 ieee80211_encode_usf(int listen_interval) { static const int listen_int_usf[] = { 1, 10, 1000, 10000 }; u16 ui, usf = 0; /* find greatest USF */ while (usf < IEEE80211_MAX_USF) { if (listen_interval % listen_int_usf[usf + 1]) break; usf += 1; } ui = listen_interval / listen_int_usf[usf]; /* error if there is a remainder. Should've been checked by user */ WARN_ON_ONCE(ui > IEEE80211_MAX_UI); listen_interval = FIELD_PREP(LISTEN_INT_USF, usf) | FIELD_PREP(LISTEN_INT_UI, ui); return (u16) listen_interval; } /* this may return more than ieee80211_put_eht_cap() will need */ u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata) { const struct ieee80211_sta_he_cap *he_cap; const struct ieee80211_sta_eht_cap *eht_cap; struct ieee80211_supported_band *sband; bool is_ap; u8 n; sband = ieee80211_get_sband(sdata); if (!sband) return 0; he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); if (!he_cap || !eht_cap) return 0; is_ap = sdata->vif.type == NL80211_IFTYPE_AP; n = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, &eht_cap->eht_cap_elem, is_ap); return 2 + 1 + sizeof(eht_cap->eht_cap_elem) + n + ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0], eht_cap->eht_cap_elem.phy_cap_info); return 0; } int ieee80211_put_eht_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn) { const struct ieee80211_sta_he_cap *he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); const struct ieee80211_sta_eht_cap *eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); bool for_ap = sdata->vif.type == NL80211_IFTYPE_AP; struct ieee80211_eht_cap_elem_fixed fixed; struct ieee80211_he_cap_elem he; u8 mcs_nss_len, ppet_len; u8 orig_mcs_nss_len; u8 ie_len; if (!conn) conn = &ieee80211_conn_settings_unlimited; /* Make sure we have place for the IE */ if (!he_cap || !eht_cap) return 0; orig_mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, &eht_cap->eht_cap_elem, for_ap); ieee80211_get_adjusted_he_cap(conn, he_cap, &he); fixed = eht_cap->eht_cap_elem; if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_80) fixed.phy_cap_info[6] &= ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ; if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160) { fixed.phy_cap_info[1] &= ~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK; fixed.phy_cap_info[2] &= ~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK; fixed.phy_cap_info[6] &= ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ; } if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_320) { fixed.phy_cap_info[0] &= ~IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ; fixed.phy_cap_info[1] &= ~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK; fixed.phy_cap_info[2] &= ~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK; fixed.phy_cap_info[6] &= ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ; } if (conn->bw_limit == IEEE80211_CONN_BW_LIMIT_20) fixed.phy_cap_info[0] &= ~IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ; mcs_nss_len = ieee80211_eht_mcs_nss_size(&he, &fixed, for_ap); ppet_len = ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0], fixed.phy_cap_info); ie_len = 2 + 1 + sizeof(eht_cap->eht_cap_elem) + mcs_nss_len + ppet_len; if (skb_tailroom(skb) < ie_len) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_EXTENSION); skb_put_u8(skb, ie_len - 2); skb_put_u8(skb, WLAN_EID_EXT_EHT_CAPABILITY); skb_put_data(skb, &fixed, sizeof(fixed)); if (mcs_nss_len == 4 && orig_mcs_nss_len != 4) { /* * If the (non-AP) STA became 20 MHz only, then convert from * <=80 to 20-MHz-only format, where MCSes are indicated in * the groups 0-7, 8-9, 10-11, 12-13 rather than just 0-9, * 10-11, 12-13. Thus, use 0-9 for 0-7 and 8-9. */ skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss); skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss); skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs11_max_nss); skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs13_max_nss); } else { skb_put_data(skb, &eht_cap->eht_mcs_nss_supp, mcs_nss_len); } if (ppet_len) skb_put_data(skb, &eht_cap->eht_ppe_thres, ppet_len); return 0; } const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode) { static const char * const modes[] = { [IEEE80211_CONN_MODE_S1G] = "S1G", [IEEE80211_CONN_MODE_LEGACY] = "legacy", [IEEE80211_CONN_MODE_HT] = "HT", [IEEE80211_CONN_MODE_VHT] = "VHT", [IEEE80211_CONN_MODE_HE] = "HE", [IEEE80211_CONN_MODE_EHT] = "EHT", }; if (WARN_ON(mode >= ARRAY_SIZE(modes))) return "<out of range>"; return modes[mode] ?: "<missing string>"; } enum ieee80211_conn_bw_limit ieee80211_min_bw_limit_from_chandef(struct cfg80211_chan_def *chandef) { switch (chandef->width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: return IEEE80211_CONN_BW_LIMIT_20; case NL80211_CHAN_WIDTH_40: return IEEE80211_CONN_BW_LIMIT_40; case NL80211_CHAN_WIDTH_80: return IEEE80211_CONN_BW_LIMIT_80; case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_160: return IEEE80211_CONN_BW_LIMIT_160; case NL80211_CHAN_WIDTH_320: return IEEE80211_CONN_BW_LIMIT_320; default: WARN(1, "unhandled chandef width %d\n", chandef->width); return IEEE80211_CONN_BW_LIMIT_20; } } void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe) { for (int i = 0; i < 2; i++) { tpe->max_local[i].valid = false; memset(tpe->max_local[i].power, IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT, sizeof(tpe->max_local[i].power)); tpe->max_reg_client[i].valid = false; memset(tpe->max_reg_client[i].power, IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT, sizeof(tpe->max_reg_client[i].power)); tpe->psd_local[i].valid = false; memset(tpe->psd_local[i].power, IEEE80211_TPE_PSD_NO_LIMIT, sizeof(tpe->psd_local[i].power)); tpe->psd_reg_client[i].valid = false; memset(tpe->psd_reg_client[i].power, IEEE80211_TPE_PSD_NO_LIMIT, sizeof(tpe->psd_reg_client[i].power)); } }
65 11 5 5 11 11 11 11 11 11 11 11 6 9 9 5 5 5 5 5 5 5 5 5 1 2 11 11 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 // SPDX-License-Identifier: GPL-2.0-only /* * Generic show_mem() implementation * * Copyright (C) 2008 Johannes Weiner <hannes@saeurebad.de> */ #include <linux/blkdev.h> #include <linux/cma.h> #include <linux/cpuset.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/swap.h> #include <linux/vmstat.h> #include "internal.h" #include "swap.h" atomic_long_t _totalram_pages __read_mostly; EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; static inline void show_node(struct zone *zone) { if (IS_ENABLED(CONFIG_NUMA)) printk("Node %d ", zone_to_nid(zone)); } long si_mem_available(void) { long available; unsigned long pagecache; unsigned long wmark_low = 0; unsigned long reclaimable; struct zone *zone; for_each_zone(zone) wmark_low += low_wmark_pages(zone); /* * Estimate the amount of memory available for userspace allocations, * without causing swapping or OOM. */ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will * start swapping or thrashing. Assume at least half of the page * cache, or the low watermark worth of cache, needs to stay. */ pagecache = global_node_page_state(NR_ACTIVE_FILE) + global_node_page_state(NR_INACTIVE_FILE); pagecache -= min(pagecache / 2, wmark_low); available += pagecache; /* * Part of the reclaimable slab and other kernel memory consists of * items that are in use, and cannot be freed. Cap this estimate at the * low watermark. */ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); reclaimable -= min(reclaimable / 2, wmark_low); available += reclaimable; if (available < 0) available = 0; return available; } EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages(); val->sharedram = global_node_page_state(NR_SHMEM); val->freeram = global_zone_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages(); val->freehigh = nr_free_highpages(); val->mem_unit = PAGE_SIZE; } EXPORT_SYMBOL(si_meminfo); #ifdef CONFIG_NUMA void si_meminfo_node(struct sysinfo *val, int nid) { int zone_type; /* needs to be signed */ unsigned long managed_pages = 0; unsigned long managed_highpages = 0; unsigned long free_highpages = 0; pg_data_t *pgdat = NODE_DATA(nid); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); val->totalram = managed_pages; val->sharedram = node_page_state(pgdat, NR_SHMEM); val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; if (is_highmem(zone)) { managed_highpages += zone_managed_pages(zone); free_highpages += zone_page_state(zone, NR_FREE_PAGES); } } val->totalhigh = managed_highpages; val->freehigh = free_highpages; #else val->totalhigh = managed_highpages; val->freehigh = free_highpages; #endif val->mem_unit = PAGE_SIZE; } #endif /* * Determine whether the node should be displayed or not, depending on whether * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). */ static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) { if (!(flags & SHOW_MEM_FILTER_NODES)) return false; /* * no node mask - aka implicit memory numa policy. Do not bother with * the synchronization - read_mems_allowed_begin - because we do not * have to be precise here. */ if (!nodemask) nodemask = &cpuset_current_mems_allowed; return !node_isset(nid, *nodemask); } static void show_migration_types(unsigned char type) { static const char types[MIGRATE_TYPES] = { [MIGRATE_UNMOVABLE] = 'U', [MIGRATE_MOVABLE] = 'M', [MIGRATE_RECLAIMABLE] = 'E', [MIGRATE_HIGHATOMIC] = 'H', #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C', #endif #ifdef CONFIG_MEMORY_ISOLATION [MIGRATE_ISOLATE] = 'I', #endif }; char tmp[MIGRATE_TYPES + 1]; char *p = tmp; int i; for (i = 0; i < MIGRATE_TYPES; i++) { if (type & (1 << i)) *p++ = types[i]; } *p = '\0'; printk(KERN_CONT "(%s) ", tmp); } static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) { int zone_idx; for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++) if (zone_managed_pages(pgdat->node_zones + zone_idx)) return true; return false; } /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the * memory on each free list with the exception of the first item on the list. * * Bits in @filter: * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. */ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long free_pcp = 0; int cpu, nid; struct zone *zone; pg_data_t *pgdat; for_each_populated_zone(zone) { if (zone_idx(zone) > max_zone_idx) continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; for_each_online_cpu(cpu) free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; } printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu\n" " sec_pagetables:%lu bounce:%lu\n" " kernel_misc_reclaimable:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", global_node_page_state(NR_ACTIVE_ANON), global_node_page_state(NR_INACTIVE_ANON), global_node_page_state(NR_ISOLATED_ANON), global_node_page_state(NR_ACTIVE_FILE), global_node_page_state(NR_INACTIVE_FILE), global_node_page_state(NR_ISOLATED_FILE), global_node_page_state(NR_UNEVICTABLE), global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), global_node_page_state(NR_SECONDARY_PAGETABLE), global_zone_page_state(NR_BOUNCE), global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), free_pcp, global_zone_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) continue; if (!node_has_managed_zones(pgdat, max_zone_idx)) continue; printk("Node %d" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" " isolated(anon):%lukB" " isolated(file):%lukB" " mapped:%lukB" " dirty:%lukB" " writeback:%lukB" " shmem:%lukB" #ifdef CONFIG_TRANSPARENT_HUGEPAGE " shmem_thp:%lukB" " shmem_pmdmapped:%lukB" " anon_thp:%lukB" #endif " writeback_tmp:%lukB" " kernel_stack:%lukB" #ifdef CONFIG_SHADOW_CALL_STACK " shadow_call_stack:%lukB" #endif " pagetables:%lukB" " sec_pagetables:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, K(node_page_state(pgdat, NR_ACTIVE_ANON)), K(node_page_state(pgdat, NR_INACTIVE_ANON)), K(node_page_state(pgdat, NR_ACTIVE_FILE)), K(node_page_state(pgdat, NR_INACTIVE_FILE)), K(node_page_state(pgdat, NR_UNEVICTABLE)), K(node_page_state(pgdat, NR_ISOLATED_ANON)), K(node_page_state(pgdat, NR_ISOLATED_FILE)), K(node_page_state(pgdat, NR_FILE_MAPPED)), K(node_page_state(pgdat, NR_FILE_DIRTY)), K(node_page_state(pgdat, NR_WRITEBACK)), K(node_page_state(pgdat, NR_SHMEM)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE K(node_page_state(pgdat, NR_SHMEM_THPS)), K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), K(node_page_state(pgdat, NR_ANON_THPS)), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), node_page_state(pgdat, NR_KERNEL_STACK_KB), #ifdef CONFIG_SHADOW_CALL_STACK node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif K(node_page_state(pgdat, NR_PAGETABLE)), K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)); } for_each_populated_zone(zone) { int i; if (zone_idx(zone) > max_zone_idx) continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; free_pcp = 0; for_each_online_cpu(cpu) free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; show_node(zone); printk(KERN_CONT "%s" " free:%lukB" " boost:%lukB" " min:%lukB" " low:%lukB" " high:%lukB" " reserved_highatomic:%luKB" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" " writepending:%lukB" " present:%lukB" " managed:%lukB" " mlocked:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" " free_cma:%lukB" "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), K(zone->watermark_boost), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), K(zone->nr_reserved_highatomic), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->per_cpu_pageset->count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); printk(KERN_CONT "\n"); } for_each_populated_zone(zone) { unsigned int order; unsigned long nr[NR_PAGE_ORDERS], flags, total = 0; unsigned char types[NR_PAGE_ORDERS]; if (zone_idx(zone) > max_zone_idx) continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; show_node(zone); printk(KERN_CONT "%s: ", zone->name); spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &zone->free_area[order]; int type; nr[order] = area->nr_free; total += nr[order] << order; types[order] = 0; for (type = 0; type < MIGRATE_TYPES; type++) { if (!free_area_empty(area, type)) types[order] |= 1 << type; } } spin_unlock_irqrestore(&zone->lock, flags); for (order = 0; order < NR_PAGE_ORDERS; order++) { printk(KERN_CONT "%lu*%lukB ", nr[order], K(1UL) << order); if (nr[order]) show_migration_types(types[order]); } printk(KERN_CONT "= %lukB\n", K(total)); } for_each_online_node(nid) { if (show_mem_node_skip(filter, nid, nodemask)) continue; hugetlb_show_meminfo_node(nid); } printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); show_swap_cache_info(); } void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long total = 0, reserved = 0, highmem = 0; struct zone *zone; printk("Mem-Info:\n"); show_free_areas(filter, nodemask, max_zone_idx); for_each_populated_zone(zone) { total += zone->present_pages; reserved += zone->present_pages - zone_managed_pages(zone); if (is_highmem(zone)) highmem += zone->present_pages; } printk("%lu pages RAM\n", total); printk("%lu pages HighMem/MovableOnly\n", highmem); printk("%lu pages reserved\n", reserved); #ifdef CONFIG_CMA printk("%lu pages cma reserved\n", totalcma_pages); #endif #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif #ifdef CONFIG_MEM_ALLOC_PROFILING { struct codetag_bytes tags[10]; size_t i, nr; nr = alloc_tag_top_users(tags, ARRAY_SIZE(tags), false); if (nr) { pr_notice("Memory allocations:\n"); for (i = 0; i < nr; i++) { struct codetag *ct = tags[i].ct; struct alloc_tag *tag = ct_to_alloc_tag(ct); struct alloc_tag_counters counter = alloc_tag_read(tag); char bytes[10]; string_get_size(counter.bytes, 1, STRING_UNITS_2, bytes, sizeof(bytes)); /* Same as alloc_tag_to_text() but w/o intermediate buffer */ if (ct->modname) pr_notice("%12s %8llu %s:%u [%s] func:%s\n", bytes, counter.calls, ct->filename, ct->lineno, ct->modname, ct->function); else pr_notice("%12s %8llu %s:%u func:%s\n", bytes, counter.calls, ct->filename, ct->lineno, ct->function); } } } #endif }
28 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 /* * net/tipc/core.c: TIPC module code * * Copyright (c) 2003-2006, 2013, Ericsson AB * Copyright (c) 2005-2006, 2010-2013, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "name_table.h" #include "subscr.h" #include "bearer.h" #include "net.h" #include "socket.h" #include "bcast.h" #include "node.h" #include "crypto.h" #include <linux/module.h> /* configurable TIPC parameters */ unsigned int tipc_net_id __read_mostly; int sysctl_tipc_rmem[3] __read_mostly; /* min/default/max */ static int __net_init tipc_init_net(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); int err; tn->net_id = 4711; tn->node_addr = 0; tn->trial_addr = 0; tn->addr_trial_end = 0; tn->capabilities = TIPC_NODE_CAPABILITIES; INIT_WORK(&tn->work, tipc_net_finalize_work); memset(tn->node_id, 0, sizeof(tn->node_id)); memset(tn->node_id_string, 0, sizeof(tn->node_id_string)); tn->mon_threshold = TIPC_DEF_MON_THRESHOLD; get_random_bytes(&tn->random, sizeof(int)); INIT_LIST_HEAD(&tn->node_list); spin_lock_init(&tn->node_list_lock); #ifdef CONFIG_TIPC_CRYPTO err = tipc_crypto_start(&tn->crypto_tx, net, NULL); if (err) goto out_crypto; #endif err = tipc_sk_rht_init(net); if (err) goto out_sk_rht; err = tipc_nametbl_init(net); if (err) goto out_nametbl; err = tipc_bcast_init(net); if (err) goto out_bclink; err = tipc_attach_loopback(net); if (err) goto out_bclink; return 0; out_bclink: tipc_nametbl_stop(net); out_nametbl: tipc_sk_rht_destroy(net); out_sk_rht: #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_stop(&tn->crypto_tx); out_crypto: #endif return err; } static void __net_exit tipc_exit_net(struct net *net) { struct tipc_net *tn = tipc_net(net); tipc_detach_loopback(net); tipc_net_stop(net); /* Make sure the tipc_net_finalize_work() finished */ cancel_work_sync(&tn->work); tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_stop(&tipc_net(net)->crypto_tx); #endif while (atomic_read(&tn->wq_count)) cond_resched(); } static void __net_exit tipc_pernet_pre_exit(struct net *net) { tipc_node_pre_cleanup_net(net); } static struct pernet_operations tipc_pernet_pre_exit_ops = { .pre_exit = tipc_pernet_pre_exit, }; static struct pernet_operations tipc_net_ops = { .init = tipc_init_net, .exit = tipc_exit_net, .id = &tipc_net_id, .size = sizeof(struct tipc_net), }; static struct pernet_operations tipc_topsrv_net_ops = { .init = tipc_topsrv_init_net, .exit = tipc_topsrv_exit_net, }; static int __init tipc_init(void) { int err; pr_info("Activated (version " TIPC_MOD_VER ")\n"); sysctl_tipc_rmem[0] = RCVBUF_MIN; sysctl_tipc_rmem[1] = RCVBUF_DEF; sysctl_tipc_rmem[2] = RCVBUF_MAX; err = tipc_register_sysctl(); if (err) goto out_sysctl; err = register_pernet_device(&tipc_net_ops); if (err) goto out_pernet; err = tipc_socket_init(); if (err) goto out_socket; err = register_pernet_device(&tipc_topsrv_net_ops); if (err) goto out_pernet_topsrv; err = register_pernet_subsys(&tipc_pernet_pre_exit_ops); if (err) goto out_register_pernet_subsys; err = tipc_bearer_setup(); if (err) goto out_bearer; err = tipc_netlink_start(); if (err) goto out_netlink; err = tipc_netlink_compat_start(); if (err) goto out_netlink_compat; pr_info("Started in single node mode\n"); return 0; out_netlink_compat: tipc_netlink_stop(); out_netlink: tipc_bearer_cleanup(); out_bearer: unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); out_register_pernet_subsys: unregister_pernet_device(&tipc_topsrv_net_ops); out_pernet_topsrv: tipc_socket_stop(); out_socket: unregister_pernet_device(&tipc_net_ops); out_pernet: tipc_unregister_sysctl(); out_sysctl: pr_err("Unable to start in single node mode\n"); return err; } static void __exit tipc_exit(void) { tipc_netlink_compat_stop(); tipc_netlink_stop(); tipc_bearer_cleanup(); unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); unregister_pernet_device(&tipc_topsrv_net_ops); tipc_socket_stop(); unregister_pernet_device(&tipc_net_ops); tipc_unregister_sysctl(); pr_info("Deactivated\n"); } module_init(tipc_init); module_exit(tipc_exit); MODULE_DESCRIPTION("TIPC: Transparent Inter Process Communication"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(TIPC_MOD_VER);
11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 // SPDX-License-Identifier: GPL-2.0 /* * seq_buf.c * * Copyright (C) 2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> * * The seq_buf is a handy tool that allows you to pass a descriptor around * to a buffer that other functions can write to. It is similar to the * seq_file functionality but has some differences. * * To use it, the seq_buf must be initialized with seq_buf_init(). * This will set up the counters within the descriptor. You can call * seq_buf_init() more than once to reset the seq_buf to start * from scratch. */ #include <linux/bug.h> #include <linux/err.h> #include <linux/export.h> #include <linux/hex.h> #include <linux/minmax.h> #include <linux/printk.h> #include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/sprintf.h> #include <linux/string.h> #include <linux/types.h> #include <linux/uaccess.h> /** * seq_buf_can_fit - can the new data fit in the current buffer? * @s: the seq_buf descriptor * @len: The length to see if it can fit in the current buffer * * Returns: true if there's enough unused space in the seq_buf buffer * to fit the amount of new data according to @len. */ static bool seq_buf_can_fit(struct seq_buf *s, size_t len) { return s->len + len <= s->size; } /** * seq_buf_print_seq - move the contents of seq_buf into a seq_file * @m: the seq_file descriptor that is the destination * @s: the seq_buf descriptor that is the source. * * Returns: zero on success, non-zero otherwise. */ int seq_buf_print_seq(struct seq_file *m, struct seq_buf *s) { unsigned int len = seq_buf_used(s); return seq_write(m, s->buffer, len); } /** * seq_buf_vprintf - sequence printing of information. * @s: seq_buf descriptor * @fmt: printf format string * @args: va_list of arguments from a printf() type function * * Writes a vnprintf() format into the sequence buffer. * * Returns: zero on success, -1 on overflow. */ int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args) { int len; WARN_ON(s->size == 0); if (s->len < s->size) { len = vsnprintf(s->buffer + s->len, s->size - s->len, fmt, args); if (s->len + len < s->size) { s->len += len; return 0; } } seq_buf_set_overflow(s); return -1; } /** * seq_buf_printf - sequence printing of information * @s: seq_buf descriptor * @fmt: printf format string * * Writes a printf() format into the sequence buffer. * * Returns: zero on success, -1 on overflow. */ int seq_buf_printf(struct seq_buf *s, const char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); ret = seq_buf_vprintf(s, fmt, ap); va_end(ap); return ret; } EXPORT_SYMBOL_GPL(seq_buf_printf); /** * seq_buf_do_printk - printk() seq_buf line by line * @s: seq_buf descriptor * @lvl: printk level * * printk()-s a multi-line sequential buffer line by line. The function * makes sure that the buffer in @s is NUL-terminated and safe to read * as a string. */ void seq_buf_do_printk(struct seq_buf *s, const char *lvl) { const char *start, *lf; if (s->size == 0 || s->len == 0) return; start = seq_buf_str(s); while ((lf = strchr(start, '\n'))) { int len = lf - start + 1; printk("%s%.*s", lvl, len, start); start = ++lf; } /* No trailing LF */ if (start < s->buffer + s->len) printk("%s%s\n", lvl, start); } EXPORT_SYMBOL_GPL(seq_buf_do_printk); #ifdef CONFIG_BINARY_PRINTF /** * seq_buf_bprintf - Write the printf string from binary arguments * @s: seq_buf descriptor * @fmt: The format string for the @binary arguments * @binary: The binary arguments for @fmt. * * When recording in a fast path, a printf may be recorded with just * saving the format and the arguments as they were passed to the * function, instead of wasting cycles converting the arguments into * ASCII characters. Instead, the arguments are saved in a 32 bit * word array that is defined by the format string constraints. * * This function will take the format and the binary array and finish * the conversion into the ASCII string within the buffer. * * Returns: zero on success, -1 on overflow. */ int seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary) { unsigned int len = seq_buf_buffer_left(s); int ret; WARN_ON(s->size == 0); if (s->len < s->size) { ret = bstr_printf(s->buffer + s->len, len, fmt, binary); if (s->len + ret < s->size) { s->len += ret; return 0; } } seq_buf_set_overflow(s); return -1; } #endif /* CONFIG_BINARY_PRINTF */ /** * seq_buf_puts - sequence printing of simple string * @s: seq_buf descriptor * @str: simple string to record * * Copy a simple string into the sequence buffer. * * Returns: zero on success, -1 on overflow. */ int seq_buf_puts(struct seq_buf *s, const char *str) { size_t len = strlen(str); WARN_ON(s->size == 0); /* Add 1 to len for the trailing null byte which must be there */ len += 1; if (seq_buf_can_fit(s, len)) { memcpy(s->buffer + s->len, str, len); /* Don't count the trailing null byte against the capacity */ s->len += len - 1; return 0; } seq_buf_set_overflow(s); return -1; } EXPORT_SYMBOL_GPL(seq_buf_puts); /** * seq_buf_putc - sequence printing of simple character * @s: seq_buf descriptor * @c: simple character to record * * Copy a single character into the sequence buffer. * * Returns: zero on success, -1 on overflow. */ int seq_buf_putc(struct seq_buf *s, unsigned char c) { WARN_ON(s->size == 0); if (seq_buf_can_fit(s, 1)) { s->buffer[s->len++] = c; return 0; } seq_buf_set_overflow(s); return -1; } EXPORT_SYMBOL_GPL(seq_buf_putc); /** * seq_buf_putmem - write raw data into the sequence buffer * @s: seq_buf descriptor * @mem: The raw memory to copy into the buffer * @len: The length of the raw memory to copy (in bytes) * * There may be cases where raw memory needs to be written into the * buffer and a strcpy() would not work. Using this function allows * for such cases. * * Returns: zero on success, -1 on overflow. */ int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len) { WARN_ON(s->size == 0); if (seq_buf_can_fit(s, len)) { memcpy(s->buffer + s->len, mem, len); s->len += len; return 0; } seq_buf_set_overflow(s); return -1; } #define MAX_MEMHEX_BYTES 8U #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) /** * seq_buf_putmem_hex - write raw memory into the buffer in ASCII hex * @s: seq_buf descriptor * @mem: The raw memory to write its hex ASCII representation of * @len: The length of the raw memory to copy (in bytes) * * This is similar to seq_buf_putmem() except instead of just copying the * raw memory into the buffer it writes its ASCII representation of it * in hex characters. * * Returns: zero on success, -1 on overflow. */ int seq_buf_putmem_hex(struct seq_buf *s, const void *mem, unsigned int len) { unsigned char hex[HEX_CHARS]; const unsigned char *data = mem; unsigned int start_len; int i, j; WARN_ON(s->size == 0); BUILD_BUG_ON(MAX_MEMHEX_BYTES * 2 >= HEX_CHARS); while (len) { start_len = min(len, MAX_MEMHEX_BYTES); #ifdef __BIG_ENDIAN for (i = 0, j = 0; i < start_len; i++) { #else for (i = start_len-1, j = 0; i >= 0; i--) { #endif hex[j++] = hex_asc_hi(data[i]); hex[j++] = hex_asc_lo(data[i]); } if (WARN_ON_ONCE(j == 0 || j/2 > len)) break; /* j increments twice per loop */ hex[j++] = ' '; seq_buf_putmem(s, hex, j); if (seq_buf_has_overflowed(s)) return -1; len -= start_len; data += start_len; } return 0; } /** * seq_buf_path - copy a path into the sequence buffer * @s: seq_buf descriptor * @path: path to write into the sequence buffer. * @esc: set of characters to escape in the output * * Write a path name into the sequence buffer. * * Returns: the number of written bytes on success, -1 on overflow. */ int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc) { char *buf; size_t size = seq_buf_get_buf(s, &buf); int res = -1; WARN_ON(s->size == 0); if (size) { char *p = d_path(path, buf, size); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; } } seq_buf_commit(s, res); return res; } /** * seq_buf_to_user - copy the sequence buffer to user space * @s: seq_buf descriptor * @ubuf: The userspace memory location to copy to * @start: The first byte in the buffer to copy * @cnt: The amount to copy * * Copies the sequence buffer into the userspace memory pointed to * by @ubuf. It starts from @start and writes up to @cnt characters * or until it reaches the end of the content in the buffer (@s->len), * whichever comes first. * * Returns: * On success, it returns a positive number of the number of bytes * it copied. * * On failure it returns -EBUSY if all of the content in the * sequence has been already read, which includes nothing in the * sequence (@s->len == @start). * * Returns -EFAULT if the copy to userspace fails. */ int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, size_t start, int cnt) { int len; int ret; if (!cnt) return 0; len = seq_buf_used(s); if (len <= start) return -EBUSY; len -= start; if (cnt > len) cnt = len; ret = copy_to_user(ubuf, s->buffer + start, cnt); if (ret == cnt) return -EFAULT; return cnt - ret; } /** * seq_buf_hex_dump - print formatted hex dump into the sequence buffer * @s: seq_buf descriptor * @prefix_str: string to prefix each line with; * caller supplies trailing spaces for alignment if desired * @prefix_type: controls whether prefix of an offset, address, or none * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) * @rowsize: number of bytes to print per line; must be 16 or 32 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) * @buf: data blob to dump * @len: number of bytes in the @buf * @ascii: include ASCII after the hex output * * Function is an analogue of print_hex_dump() and thus has similar interface. * * linebuf size is maximal length for one line. * 32 * 3 - maximum bytes per line, each printed into 2 chars + 1 for * separating space * 2 - spaces separating hex dump and ASCII representation * 32 - ASCII representation * 1 - terminating '\0' * * Returns: zero on success, -1 on overflow. */ int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { const u8 *ptr = buf; int i, linelen, remaining = len; unsigned char linebuf[32 * 3 + 2 + 32 + 1]; int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len; i += rowsize) { linelen = min(remaining, rowsize); remaining -= rowsize; hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, linebuf, sizeof(linebuf), ascii); switch (prefix_type) { case DUMP_PREFIX_ADDRESS: ret = seq_buf_printf(s, "%s%p: %s\n", prefix_str, ptr + i, linebuf); break; case DUMP_PREFIX_OFFSET: ret = seq_buf_printf(s, "%s%.8x: %s\n", prefix_str, i, linebuf); break; default: ret = seq_buf_printf(s, "%s%s\n", prefix_str, linebuf); break; } if (ret) return ret; } return 0; }
15 15 15 8 21 17 3 12 9 21 21 19 2 40 7 41 41 9 7 9 9 6 9 5 5 1 1 9 9 5 5 1 2 1 2 1 1 4 4 7 7 7 7 1 7 3 3 3 3 3 3 3 62 62 39 40 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 // SPDX-License-Identifier: GPL-2.0 #include <linux/jhash.h> #include <linux/netfilter.h> #include <linux/rcupdate.h> #include <linux/rhashtable.h> #include <linux/vmalloc.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" struct ila_xlat_params { struct ila_params ip; int ifindex; }; struct ila_map { struct ila_xlat_params xp; struct rhash_head node; struct ila_map __rcu *next; struct rcu_head rcu; }; #define MAX_LOCKS 1024 #define LOCKS_PER_CPU 10 static int alloc_ila_locks(struct ila_net *ilan) { return alloc_bucket_spinlocks(&ilan->xlat.locks, &ilan->xlat.locks_mask, MAX_LOCKS, LOCKS_PER_CPU, GFP_KERNEL); } static u32 hashrnd __read_mostly; static __always_inline void __ila_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); } static inline u32 ila_locator_hash(struct ila_locator loc) { u32 *v = (u32 *)loc.v32; __ila_hash_secret_init(); return jhash_2words(v[0], v[1], hashrnd); } static inline spinlock_t *ila_get_lock(struct ila_net *ilan, struct ila_locator loc) { return &ilan->xlat.locks[ila_locator_hash(loc) & ilan->xlat.locks_mask]; } static inline int ila_cmp_wildcards(struct ila_map *ila, struct ila_addr *iaddr, int ifindex) { return (ila->xp.ifindex && ila->xp.ifindex != ifindex); } static inline int ila_cmp_params(struct ila_map *ila, struct ila_xlat_params *xp) { return (ila->xp.ifindex != xp->ifindex); } static int ila_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { const struct ila_map *ila = obj; return (ila->xp.ip.locator_match.v64 != *(__be64 *)arg->key); } static inline int ila_order(struct ila_map *ila) { int score = 0; if (ila->xp.ifindex) score += 1 << 1; return score; } static const struct rhashtable_params rht_params = { .nelem_hint = 1024, .head_offset = offsetof(struct ila_map, node), .key_offset = offsetof(struct ila_map, xp.ip.locator_match), .key_len = sizeof(u64), /* identifier */ .max_size = 1048576, .min_size = 256, .automatic_shrinking = true, .obj_cmpfn = ila_cmpfn, }; static int parse_nl_config(struct genl_info *info, struct ila_xlat_params *xp) { memset(xp, 0, sizeof(*xp)); if (info->attrs[ILA_ATTR_LOCATOR]) xp->ip.locator.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR]); if (info->attrs[ILA_ATTR_LOCATOR_MATCH]) xp->ip.locator_match.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR_MATCH]); xp->ip.csum_mode = nla_get_u8_default(info->attrs[ILA_ATTR_CSUM_MODE], ILA_CSUM_NO_ACTION); xp->ip.ident_type = nla_get_u8_default(info->attrs[ILA_ATTR_IDENT_TYPE], ILA_ATYPE_USE_FORMAT); if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); return 0; } /* Must be called with rcu readlock */ static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr, int ifindex, struct ila_net *ilan) { struct ila_map *ila; ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &iaddr->loc, rht_params); while (ila) { if (!ila_cmp_wildcards(ila, iaddr, ifindex)) return ila; ila = rcu_access_pointer(ila->next); } return NULL; } /* Must be called with rcu readlock */ static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp, struct ila_net *ilan) { struct ila_map *ila; ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); while (ila) { if (!ila_cmp_params(ila, xp)) return ila; ila = rcu_access_pointer(ila->next); } return NULL; } static inline void ila_release(struct ila_map *ila) { kfree_rcu(ila, rcu); } static void ila_free_node(struct ila_map *ila) { struct ila_map *next; /* Assume rcu_readlock held */ while (ila) { next = rcu_access_pointer(ila->next); ila_release(ila); ila = next; } } static void ila_free_cb(void *ptr, void *arg) { ila_free_node((struct ila_map *)ptr); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila); static unsigned int ila_nf_input(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { ila_xlat_addr(skb, false); return NF_ACCEPT; } static const struct nf_hook_ops ila_nf_hook_ops[] = { { .hook = ila_nf_input, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = -1, }, }; static DEFINE_MUTEX(ila_mutex); static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head; spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; if (!READ_ONCE(ilan->xlat.hooks_registered)) { /* We defer registering net hooks in the namespace until the * first mapping is added. */ mutex_lock(&ila_mutex); if (!ilan->xlat.hooks_registered) { err = nf_register_net_hooks(net, ila_nf_hook_ops, ARRAY_SIZE(ila_nf_hook_ops)); if (!err) WRITE_ONCE(ilan->xlat.hooks_registered, true); } mutex_unlock(&ila_mutex); if (err) return err; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); if (!ila) return -ENOMEM; ila_init_saved_csum(&xp->ip); ila->xp = *xp; order = ila_order(ila); spin_lock(lock); head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); if (!head) { /* New entry for the rhash_table */ err = rhashtable_lookup_insert_fast(&ilan->xlat.rhash_table, &ila->node, rht_params); } else { struct ila_map *tila = head, *prev = NULL; do { if (!ila_cmp_params(tila, xp)) { err = -EEXIST; goto out; } if (order > ila_order(tila)) break; prev = tila; tila = rcu_dereference_protected(tila->next, lockdep_is_held(lock)); } while (tila); if (prev) { /* Insert in sub list of head */ RCU_INIT_POINTER(ila->next, tila); rcu_assign_pointer(prev->next, ila); } else { /* Make this ila new head */ RCU_INIT_POINTER(ila->next, head); err = rhashtable_replace_fast(&ilan->xlat.rhash_table, &head->node, &ila->node, rht_params); if (err) goto out; } } out: spin_unlock(lock); if (err) kfree(ila); return err; } static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head, *prev; spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = -ENOENT; spin_lock(lock); head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); ila = head; prev = NULL; while (ila) { if (ila_cmp_params(ila, xp)) { prev = ila; ila = rcu_dereference_protected(ila->next, lockdep_is_held(lock)); continue; } err = 0; if (prev) { /* Not head, just delete from list */ rcu_assign_pointer(prev->next, ila->next); } else { /* It is the head. If there is something in the * sublist we need to make a new head. */ head = rcu_dereference_protected(ila->next, lockdep_is_held(lock)); if (head) { /* Put first entry in the sublist into the * table */ err = rhashtable_replace_fast( &ilan->xlat.rhash_table, &ila->node, &head->node, rht_params); if (err) goto out; } else { /* Entry no longer used */ err = rhashtable_remove_fast( &ilan->xlat.rhash_table, &ila->node, rht_params); } } ila_release(ila); break; } out: spin_unlock(lock); return err; } int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params p; int err; err = parse_nl_config(info, &p); if (err) return err; return ila_add_mapping(net, &p); } int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params xp; int err; err = parse_nl_config(info, &xp); if (err) return err; ila_del_mapping(net, &xp); return 0; } static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan, struct ila_map *ila) { return ila_get_lock(ilan, ila->xp.ip.locator_match); } int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); struct rhashtable_iter iter; struct ila_map *ila; spinlock_t *lock; int ret = 0; rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter); rhashtable_walk_start(&iter); for (;;) { ila = rhashtable_walk_next(&iter); if (IS_ERR(ila)) { if (PTR_ERR(ila) == -EAGAIN) continue; ret = PTR_ERR(ila); goto done; } else if (!ila) { break; } lock = lock_from_ila_map(ilan, ila); spin_lock(lock); ret = rhashtable_remove_fast(&ilan->xlat.rhash_table, &ila->node, rht_params); if (!ret) ila_free_node(ila); spin_unlock(lock); if (ret) break; } done: rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); return ret; } static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) { if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR, (__force u64)ila->xp.ip.locator.v64, ILA_ATTR_PAD) || nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR_MATCH, (__force u64)ila->xp.ip.locator_match.v64, ILA_ATTR_PAD) || nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode) || nla_put_u8(msg, ILA_ATTR_IDENT_TYPE, ila->xp.ip.ident_type)) return -1; return 0; } static int ila_dump_info(struct ila_map *ila, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &ila_nl_family, flags, cmd); if (!hdr) return -ENOMEM; if (ila_fill_info(ila, skb) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); struct sk_buff *msg; struct ila_xlat_params xp; struct ila_map *ila; int ret; ret = parse_nl_config(info, &xp); if (ret) return ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rcu_read_lock(); ret = -ESRCH; ila = ila_lookup_by_params(&xp, ilan); if (ila) { ret = ila_dump_info(ila, info->snd_portid, info->snd_seq, 0, msg, info->genlhdr->cmd); } rcu_read_unlock(); if (ret < 0) goto out_free; return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); return ret; } struct ila_dump_iter { struct rhashtable_iter rhiter; int skip; }; int ila_xlat_nl_dump_start(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_dump_iter *iter; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter->rhiter); iter->skip = 0; cb->args[0] = (long)iter; return 0; } int ila_xlat_nl_dump_done(struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; rhashtable_walk_exit(&iter->rhiter); kfree(iter); return 0; } int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; struct rhashtable_iter *rhiter = &iter->rhiter; int skip = iter->skip; struct ila_map *ila; int ret; rhashtable_walk_start(rhiter); /* Get first entry */ ila = rhashtable_walk_peek(rhiter); if (ila && !IS_ERR(ila) && skip) { /* Skip over visited entries */ while (ila && skip) { /* Skip over any ila entries in this list that we * have already dumped. */ ila = rcu_access_pointer(ila->next); skip--; } } skip = 0; for (;;) { if (IS_ERR(ila)) { ret = PTR_ERR(ila); if (ret == -EAGAIN) { /* Table has changed and iter has reset. Return * -EAGAIN to the application even if we have * written data to the skb. The application * needs to deal with this. */ goto out_ret; } else { break; } } else if (!ila) { ret = 0; break; } while (ila) { ret = ila_dump_info(ila, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, ILA_CMD_GET); if (ret) goto out; skip++; ila = rcu_access_pointer(ila->next); } skip = 0; ila = rhashtable_walk_next(rhiter); } out: iter->skip = skip; ret = (skb->len ? : ret); out_ret: rhashtable_walk_stop(rhiter); return ret; } int ila_xlat_init_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); int err; err = alloc_ila_locks(ilan); if (err) return err; err = rhashtable_init(&ilan->xlat.rhash_table, &rht_params); if (err) { free_bucket_spinlocks(ilan->xlat.locks); return err; } return 0; } void ila_xlat_pre_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); if (ilan->xlat.hooks_registered) nf_unregister_net_hooks(net, ila_nf_hook_ops, ARRAY_SIZE(ila_nf_hook_ops)); } void ila_xlat_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL); free_bucket_spinlocks(ilan->xlat.locks); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) { struct ila_map *ila; struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); /* Assumes skb contains a valid IPv6 header that is pulled */ /* No check here that ILA type in the mapping matches what is in the * address. We assume that whatever sender gaves us can be translated. * The checksum mode however is relevant. */ rcu_read_lock(); ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan); if (ila) ila_update_ipv6_locator(skb, &ila->xp.ip, sir2ila); rcu_read_unlock(); return 0; }
53 53 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 // SPDX-License-Identifier: GPL-2.0-or-later /* * cgroups support for the BFQ I/O scheduler. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/cgroup.h> #include <linux/ktime.h> #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/sbitmap.h> #include <linux/delay.h> #include "elevator.h" #include "bfq-iosched.h" #ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp) { int ret; ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); if (ret) return ret; atomic64_set(&stat->aux_cnt, 0); return 0; } static void bfq_stat_exit(struct bfq_stat *stat) { percpu_counter_destroy(&stat->cpu_cnt); } /** * bfq_stat_add - add a value to a bfq_stat * @stat: target bfq_stat * @val: value to add * * Add @val to @stat. The caller must ensure that IRQ on the same CPU * don't re-enter this function for the same counter. */ static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val) { percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); } /** * bfq_stat_read - read the current value of a bfq_stat * @stat: bfq_stat to read */ static inline uint64_t bfq_stat_read(struct bfq_stat *stat) { return percpu_counter_sum_positive(&stat->cpu_cnt); } /** * bfq_stat_reset - reset a bfq_stat * @stat: bfq_stat to reset */ static inline void bfq_stat_reset(struct bfq_stat *stat) { percpu_counter_set(&stat->cpu_cnt, 0); atomic64_set(&stat->aux_cnt, 0); } /** * bfq_stat_add_aux - add a bfq_stat into another's aux count * @to: the destination bfq_stat * @from: the source * * Add @from's count including the aux one to @to's aux count. */ static inline void bfq_stat_add_aux(struct bfq_stat *to, struct bfq_stat *from) { atomic64_add(bfq_stat_read(from) + atomic64_read(&from->aux_cnt), &to->aux_cnt); } /** * blkg_prfill_stat - prfill callback for bfq_stat * @sf: seq_file to print to * @pd: policy private data of interest * @off: offset to the bfq_stat in @pd * * prfill callback for printing a bfq_stat. */ static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off) { return __blkg_prfill_u64(sf, pd, bfq_stat_read((void *)pd + off)); } /* bfqg stats flags */ enum bfqg_stats_flags { BFQG_stats_waiting = 0, BFQG_stats_idling, BFQG_stats_empty, }; #define BFQG_FLAG_FNS(name) \ static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ { \ stats->flags |= (1 << BFQG_stats_##name); \ } \ static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ { \ stats->flags &= ~(1 << BFQG_stats_##name); \ } \ static int bfqg_stats_##name(struct bfqg_stats *stats) \ { \ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ } \ BFQG_FLAG_FNS(waiting) BFQG_FLAG_FNS(idling) BFQG_FLAG_FNS(empty) #undef BFQG_FLAG_FNS /* This should be called with the scheduler lock held. */ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) { u64 now; if (!bfqg_stats_waiting(stats)) return; now = blk_time_get_ns(); if (now > stats->start_group_wait_time) bfq_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); bfqg_stats_clear_waiting(stats); } /* This should be called with the scheduler lock held. */ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, struct bfq_group *curr_bfqg) { struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_waiting(stats)) return; if (bfqg == curr_bfqg) return; stats->start_group_wait_time = blk_time_get_ns(); bfqg_stats_mark_waiting(stats); } /* This should be called with the scheduler lock held. */ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { u64 now; if (!bfqg_stats_empty(stats)) return; now = blk_time_get_ns(); if (now > stats->start_empty_time) bfq_stat_add(&stats->empty_time, now - stats->start_empty_time); bfqg_stats_clear_empty(stats); } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { bfq_stat_add(&bfqg->stats.dequeue, 1); } void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; if (blkg_rwstat_total(&stats->queued)) return; /* * group is already marked empty. This can happen if bfqq got new * request in parent group and moved to this group while being added * to service tree. Just ignore the event and move on. */ if (bfqg_stats_empty(stats)) return; stats->start_empty_time = blk_time_get_ns(); bfqg_stats_mark_empty(stats); } void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_idling(stats)) { u64 now = blk_time_get_ns(); if (now > stats->start_idle_time) bfq_stat_add(&stats->idle_time, now - stats->start_idle_time); bfqg_stats_clear_idling(stats); } } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; stats->start_idle_time = blk_time_get_ns(); bfqg_stats_mark_idling(stats); } void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; bfq_stat_add(&stats->avg_queue_size_sum, blkg_rwstat_total(&stats->queued)); bfq_stat_add(&stats->avg_queue_size_samples, 1); bfqg_stats_update_group_wait_time(stats); } void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, blk_opf_t opf) { blkg_rwstat_add(&bfqg->stats.queued, opf, 1); bfqg_stats_end_empty_time(&bfqg->stats); if (!(bfqq == bfqg->bfqd->in_service_queue)) bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); } void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { blkg_rwstat_add(&bfqg->stats.queued, opf, -1); } void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { blkg_rwstat_add(&bfqg->stats.merged, opf, 1); } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { struct bfqg_stats *stats = &bfqg->stats; u64 now = blk_time_get_ns(); if (now > io_start_time_ns) blkg_rwstat_add(&stats->service_time, opf, now - io_start_time_ns); if (io_start_time_ns > start_time_ns) blkg_rwstat_add(&stats->wait_time, opf, io_start_time_ns - start_time_ns); } #else /* CONFIG_BFQ_CGROUP_DEBUG */ void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } #endif /* CONFIG_BFQ_CGROUP_DEBUG */ #ifdef CONFIG_BFQ_GROUP_IOSCHED /* * blk-cgroup policy-related handlers * The following functions help in converting between blk-cgroup * internal structures and BFQ-specific structures. */ static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct bfq_group, pd) : NULL; } struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) { return pd_to_blkg(&bfqg->pd); } static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) { return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq)); } /* * bfq_group handlers * The following functions help in navigating the bfq_group hierarchy * by allowing to find the parent of a bfq_group or the bfq_group * associated to a bfq_queue. */ static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) { struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; return pblkg ? blkg_to_bfqg(pblkg) : NULL; } struct bfq_group *bfqq_group(struct bfq_queue *bfqq) { struct bfq_entity *group_entity = bfqq->entity.parent; return group_entity ? container_of(group_entity, struct bfq_group, entity) : bfqq->bfqd->root_group; } /* * The following two functions handle get and put of a bfq_group by * wrapping the related blk-cgroup hooks. */ static void bfqg_get(struct bfq_group *bfqg) { refcount_inc(&bfqg->ref); } static void bfqg_put(struct bfq_group *bfqg) { if (refcount_dec_and_test(&bfqg->ref)) kfree(bfqg); } static void bfqg_and_blkg_get(struct bfq_group *bfqg) { /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ bfqg_get(bfqg); blkg_get(bfqg_to_blkg(bfqg)); } void bfqg_and_blkg_put(struct bfq_group *bfqg) { blkg_put(bfqg_to_blkg(bfqg)); bfqg_put(bfqg); } void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq) { struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg); if (!bfqg) return; blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq)); blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1); } /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { #ifdef CONFIG_BFQ_CGROUP_DEBUG /* queued stats shouldn't be cleared */ blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); bfq_stat_reset(&stats->time); bfq_stat_reset(&stats->avg_queue_size_sum); bfq_stat_reset(&stats->avg_queue_size_samples); bfq_stat_reset(&stats->dequeue); bfq_stat_reset(&stats->group_wait_time); bfq_stat_reset(&stats->idle_time); bfq_stat_reset(&stats->empty_time); #endif } /* @to += @from */ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) { if (!to || !from) return; #ifdef CONFIG_BFQ_CGROUP_DEBUG /* queued stats shouldn't be cleared */ blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); bfq_stat_add_aux(&from->time, &from->time); bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); bfq_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); bfq_stat_add_aux(&to->dequeue, &from->dequeue); bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time); bfq_stat_add_aux(&to->idle_time, &from->idle_time); bfq_stat_add_aux(&to->empty_time, &from->empty_time); #endif } /* * Transfer @bfqg's stats to its parent's aux counts so that the ancestors' * recursive stats can still account for the amount used by this bfqg after * it's gone. */ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) { struct bfq_group *parent; if (!bfqg) /* root_group */ return; parent = bfqg_parent(bfqg); lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock); if (unlikely(!parent)) return; bfqg_stats_add_aux(&parent->stats, &bfqg->stats); bfqg_stats_reset(&bfqg->stats); } void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->weight = entity->new_weight; entity->orig_weight = entity->new_weight; if (bfqq) { bfqq->ioprio = bfqq->new_ioprio; bfqq->ioprio_class = bfqq->new_ioprio_class; /* * Make sure that bfqg and its associated blkg do not * disappear before entity. */ bfqg_and_blkg_get(bfqg); } entity->parent = bfqg->my_entity; /* NULL for root group */ entity->sched_data = &bfqg->sched_data; } static void bfqg_stats_exit(struct bfqg_stats *stats) { blkg_rwstat_exit(&stats->bytes); blkg_rwstat_exit(&stats->ios); #ifdef CONFIG_BFQ_CGROUP_DEBUG blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); blkg_rwstat_exit(&stats->queued); bfq_stat_exit(&stats->time); bfq_stat_exit(&stats->avg_queue_size_sum); bfq_stat_exit(&stats->avg_queue_size_samples); bfq_stat_exit(&stats->dequeue); bfq_stat_exit(&stats->group_wait_time); bfq_stat_exit(&stats->idle_time); bfq_stat_exit(&stats->empty_time); #endif } static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { if (blkg_rwstat_init(&stats->bytes, gfp) || blkg_rwstat_init(&stats->ios, gfp)) goto error; #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || bfq_stat_init(&stats->time, gfp) || bfq_stat_init(&stats->avg_queue_size_sum, gfp) || bfq_stat_init(&stats->avg_queue_size_samples, gfp) || bfq_stat_init(&stats->dequeue, gfp) || bfq_stat_init(&stats->group_wait_time, gfp) || bfq_stat_init(&stats->idle_time, gfp) || bfq_stat_init(&stats->empty_time, gfp)) goto error; #endif return 0; error: bfqg_stats_exit(stats); return -ENOMEM; } static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) { return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; } static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) { return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); } static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) { struct bfq_group_data *bgd; bgd = kzalloc(sizeof(*bgd), gfp); if (!bgd) return NULL; bgd->weight = CGROUP_WEIGHT_DFL; return &bgd->pd; } static void bfq_cpd_free(struct blkcg_policy_data *cpd) { kfree(cpd_to_bfqgd(cpd)); } static struct blkg_policy_data *bfq_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct bfq_group *bfqg; bfqg = kzalloc_node(sizeof(*bfqg), gfp, disk->node_id); if (!bfqg) return NULL; if (bfqg_stats_init(&bfqg->stats, gfp)) { kfree(bfqg); return NULL; } /* see comments in bfq_bic_update_cgroup for why refcounting */ refcount_set(&bfqg->ref, 1); return &bfqg->pd; } static void bfq_pd_init(struct blkg_policy_data *pd) { struct blkcg_gq *blkg = pd_to_blkg(pd); struct bfq_group *bfqg = blkg_to_bfqg(blkg); struct bfq_data *bfqd = blkg->q->elevator->elevator_data; struct bfq_entity *entity = &bfqg->entity; struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); entity->orig_weight = entity->weight = entity->new_weight = d->weight; entity->my_sched_data = &bfqg->sched_data; entity->last_bfqq_created = NULL; bfqg->my_entity = entity; /* * the root_group's will be set to NULL * in bfq_init_queue() */ bfqg->bfqd = bfqd; bfqg->active_entities = 0; bfqg->num_queues_with_pending_reqs = 0; bfqg->rq_pos_tree = RB_ROOT; } static void bfq_pd_free(struct blkg_policy_data *pd) { struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_exit(&bfqg->stats); bfqg_put(bfqg); } static void bfq_pd_reset_stats(struct blkg_policy_data *pd) { struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_reset(&bfqg->stats); } static void bfq_group_set_parent(struct bfq_group *bfqg, struct bfq_group *parent) { struct bfq_entity *entity; entity = &bfqg->entity; entity->parent = parent->my_entity; entity->sched_data = &parent->sched_data; } static void bfq_link_bfqg(struct bfq_data *bfqd, struct bfq_group *bfqg) { struct bfq_group *parent; struct bfq_entity *entity; /* * Update chain of bfq_groups as we might be handling a leaf group * which, along with some of its relatives, has not been hooked yet * to the private hierarchy of BFQ. */ entity = &bfqg->entity; for_each_entity(entity) { struct bfq_group *curr_bfqg = container_of(entity, struct bfq_group, entity); if (curr_bfqg != bfqd->root_group) { parent = bfqg_parent(curr_bfqg); if (!parent) parent = bfqd->root_group; bfq_group_set_parent(curr_bfqg, parent); } } } struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; struct bfq_group *bfqg; while (blkg) { if (!blkg->online) { blkg = blkg->parent; continue; } bfqg = blkg_to_bfqg(blkg); if (bfqg->pd.online) { bio_associate_blkg_from_css(bio, &blkg->blkcg->css); return bfqg; } blkg = blkg->parent; } bio_associate_blkg_from_css(bio, &bfqg_to_blkg(bfqd->root_group)->blkcg->css); return bfqd->root_group; } /** * bfq_bfqq_move - migrate @bfqq to @bfqg. * @bfqd: queue descriptor. * @bfqq: the queue to move. * @bfqg: the group to move to. * * Move @bfqq to @bfqg, deactivating it from its old group and reactivating * it on the new one. Avoid putting the entity on the old group idle tree. * * Must be called under the scheduler lock, to make sure that the blkg * owning @bfqg does not disappear (see comments in * bfq_bic_update_cgroup on guaranteeing the consistency of blkg * objects). */ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) { struct bfq_entity *entity = &bfqq->entity; struct bfq_group *old_parent = bfqq_group(bfqq); bool has_pending_reqs = false; /* * No point to move bfqq to the same group, which can happen when * root group is offlined */ if (old_parent == bfqg) return; /* * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group * until elevator exit. */ if (bfqq == &bfqd->oom_bfqq) return; /* * Get extra reference to prevent bfqq from being freed in * next possible expire or deactivate. */ bfqq->ref++; if (entity->in_groups_with_pending_reqs) { has_pending_reqs = true; bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); } /* If bfqq is empty, then bfq_bfqq_expire also invokes * bfq_del_bfqq_busy, thereby removing bfqq and its entity * from data structures related to current group. Otherwise we * need to remove bfqq explicitly with bfq_deactivate_bfqq, as * we do below. */ if (bfqq == bfqd->in_service_queue) bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); if (bfq_bfqq_busy(bfqq)) bfq_deactivate_bfqq(bfqd, bfqq, false, false); else if (entity->on_st_or_in_serv) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); bfqg_and_blkg_put(old_parent); bfq_reassign_last_bfqq(bfqq, NULL); entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; /* pin down bfqg and its associated blkg */ bfqg_and_blkg_get(bfqg); if (has_pending_reqs) bfq_add_bfqq_in_groups_with_pending_reqs(bfqq); if (bfq_bfqq_busy(bfqq)) { if (unlikely(!bfqd->nonrot_with_queueing)) bfq_pos_tree_add_move(bfqd, bfqq); bfq_activate_bfqq(bfqd, bfqq); } if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver) bfq_schedule_dispatch(bfqd); /* release extra ref taken above, bfqq may happen to be freed now */ bfq_put_queue(bfqq); } static void bfq_sync_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *sync_bfqq, struct bfq_io_cq *bic, struct bfq_group *bfqg, unsigned int act_idx) { struct bfq_queue *bfqq; if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { /* We are the only user of this bfqq, just move it */ if (sync_bfqq->entity.sched_data != &bfqg->sched_data) bfq_bfqq_move(bfqd, sync_bfqq, bfqg); return; } /* * The queue was merged to a different queue. Check * that the merge chain still belongs to the same * cgroup. */ for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) if (bfqq->entity.sched_data != &bfqg->sched_data) break; if (bfqq) { /* * Some queue changed cgroup so the merge is not valid * anymore. We cannot easily just cancel the merge (by * clearing new_bfqq) as there may be other processes * using this queue and holding refs to all queues * below sync_bfqq->new_bfqq. Similarly if the merge * already happened, we need to detach from bfqq now * so that we cannot merge bio to a request from the * old cgroup. */ bfq_put_cooperator(sync_bfqq); bic_set_bfqq(bic, NULL, true, act_idx); bfq_release_process_ref(bfqd, sync_bfqq); } } /** * __bfq_bic_change_cgroup - move @bic to @bfqg. * @bfqd: the queue descriptor. * @bic: the bic to move. * @bfqg: the group to move to. * * Move bic to blkcg, assuming that bfqd->lock is held; which makes * sure that the reference to cgroup is valid across the call (see * comments in bfq_bic_update_cgroup on this issue) */ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, struct bfq_io_cq *bic, struct bfq_group *bfqg) { unsigned int act_idx; for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) { struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx); struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx); if (async_bfqq && async_bfqq->entity.sched_data != &bfqg->sched_data) { bic_set_bfqq(bic, NULL, false, act_idx); bfq_release_process_ref(bfqd, async_bfqq); } if (sync_bfqq) bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx); } } void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); struct bfq_group *bfqg = bfq_bio_bfqg(bfqd, bio); uint64_t serial_nr; serial_nr = bfqg_to_blkg(bfqg)->blkcg->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. */ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) return; /* * New cgroup for this process. Make sure it is linked to bfq internal * cgroup hierarchy. */ bfq_link_bfqg(bfqd, bfqg); __bfq_bic_change_cgroup(bfqd, bic, bfqg); bic->blkcg_serial_nr = serial_nr; } /** * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. * @st: the service tree being flushed. */ static void bfq_flush_idle_tree(struct bfq_service_tree *st) { struct bfq_entity *entity = st->first_idle; for (; entity ; entity = st->first_idle) __bfq_deactivate_entity(entity, false); } /** * bfq_reparent_leaf_entity - move leaf entity to the root_group. * @bfqd: the device data structure with the root group. * @entity: the entity to move, if entity is a leaf; or the parent entity * of an active leaf entity to move, if entity is not a leaf. * @ioprio_class: I/O priority class to reparent. */ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, struct bfq_entity *entity, int ioprio_class) { struct bfq_queue *bfqq; struct bfq_entity *child_entity = entity; while (child_entity->my_sched_data) { /* leaf not reached yet */ struct bfq_sched_data *child_sd = child_entity->my_sched_data; struct bfq_service_tree *child_st = child_sd->service_tree + ioprio_class; struct rb_root *child_active = &child_st->active; child_entity = bfq_entity_of(rb_first(child_active)); if (!child_entity) child_entity = child_sd->in_service_entity; } bfqq = bfq_entity_to_bfqq(child_entity); bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); } /** * bfq_reparent_active_queues - move to the root group all active queues. * @bfqd: the device data structure with the root group. * @bfqg: the group to move from. * @st: the service tree to start the search from. * @ioprio_class: I/O priority class to reparent. */ static void bfq_reparent_active_queues(struct bfq_data *bfqd, struct bfq_group *bfqg, struct bfq_service_tree *st, int ioprio_class) { struct rb_root *active = &st->active; struct bfq_entity *entity; while ((entity = bfq_entity_of(rb_first(active)))) bfq_reparent_leaf_entity(bfqd, entity, ioprio_class); if (bfqg->sched_data.in_service_entity) bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.in_service_entity, ioprio_class); } /** * bfq_pd_offline - deactivate the entity associated with @pd, * and reparent its children entities. * @pd: descriptor of the policy going offline. * * blkio already grabs the queue_lock for us, so no need to use * RCU-based magic */ static void bfq_pd_offline(struct blkg_policy_data *pd) { struct bfq_service_tree *st; struct bfq_group *bfqg = pd_to_bfqg(pd); struct bfq_data *bfqd = bfqg->bfqd; struct bfq_entity *entity = bfqg->my_entity; unsigned long flags; int i; spin_lock_irqsave(&bfqd->lock, flags); if (!entity) /* root group */ goto put_async_queues; /* * Empty all service_trees belonging to this group before * deactivating the group itself. */ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { st = bfqg->sched_data.service_tree + i; /* * It may happen that some queues are still active * (busy) upon group destruction (if the corresponding * processes have been forced to terminate). We move * all the leaf entities corresponding to these queues * to the root_group. * Also, it may happen that the group has an entity * in service, which is disconnected from the active * tree: it must be moved, too. * There is no need to put the sync queues, as the * scheduler has taken no reference. */ bfq_reparent_active_queues(bfqd, bfqg, st, i); /* * The idle tree may still contain bfq_queues * belonging to exited task because they never * migrated to a different cgroup from the one being * destroyed now. In addition, even * bfq_reparent_active_queues() may happen to add some * entities to the idle tree. It happens if, in some * of the calls to bfq_bfqq_move() performed by * bfq_reparent_active_queues(), the queue to move is * empty and gets expired. */ bfq_flush_idle_tree(st); } __bfq_deactivate_entity(entity, false); put_async_queues: bfq_put_async_queues(bfqd, bfqg); spin_unlock_irqrestore(&bfqd->lock, flags); /* * @blkg is going offline and will be ignored by * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so * that they don't get lost. If IOs complete after this point, the * stats for them will be lost. Oh well... */ bfqg_stats_xfer_dead(bfqg); } void bfq_end_wr_async(struct bfq_data *bfqd) { struct blkcg_gq *blkg; list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); bfq_end_wr_async_queues(bfqd, bfqg); } bfq_end_wr_async_queues(bfqd, bfqd->root_group); } static int bfq_io_show_weight_legacy(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); unsigned int val = 0; if (bfqgd) val = bfqgd->weight; seq_printf(sf, "%u\n", val); return 0; } static u64 bfqg_prfill_weight_device(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct bfq_group *bfqg = pd_to_bfqg(pd); if (!bfqg->entity.dev_weight) return 0; return __blkg_prfill_u64(sf, pd, bfqg->entity.dev_weight); } static int bfq_io_show_weight(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); seq_printf(sf, "default %u\n", bfqgd->weight); blkcg_print_blkgs(sf, blkcg, bfqg_prfill_weight_device, &blkcg_policy_bfq, 0, false); return 0; } static void bfq_group_set_weight(struct bfq_group *bfqg, u64 weight, u64 dev_weight) { weight = dev_weight ?: weight; bfqg->entity.dev_weight = dev_weight; /* * Setting the prio_changed flag of the entity * to 1 with new_weight == weight would re-set * the value of the weight to its ioprio mapping. * Set the flag only if necessary. */ if ((unsigned short)weight != bfqg->entity.new_weight) { bfqg->entity.new_weight = (unsigned short)weight; /* * Make sure that the above new value has been * stored in bfqg->entity.new_weight before * setting the prio_changed flag. In fact, * this flag may be read asynchronously (in * critical sections protected by a different * lock than that held here), and finding this * flag set may cause the execution of the code * for updating parameters whose value may * depend also on bfqg->entity.new_weight (in * __bfq_entity_update_weight_prio). * This barrier makes sure that the new value * of bfqg->entity.new_weight is correctly * seen in that code. */ smp_wmb(); bfqg->entity.prio_changed = 1; } } static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); struct blkcg_gq *blkg; int ret = -ERANGE; if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) return ret; ret = 0; spin_lock_irq(&blkcg->lock); bfqgd->weight = (unsigned short)val; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); if (bfqg) bfq_group_set_weight(bfqg, val, 0); } spin_unlock_irq(&blkcg->lock); return ret; } static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int ret; struct blkg_conf_ctx ctx; struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct bfq_group *bfqg; u64 v; blkg_conf_init(&ctx, buf); ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx); if (ret) goto out; if (sscanf(ctx.body, "%llu", &v) == 1) { /* require "default" on dfl */ ret = -ERANGE; if (!v) goto out; } else if (!strcmp(strim(ctx.body), "default")) { v = 0; } else { ret = -EINVAL; goto out; } bfqg = blkg_to_bfqg(ctx.blkg); ret = -ERANGE; if (!v || (v >= BFQ_MIN_WEIGHT && v <= BFQ_MAX_WEIGHT)) { bfq_group_set_weight(bfqg, bfqg->entity.weight, v); ret = 0; } out: blkg_conf_exit(&ctx); return ret ?: nbytes; } static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { char *endp; int ret; u64 v; buf = strim(buf); /* "WEIGHT" or "default WEIGHT" sets the default weight */ v = simple_strtoull(buf, &endp, 0); if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { ret = bfq_io_set_weight_legacy(of_css(of), NULL, v); return ret ?: nbytes; } return bfq_io_set_device_weight(of, buf, nbytes, off); } static int bfqg_print_rwstat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, &blkcg_policy_bfq, seq_cft(sf)->private, true); return 0; } static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkg_rwstat_sample sum; blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); return __blkg_prfill_rwstat(sf, pd, &sum); } static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, seq_cft(sf)->private, true); return 0; } #ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfqg_print_stat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, &blkcg_policy_bfq, seq_cft(sf)->private, false); return 0; } static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkcg_gq *blkg = pd_to_blkg(pd); struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; u64 sum = 0; lockdep_assert_held(&blkg->q->queue_lock); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { struct bfq_stat *stat; if (!pos_blkg->online) continue; stat = (void *)blkg_to_pd(pos_blkg, &blkcg_policy_bfq) + off; sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); return __blkg_prfill_u64(sf, pd, sum); } static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_stat_recursive, &blkcg_policy_bfq, seq_cft(sf)->private, false); return 0; } static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct bfq_group *bfqg = blkg_to_bfqg(pd->blkg); u64 sum = blkg_rwstat_total(&bfqg->stats.bytes); return __blkg_prfill_u64(sf, pd, sum >> 9); } static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); return 0; } static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkg_rwstat_sample tmp; blkg_rwstat_recursive_sum(pd->blkg, &blkcg_policy_bfq, offsetof(struct bfq_group, stats.bytes), &tmp); return __blkg_prfill_u64(sf, pd, (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9); } static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, false); return 0; } static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct bfq_group *bfqg = pd_to_bfqg(pd); u64 samples = bfq_stat_read(&bfqg->stats.avg_queue_size_samples); u64 v = 0; if (samples) { v = bfq_stat_read(&bfqg->stats.avg_queue_size_sum); v = div64_u64(v, samples); } __blkg_prfill_u64(sf, pd, v); return 0; } /* print avg_queue_size */ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, 0, false); return 0; } #endif /* CONFIG_BFQ_CGROUP_DEBUG */ struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { int ret; ret = blkcg_activate_policy(bfqd->queue->disk, &blkcg_policy_bfq); if (ret) return NULL; return blkg_to_bfqg(bfqd->queue->root_blkg); } struct blkcg_policy blkcg_policy_bfq = { .dfl_cftypes = bfq_blkg_files, .legacy_cftypes = bfq_blkcg_legacy_files, .cpd_alloc_fn = bfq_cpd_alloc, .cpd_free_fn = bfq_cpd_free, .pd_alloc_fn = bfq_pd_alloc, .pd_init_fn = bfq_pd_init, .pd_offline_fn = bfq_pd_offline, .pd_free_fn = bfq_pd_free, .pd_reset_stats_fn = bfq_pd_reset_stats, }; struct cftype bfq_blkcg_legacy_files[] = { { .name = "bfq.weight", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight_legacy, .write_u64 = bfq_io_set_weight_legacy, }, { .name = "bfq.weight_device", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight, .write = bfq_io_set_weight, }, /* statistics, covers only the tasks in the bfqg */ { .name = "bfq.io_service_bytes", .private = offsetof(struct bfq_group, stats.bytes), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_serviced", .private = offsetof(struct bfq_group, stats.ios), .seq_show = bfqg_print_rwstat, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { .name = "bfq.time", .private = offsetof(struct bfq_group, stats.time), .seq_show = bfqg_print_stat, }, { .name = "bfq.sectors", .seq_show = bfqg_print_stat_sectors, }, { .name = "bfq.io_service_time", .private = offsetof(struct bfq_group, stats.service_time), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_wait_time", .private = offsetof(struct bfq_group, stats.wait_time), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_merged", .private = offsetof(struct bfq_group, stats.merged), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_queued", .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat, }, #endif /* CONFIG_BFQ_CGROUP_DEBUG */ /* the same statistics which cover the bfqg and its descendants */ { .name = "bfq.io_service_bytes_recursive", .private = offsetof(struct bfq_group, stats.bytes), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_serviced_recursive", .private = offsetof(struct bfq_group, stats.ios), .seq_show = bfqg_print_rwstat_recursive, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { .name = "bfq.time_recursive", .private = offsetof(struct bfq_group, stats.time), .seq_show = bfqg_print_stat_recursive, }, { .name = "bfq.sectors_recursive", .seq_show = bfqg_print_stat_sectors_recursive, }, { .name = "bfq.io_service_time_recursive", .private = offsetof(struct bfq_group, stats.service_time), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_wait_time_recursive", .private = offsetof(struct bfq_group, stats.wait_time), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_merged_recursive", .private = offsetof(struct bfq_group, stats.merged), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_queued_recursive", .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.avg_queue_size", .seq_show = bfqg_print_avg_queue_size, }, { .name = "bfq.group_wait_time", .private = offsetof(struct bfq_group, stats.group_wait_time), .seq_show = bfqg_print_stat, }, { .name = "bfq.idle_time", .private = offsetof(struct bfq_group, stats.idle_time), .seq_show = bfqg_print_stat, }, { .name = "bfq.empty_time", .private = offsetof(struct bfq_group, stats.empty_time), .seq_show = bfqg_print_stat, }, { .name = "bfq.dequeue", .private = offsetof(struct bfq_group, stats.dequeue), .seq_show = bfqg_print_stat, }, #endif /* CONFIG_BFQ_CGROUP_DEBUG */ { } /* terminate */ }; struct cftype bfq_blkg_files[] = { { .name = "bfq.weight", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight, .write = bfq_io_set_weight, }, {} /* terminate */ }; #else /* CONFIG_BFQ_GROUP_IOSCHED */ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) {} void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->weight = entity->new_weight; entity->orig_weight = entity->new_weight; if (bfqq) { bfqq->ioprio = bfqq->new_ioprio; bfqq->ioprio_class = bfqq->new_ioprio_class; } entity->sched_data = &bfqg->sched_data; } void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} void bfq_end_wr_async(struct bfq_data *bfqd) { bfq_end_wr_async_queues(bfqd, bfqd->root_group); } struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { return bfqd->root_group; } struct bfq_group *bfqq_group(struct bfq_queue *bfqq) { return bfqq->bfqd->root_group; } void bfqg_and_blkg_put(struct bfq_group *bfqg) {} struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { struct bfq_group *bfqg; int i; bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); if (!bfqg) return NULL; for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; return bfqg; } #endif /* CONFIG_BFQ_GROUP_IOSCHED */
243 243 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 // SPDX-License-Identifier: GPL-2.0-or-later /* * cn_proc.c - process events connector * * Copyright (C) Matt Helsley, IBM Corp. 2005 * Based on cn_fork.c by Guillaume Thouvenin <guillaume.thouvenin@bull.net> * Original copyright notice follows: * Copyright (C) 2005 BULL SA. */ #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/init.h> #include <linux/connector.h> #include <linux/gfp.h> #include <linux/ptrace.h> #include <linux/atomic.h> #include <linux/pid_namespace.h> #include <linux/cn_proc.h> #include <linux/local_lock.h> /* * Size of a cn_msg followed by a proc_event structure. Since the * sizeof struct cn_msg is a multiple of 4 bytes, but not 8 bytes, we * add one 4-byte word to the size here, and then start the actual * cn_msg structure 4 bytes into the stack buffer. The result is that * the immediately following proc_event structure is aligned to 8 bytes. */ #define CN_PROC_MSG_SIZE (sizeof(struct cn_msg) + sizeof(struct proc_event) + 4) /* See comment above; we test our assumption about sizeof struct cn_msg here. */ static inline struct cn_msg *buffer_to_cn_msg(__u8 *buffer) { BUILD_BUG_ON(sizeof(struct cn_msg) != 20); return (struct cn_msg *)(buffer + 4); } static atomic_t proc_event_num_listeners = ATOMIC_INIT(0); static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC }; /* local_event.count is used as the sequence number of the netlink message */ struct local_event { local_lock_t lock; __u32 count; }; static DEFINE_PER_CPU(struct local_event, local_event) = { .lock = INIT_LOCAL_LOCK(lock), }; static int cn_filter(struct sock *dsk, struct sk_buff *skb, void *data) { __u32 what, exit_code, *ptr; enum proc_cn_mcast_op mc_op; uintptr_t val; if (!dsk || !dsk->sk_user_data || !data) return 0; ptr = (__u32 *)data; what = *ptr++; exit_code = *ptr; val = ((struct proc_input *)(dsk->sk_user_data))->event_type; mc_op = ((struct proc_input *)(dsk->sk_user_data))->mcast_op; if (mc_op == PROC_CN_MCAST_IGNORE) return 1; if ((__u32)val == PROC_EVENT_ALL) return 0; /* * Drop packet if we have to report only non-zero exit status * (PROC_EVENT_NONZERO_EXIT) and exit status is 0 */ if (((__u32)val & PROC_EVENT_NONZERO_EXIT) && (what == PROC_EVENT_EXIT)) { if (exit_code) return 0; } if ((__u32)val & what) return 0; return 1; } static inline void send_msg(struct cn_msg *msg) { __u32 filter_data[2]; local_lock(&local_event.lock); msg->seq = __this_cpu_inc_return(local_event.count) - 1; ((struct proc_event *)msg->data)->cpu = smp_processor_id(); /* * local_lock() disables preemption during send to ensure the messages * are ordered according to their sequence numbers. * * If cn_netlink_send() fails, the data is not sent. */ filter_data[0] = ((struct proc_event *)msg->data)->what; if (filter_data[0] == PROC_EVENT_EXIT) { filter_data[1] = ((struct proc_event *)msg->data)->event_data.exit.exit_code; } else { filter_data[1] = 0; } cn_netlink_send_mult(msg, msg->len, 0, CN_IDX_PROC, GFP_NOWAIT, cn_filter, (void *)filter_data); local_unlock(&local_event.lock); } void proc_fork_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); struct task_struct *parent; if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_FORK; rcu_read_lock(); parent = rcu_dereference(task->real_parent); ev->event_data.fork.parent_pid = parent->pid; ev->event_data.fork.parent_tgid = parent->tgid; rcu_read_unlock(); ev->event_data.fork.child_pid = task->pid; ev->event_data.fork.child_tgid = task->tgid; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_exec_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_EXEC; ev->event_data.exec.process_pid = task->pid; ev->event_data.exec.process_tgid = task->tgid; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_id_connector(struct task_struct *task, int which_id) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); const struct cred *cred; if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->what = which_id; ev->event_data.id.process_pid = task->pid; ev->event_data.id.process_tgid = task->tgid; rcu_read_lock(); cred = __task_cred(task); if (which_id == PROC_EVENT_UID) { ev->event_data.id.r.ruid = from_kuid_munged(&init_user_ns, cred->uid); ev->event_data.id.e.euid = from_kuid_munged(&init_user_ns, cred->euid); } else if (which_id == PROC_EVENT_GID) { ev->event_data.id.r.rgid = from_kgid_munged(&init_user_ns, cred->gid); ev->event_data.id.e.egid = from_kgid_munged(&init_user_ns, cred->egid); } else { rcu_read_unlock(); return; } rcu_read_unlock(); ev->timestamp_ns = ktime_get_ns(); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_sid_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_SID; ev->event_data.sid.process_pid = task->pid; ev->event_data.sid.process_tgid = task->tgid; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_ptrace_connector(struct task_struct *task, int ptrace_id) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_PTRACE; ev->event_data.ptrace.process_pid = task->pid; ev->event_data.ptrace.process_tgid = task->tgid; if (ptrace_id == PTRACE_ATTACH) { ev->event_data.ptrace.tracer_pid = current->pid; ev->event_data.ptrace.tracer_tgid = current->tgid; } else if (ptrace_id == PTRACE_DETACH) { ev->event_data.ptrace.tracer_pid = 0; ev->event_data.ptrace.tracer_tgid = 0; } else return; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_comm_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_COMM; ev->event_data.comm.process_pid = task->pid; ev->event_data.comm.process_tgid = task->tgid; get_task_comm(ev->event_data.comm.comm, task); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_coredump_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; struct task_struct *parent; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_COREDUMP; ev->event_data.coredump.process_pid = task->pid; ev->event_data.coredump.process_tgid = task->tgid; rcu_read_lock(); if (pid_alive(task)) { parent = rcu_dereference(task->real_parent); ev->event_data.coredump.parent_pid = parent->pid; ev->event_data.coredump.parent_tgid = parent->tgid; } rcu_read_unlock(); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_exit_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; struct task_struct *parent; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_EXIT; ev->event_data.exit.process_pid = task->pid; ev->event_data.exit.process_tgid = task->tgid; ev->event_data.exit.exit_code = task->exit_code; ev->event_data.exit.exit_signal = task->exit_signal; rcu_read_lock(); if (pid_alive(task)) { parent = rcu_dereference(task->real_parent); ev->event_data.exit.parent_pid = parent->pid; ev->event_data.exit.parent_tgid = parent->tgid; } rcu_read_unlock(); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } /* * Send an acknowledgement message to userspace * * Use 0 for success, EFOO otherwise. * Note: this is the negative of conventional kernel error * values because it's not being returned via syscall return * mechanisms. */ static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); msg->seq = rcvd_seq; ev->timestamp_ns = ktime_get_ns(); ev->cpu = -1; ev->what = PROC_EVENT_NONE; ev->event_data.ack.err = err; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = rcvd_ack + 1; msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } /** * cn_proc_mcast_ctl * @msg: message sent from userspace via the connector * @nsp: NETLINK_CB of the client's socket buffer */ static void cn_proc_mcast_ctl(struct cn_msg *msg, struct netlink_skb_parms *nsp) { enum proc_cn_mcast_op mc_op = 0, prev_mc_op = 0; struct proc_input *pinput = NULL; enum proc_cn_event ev_type = 0; int err = 0, initial = 0; struct sock *sk = NULL; /* * Events are reported with respect to the initial pid * and user namespaces so ignore requestors from * other namespaces. */ if ((current_user_ns() != &init_user_ns) || !task_is_in_init_pid_ns(current)) return; if (msg->len == sizeof(*pinput)) { pinput = (struct proc_input *)msg->data; mc_op = pinput->mcast_op; ev_type = pinput->event_type; } else if (msg->len == sizeof(mc_op)) { mc_op = *((enum proc_cn_mcast_op *)msg->data); ev_type = PROC_EVENT_ALL; } else { return; } ev_type = valid_event((enum proc_cn_event)ev_type); if (ev_type == PROC_EVENT_NONE) ev_type = PROC_EVENT_ALL; if (nsp->sk) { sk = nsp->sk; if (sk->sk_user_data == NULL) { sk->sk_user_data = kzalloc(sizeof(struct proc_input), GFP_KERNEL); if (sk->sk_user_data == NULL) { err = ENOMEM; goto out; } initial = 1; } else { prev_mc_op = ((struct proc_input *)(sk->sk_user_data))->mcast_op; } ((struct proc_input *)(sk->sk_user_data))->event_type = ev_type; ((struct proc_input *)(sk->sk_user_data))->mcast_op = mc_op; } switch (mc_op) { case PROC_CN_MCAST_LISTEN: if (initial || (prev_mc_op != PROC_CN_MCAST_LISTEN)) atomic_inc(&proc_event_num_listeners); break; case PROC_CN_MCAST_IGNORE: if (!initial && (prev_mc_op != PROC_CN_MCAST_IGNORE)) atomic_dec(&proc_event_num_listeners); ((struct proc_input *)(sk->sk_user_data))->event_type = PROC_EVENT_NONE; break; default: err = EINVAL; break; } out: cn_proc_ack(err, msg->seq, msg->ack); } /* * cn_proc_init - initialization entry point * * Adds the connector callback to the connector driver. */ static int __init cn_proc_init(void) { int err = cn_add_callback(&cn_proc_event_id, "cn_proc", &cn_proc_mcast_ctl); if (err) { pr_warn("cn_proc failed to register\n"); return err; } return 0; } device_initcall(cn_proc_init);
11 11 11 11 11 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 // SPDX-License-Identifier: GPL-2.0 /* * Provide a default dump_stack() function for architectures * which don't implement their own. */ #include <linux/kernel.h> #include <linux/buildid.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/smp.h> #include <linux/atomic.h> #include <linux/kexec.h> #include <linux/utsname.h> #include <linux/stop_machine.h> static char dump_stack_arch_desc_str[128]; /** * dump_stack_set_arch_desc - set arch-specific str to show with task dumps * @fmt: printf-style format string * @...: arguments for the format string * * The configured string will be printed right after utsname during task * dumps. Usually used to add arch-specific system identifiers. If an * arch wants to make use of such an ID string, it should initialize this * as soon as possible during boot. */ void __init dump_stack_set_arch_desc(const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), fmt, args); va_end(args); } #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) #define BUILD_ID_FMT " %20phN" #define BUILD_ID_VAL vmlinux_build_id #else #define BUILD_ID_FMT "%s" #define BUILD_ID_VAL "" #endif /** * dump_stack_print_info - print generic debug info for dump_stack() * @log_lvl: log level * * Arch-specific dump_stack() implementations can use this function to * print out the same debug information as the generic dump_stack(). */ void dump_stack_print_info(const char *log_lvl) { printk("%sCPU: %d UID: %u PID: %d Comm: %.20s %s%s %s %.*s" BUILD_ID_FMT "\n", log_lvl, raw_smp_processor_id(), __kuid_val(current_real_cred()->euid), current->pid, current->comm, kexec_crash_loaded() ? "Kdump: loaded " : "", print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version, BUILD_ID_VAL); if (get_taint()) printk("%s%s\n", log_lvl, print_tainted_verbose()); if (dump_stack_arch_desc_str[0] != '\0') printk("%sHardware name: %s\n", log_lvl, dump_stack_arch_desc_str); print_worker_info(log_lvl, current); print_stop_info(log_lvl, current); print_scx_info(log_lvl, current); } /** * show_regs_print_info - print generic debug info for show_regs() * @log_lvl: log level * * show_regs() implementations can use this function to print out generic * debug information. */ void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); } static void __dump_stack(const char *log_lvl) { dump_stack_print_info(log_lvl); show_stack(NULL, NULL, log_lvl); } /** * dump_stack_lvl - dump the current task information and its stack trace * @log_lvl: log level * * Architectures can override this implementation by implementing its own. */ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) { bool in_panic = this_cpu_in_panic(); unsigned long flags; /* * Permit this cpu to perform nested stack dumps while serialising * against other CPUs, unless this CPU is in panic. * * When in panic, non-panic CPUs are not permitted to store new * printk messages so there is no need to synchronize the output. * This avoids potential deadlock in panic() if another CPU is * holding and unable to release the printk_cpu_sync. */ if (!in_panic) printk_cpu_sync_get_irqsave(flags); __dump_stack(log_lvl); if (!in_panic) printk_cpu_sync_put_irqrestore(flags); } EXPORT_SYMBOL(dump_stack_lvl); asmlinkage __visible void dump_stack(void) { dump_stack_lvl(KERN_DEFAULT); } EXPORT_SYMBOL(dump_stack);
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth address family and sockets. */ #include <linux/module.h> #include <linux/debugfs.h> #include <linux/stringify.h> #include <linux/sched/signal.h> #include <asm/ioctls.h> #include <net/bluetooth/bluetooth.h> #include <linux/proc_fs.h> #include "leds.h" #include "selftest.h" /* Bluetooth sockets */ #define BT_MAX_PROTO (BTPROTO_LAST + 1) static const struct net_proto_family *bt_proto[BT_MAX_PROTO]; static DEFINE_RWLOCK(bt_proto_lock); static struct lock_class_key bt_lock_key[BT_MAX_PROTO]; static const char *const bt_key_strings[BT_MAX_PROTO] = { "sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP", "sk_lock-AF_BLUETOOTH-BTPROTO_HCI", "sk_lock-AF_BLUETOOTH-BTPROTO_SCO", "sk_lock-AF_BLUETOOTH-BTPROTO_RFCOMM", "sk_lock-AF_BLUETOOTH-BTPROTO_BNEP", "sk_lock-AF_BLUETOOTH-BTPROTO_CMTP", "sk_lock-AF_BLUETOOTH-BTPROTO_HIDP", "sk_lock-AF_BLUETOOTH-BTPROTO_AVDTP", "sk_lock-AF_BLUETOOTH-BTPROTO_ISO", }; static struct lock_class_key bt_slock_key[BT_MAX_PROTO]; static const char *const bt_slock_key_strings[BT_MAX_PROTO] = { "slock-AF_BLUETOOTH-BTPROTO_L2CAP", "slock-AF_BLUETOOTH-BTPROTO_HCI", "slock-AF_BLUETOOTH-BTPROTO_SCO", "slock-AF_BLUETOOTH-BTPROTO_RFCOMM", "slock-AF_BLUETOOTH-BTPROTO_BNEP", "slock-AF_BLUETOOTH-BTPROTO_CMTP", "slock-AF_BLUETOOTH-BTPROTO_HIDP", "slock-AF_BLUETOOTH-BTPROTO_AVDTP", "slock-AF_BLUETOOTH-BTPROTO_ISO", }; void bt_sock_reclassify_lock(struct sock *sk, int proto) { BUG_ON(!sk); BUG_ON(!sock_allow_reclassification(sk)); sock_lock_init_class_and_name(sk, bt_slock_key_strings[proto], &bt_slock_key[proto], bt_key_strings[proto], &bt_lock_key[proto]); } EXPORT_SYMBOL(bt_sock_reclassify_lock); int bt_sock_register(int proto, const struct net_proto_family *ops) { int err = 0; if (proto < 0 || proto >= BT_MAX_PROTO) return -EINVAL; write_lock(&bt_proto_lock); if (bt_proto[proto]) err = -EEXIST; else bt_proto[proto] = ops; write_unlock(&bt_proto_lock); return err; } EXPORT_SYMBOL(bt_sock_register); void bt_sock_unregister(int proto) { if (proto < 0 || proto >= BT_MAX_PROTO) return; write_lock(&bt_proto_lock); bt_proto[proto] = NULL; write_unlock(&bt_proto_lock); } EXPORT_SYMBOL(bt_sock_unregister); static int bt_sock_create(struct net *net, struct socket *sock, int proto, int kern) { int err; if (net != &init_net) return -EAFNOSUPPORT; if (proto < 0 || proto >= BT_MAX_PROTO) return -EINVAL; if (!bt_proto[proto]) request_module("bt-proto-%d", proto); err = -EPROTONOSUPPORT; read_lock(&bt_proto_lock); if (bt_proto[proto] && try_module_get(bt_proto[proto]->owner)) { err = bt_proto[proto]->create(net, sock, proto, kern); if (!err) bt_sock_reclassify_lock(sock->sk, proto); module_put(bt_proto[proto]->owner); } read_unlock(&bt_proto_lock); return err; } struct sock *bt_sock_alloc(struct net *net, struct socket *sock, struct proto *prot, int proto, gfp_t prio, int kern) { struct sock *sk; sk = sk_alloc(net, PF_BLUETOOTH, prio, prot, kern); if (!sk) return NULL; sock_init_data(sock, sk); INIT_LIST_HEAD(&bt_sk(sk)->accept_q); sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = proto; sk->sk_state = BT_OPEN; /* Init peer information so it can be properly monitored */ if (!kern) { spin_lock(&sk->sk_peer_lock); sk->sk_peer_pid = get_pid(task_tgid(current)); sk->sk_peer_cred = get_current_cred(); spin_unlock(&sk->sk_peer_lock); } return sk; } EXPORT_SYMBOL(bt_sock_alloc); void bt_sock_link(struct bt_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_add_node(sk, &l->head); write_unlock(&l->lock); } EXPORT_SYMBOL(bt_sock_link); void bt_sock_unlink(struct bt_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_del_node_init(sk); write_unlock(&l->lock); } EXPORT_SYMBOL(bt_sock_unlink); bool bt_sock_linked(struct bt_sock_list *l, struct sock *s) { struct sock *sk; if (!l || !s) return false; read_lock(&l->lock); sk_for_each(sk, &l->head) { if (s == sk) { read_unlock(&l->lock); return true; } } read_unlock(&l->lock); return false; } EXPORT_SYMBOL(bt_sock_linked); void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) { const struct cred *old_cred; struct pid *old_pid; BT_DBG("parent %p, sk %p", parent, sk); sock_hold(sk); if (bh) bh_lock_sock_nested(sk); else lock_sock_nested(sk, SINGLE_DEPTH_NESTING); list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q); bt_sk(sk)->parent = parent; /* Copy credentials from parent since for incoming connections the * socket is allocated by the kernel. */ spin_lock(&sk->sk_peer_lock); old_pid = sk->sk_peer_pid; old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(parent->sk_peer_pid); sk->sk_peer_cred = get_cred(parent->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); put_pid(old_pid); put_cred(old_cred); if (bh) bh_unlock_sock(sk); else release_sock(sk); sk_acceptq_added(parent); } EXPORT_SYMBOL(bt_accept_enqueue); /* Calling function must hold the sk lock. * bt_sk(sk)->parent must be non-NULL meaning sk is in the parent list. */ void bt_accept_unlink(struct sock *sk) { BT_DBG("sk %p state %d", sk, sk->sk_state); list_del_init(&bt_sk(sk)->accept_q); sk_acceptq_removed(bt_sk(sk)->parent); bt_sk(sk)->parent = NULL; sock_put(sk); } EXPORT_SYMBOL(bt_accept_unlink); struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) { struct bt_sock *s, *n; struct sock *sk; BT_DBG("parent %p", parent); restart: list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { sk = (struct sock *)s; /* Prevent early freeing of sk due to unlink and sock_kill */ sock_hold(sk); lock_sock(sk); /* Check sk has not already been unlinked via * bt_accept_unlink() due to serialisation caused by sk locking */ if (!bt_sk(sk)->parent) { BT_DBG("sk %p, already unlinked", sk); release_sock(sk); sock_put(sk); /* Restart the loop as sk is no longer in the list * and also avoid a potential infinite loop because * list_for_each_entry_safe() is not thread safe. */ goto restart; } /* sk is safely in the parent list so reduce reference count */ sock_put(sk); /* FIXME: Is this check still needed */ if (sk->sk_state == BT_CLOSED) { bt_accept_unlink(sk); release_sock(sk); continue; } if (sk->sk_state == BT_CONNECTED || !newsock || test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) { bt_accept_unlink(sk); if (newsock) sock_graft(sk, newsock); release_sock(sk); return sk; } release_sock(sk); } return NULL; } EXPORT_SYMBOL(bt_accept_dequeue); int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; size_t copied; size_t skblen; int err; BT_DBG("sock %p sk %p len %zu", sock, sk, len); if (flags & MSG_OOB) return -EOPNOTSUPP; skb = skb_recv_datagram(sk, flags, &err); if (!skb) { if (sk->sk_shutdown & RCV_SHUTDOWN) err = 0; return err; } skblen = skb->len; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } skb_reset_transport_header(skb); err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err == 0) { sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name && bt_sk(sk)->skb_msg_name) bt_sk(sk)->skb_msg_name(skb, msg->msg_name, &msg->msg_namelen); if (test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags)) { u8 pkt_status = hci_skb_pkt_status(skb); put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_STATUS, sizeof(pkt_status), &pkt_status); } } skb_free_datagram(sk, skb); if (flags & MSG_TRUNC) copied = skblen; return err ? : copied; } EXPORT_SYMBOL(bt_sock_recvmsg); static long bt_sock_data_wait(struct sock *sk, long timeo) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(sk_sleep(sk), &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (!skb_queue_empty(&sk->sk_receive_queue)) break; if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN)) break; if (signal_pending(current) || !timeo) break; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return timeo; } int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int err = 0; size_t target, copied = 0; long timeo; if (flags & MSG_OOB) return -EOPNOTSUPP; BT_DBG("sk %p size %zu", sk, size); lock_sock(sk); target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { struct sk_buff *skb; int chunk; skb = skb_dequeue(&sk->sk_receive_queue); if (!skb) { if (copied >= target) break; err = sock_error(sk); if (err) break; if (sk->sk_shutdown & RCV_SHUTDOWN) break; err = -EAGAIN; if (!timeo) break; timeo = bt_sock_data_wait(sk, timeo); if (signal_pending(current)) { err = sock_intr_errno(timeo); goto out; } continue; } chunk = min_t(unsigned int, skb->len, size); if (skb_copy_datagram_msg(skb, 0, msg, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); if (!copied) copied = -EFAULT; break; } copied += chunk; size -= chunk; sock_recv_cmsgs(msg, sk, skb); if (!(flags & MSG_PEEK)) { int skb_len = skb_headlen(skb); if (chunk <= skb_len) { __skb_pull(skb, chunk); } else { struct sk_buff *frag; __skb_pull(skb, skb_len); chunk -= skb_len; skb_walk_frags(skb, frag) { if (chunk <= frag->len) { /* Pulling partial data */ skb->len -= chunk; skb->data_len -= chunk; __skb_pull(frag, chunk); break; } else if (frag->len) { /* Pulling all frag data */ chunk -= frag->len; skb->len -= frag->len; skb->data_len -= frag->len; __skb_pull(frag, frag->len); } } } if (skb->len) { skb_queue_head(&sk->sk_receive_queue, skb); break; } kfree_skb(skb); } else { /* put message back and return */ skb_queue_head(&sk->sk_receive_queue, skb); break; } } while (size); out: release_sock(sk); return copied ? : err; } EXPORT_SYMBOL(bt_sock_stream_recvmsg); static inline __poll_t bt_accept_poll(struct sock *parent) { struct bt_sock *s, *n; struct sock *sk; list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { sk = (struct sock *)s; if (sk->sk_state == BT_CONNECTED || (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags) && sk->sk_state == BT_CONNECT2)) return EPOLLIN | EPOLLRDNORM; } return 0; } __poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == BT_LISTEN) return bt_accept_poll(sk); if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_state == BT_CLOSED) mask |= EPOLLHUP; if (sk->sk_state == BT_CONNECT || sk->sk_state == BT_CONNECT2 || sk->sk_state == BT_CONFIG) return mask; if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } EXPORT_SYMBOL(bt_sock_poll); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct sk_buff *skb; long amount; int err; BT_DBG("sk %p cmd %x arg %lx", sk, cmd, arg); switch (cmd) { case TIOCOUTQ: if (sk->sk_state == BT_LISTEN) return -EINVAL; amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; err = put_user(amount, (int __user *)arg); break; case TIOCINQ: if (sk->sk_state == BT_LISTEN) return -EINVAL; spin_lock(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); amount = skb ? skb->len : 0; spin_unlock(&sk->sk_receive_queue.lock); err = put_user(amount, (int __user *)arg); break; default: err = -ENOIOCTLCMD; break; } return err; } EXPORT_SYMBOL(bt_sock_ioctl); /* This function expects the sk lock to be held when called */ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo) { DECLARE_WAITQUEUE(wait, current); int err = 0; BT_DBG("sk %p", sk); add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); while (sk->sk_state != state) { if (!timeo) { err = -EINPROGRESS; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); set_current_state(TASK_INTERRUPTIBLE); err = sock_error(sk); if (err) break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return err; } EXPORT_SYMBOL(bt_sock_wait_state); /* This function expects the sk lock to be held when called */ int bt_sock_wait_ready(struct sock *sk, unsigned int msg_flags) { DECLARE_WAITQUEUE(wait, current); unsigned long timeo; int err = 0; BT_DBG("sk %p", sk); timeo = sock_sndtimeo(sk, !!(msg_flags & MSG_DONTWAIT)); add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); while (test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags)) { if (!timeo) { err = -EAGAIN; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); set_current_state(TASK_INTERRUPTIBLE); err = sock_error(sk); if (err) break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return err; } EXPORT_SYMBOL(bt_sock_wait_ready); #ifdef CONFIG_PROC_FS static void *bt_seq_start(struct seq_file *seq, loff_t *pos) __acquires(seq->private->l->lock) { struct bt_sock_list *l = pde_data(file_inode(seq->file)); read_lock(&l->lock); return seq_hlist_start_head(&l->head, *pos); } static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bt_sock_list *l = pde_data(file_inode(seq->file)); return seq_hlist_next(v, &l->head, pos); } static void bt_seq_stop(struct seq_file *seq, void *v) __releases(seq->private->l->lock) { struct bt_sock_list *l = pde_data(file_inode(seq->file)); read_unlock(&l->lock); } static int bt_seq_show(struct seq_file *seq, void *v) { struct bt_sock_list *l = pde_data(file_inode(seq->file)); if (v == SEQ_START_TOKEN) { seq_puts(seq, "sk RefCnt Rmem Wmem User Inode Parent"); if (l->custom_seq_show) { seq_putc(seq, ' '); l->custom_seq_show(seq, v); } seq_putc(seq, '\n'); } else { struct sock *sk = sk_entry(v); struct bt_sock *bt = bt_sk(sk); seq_printf(seq, "%pK %-6d %-6u %-6u %-6u %-6lu %-6lu", sk, refcount_read(&sk->sk_refcnt), sk_rmem_alloc_get(sk), sk_wmem_alloc_get(sk), from_kuid(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk), bt->parent ? sock_i_ino(bt->parent) : 0LU); if (l->custom_seq_show) { seq_putc(seq, ' '); l->custom_seq_show(seq, v); } seq_putc(seq, '\n'); } return 0; } static const struct seq_operations bt_seq_ops = { .start = bt_seq_start, .next = bt_seq_next, .stop = bt_seq_stop, .show = bt_seq_show, }; int bt_procfs_init(struct net *net, const char *name, struct bt_sock_list *sk_list, int (*seq_show)(struct seq_file *, void *)) { sk_list->custom_seq_show = seq_show; if (!proc_create_seq_data(name, 0, net->proc_net, &bt_seq_ops, sk_list)) return -ENOMEM; return 0; } void bt_procfs_cleanup(struct net *net, const char *name) { remove_proc_entry(name, net->proc_net); } #else int bt_procfs_init(struct net *net, const char *name, struct bt_sock_list *sk_list, int (*seq_show)(struct seq_file *, void *)) { return 0; } void bt_procfs_cleanup(struct net *net, const char *name) { } #endif EXPORT_SYMBOL(bt_procfs_init); EXPORT_SYMBOL(bt_procfs_cleanup); static const struct net_proto_family bt_sock_family_ops = { .owner = THIS_MODULE, .family = PF_BLUETOOTH, .create = bt_sock_create, }; struct dentry *bt_debugfs; EXPORT_SYMBOL_GPL(bt_debugfs); #define VERSION __stringify(BT_SUBSYS_VERSION) "." \ __stringify(BT_SUBSYS_REVISION) static int __init bt_init(void) { int err; sock_skb_cb_check_size(sizeof(struct bt_skb_cb)); BT_INFO("Core ver %s", VERSION); err = bt_selftest(); if (err < 0) return err; bt_debugfs = debugfs_create_dir("bluetooth", NULL); bt_leds_init(); err = bt_sysfs_init(); if (err < 0) goto cleanup_led; err = sock_register(&bt_sock_family_ops); if (err) goto cleanup_sysfs; BT_INFO("HCI device and connection manager initialized"); err = hci_sock_init(); if (err) goto unregister_socket; err = l2cap_init(); if (err) goto cleanup_socket; err = sco_init(); if (err) goto cleanup_cap; err = mgmt_init(); if (err) goto cleanup_sco; return 0; cleanup_sco: sco_exit(); cleanup_cap: l2cap_exit(); cleanup_socket: hci_sock_cleanup(); unregister_socket: sock_unregister(PF_BLUETOOTH); cleanup_sysfs: bt_sysfs_cleanup(); cleanup_led: bt_leds_cleanup(); debugfs_remove_recursive(bt_debugfs); return err; } static void __exit bt_exit(void) { iso_exit(); mgmt_exit(); sco_exit(); l2cap_exit(); hci_sock_cleanup(); sock_unregister(PF_BLUETOOTH); bt_sysfs_cleanup(); bt_leds_cleanup(); debugfs_remove_recursive(bt_debugfs); } subsys_initcall(bt_init); module_exit(bt_exit); MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); MODULE_DESCRIPTION("Bluetooth Core ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_BLUETOOTH);
44 44 44 36 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2004, Instant802 Networks, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2022 Intel Corporation */ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/types.h> #include <net/ip.h> #include <net/pkt_sched.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "wme.h" /* Default mapping in classifier to work with default * queue setup. */ const int ieee802_1d_to_ac[8] = { IEEE80211_AC_BE, IEEE80211_AC_BK, IEEE80211_AC_BK, IEEE80211_AC_BE, IEEE80211_AC_VI, IEEE80211_AC_VI, IEEE80211_AC_VO, IEEE80211_AC_VO }; static int wme_downgrade_ac(struct sk_buff *skb) { switch (skb->priority) { case 6: case 7: skb->priority = 5; /* VO -> VI */ return 0; case 4: case 5: skb->priority = 3; /* VI -> BE */ return 0; case 0: case 3: skb->priority = 2; /* BE -> BK */ return 0; default: return -1; } } /** * ieee80211_fix_reserved_tid - return the TID to use if this one is reserved * @tid: the assumed-reserved TID * * Returns: the alternative TID to use, or 0 on error */ static inline u8 ieee80211_fix_reserved_tid(u8 tid) { switch (tid) { case 0: return 3; case 1: return 2; case 2: return 1; case 3: return 0; case 4: return 5; case 5: return 4; case 6: return 7; case 7: return 6; } return 0; } static u16 ieee80211_downgrade_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; /* in case we are a client verify acm is not set for this ac */ while (sdata->wmm_acm & BIT(skb->priority)) { int ac = ieee802_1d_to_ac[skb->priority]; if (ifmgd->tx_tspec[ac].admitted_time && skb->priority == ifmgd->tx_tspec[ac].up) return ac; if (wme_downgrade_ac(skb)) { /* * This should not really happen. The AP has marked all * lower ACs to require admission control which is not * a reasonable configuration. Allow the frame to be * transmitted using AC_BK as a workaround. */ break; } } /* Check to see if this is a reserved TID */ if (sta && sta->reserved_tid == skb->priority) skb->priority = ieee80211_fix_reserved_tid(skb->priority); /* look up which queue to use for frames with this 1d tag */ return ieee802_1d_to_ac[skb->priority]; } /* Indicate which queue to use for this fully formed 802.11 frame */ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_hdr *hdr) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 *p; /* Ensure hash is set prior to potential SW encryption */ skb_get_hash(skb); if ((info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER) || local->hw.queues < IEEE80211_NUM_ACS) return 0; if (!ieee80211_is_data(hdr->frame_control)) { skb->priority = 7; return ieee802_1d_to_ac[skb->priority]; } if (!ieee80211_is_data_qos(hdr->frame_control)) { skb->priority = 0; return ieee802_1d_to_ac[skb->priority]; } p = ieee80211_get_qos_ctl(hdr); skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK; return ieee80211_downgrade_queue(sdata, NULL, skb); } u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { const struct ethhdr *eth = (void *)skb->data; struct mac80211_qos_map *qos_map; bool qos; /* Ensure hash is set prior to potential SW encryption */ skb_get_hash(skb); /* all mesh/ocb stations are required to support WME */ if ((sdata->vif.type == NL80211_IFTYPE_MESH_POINT && !is_multicast_ether_addr(eth->h_dest)) || (sdata->vif.type == NL80211_IFTYPE_OCB && sta)) qos = true; else if (sta) qos = sta->sta.wme; else qos = false; if (!qos) { skb->priority = 0; /* required for correct WPA/11i MIC */ return IEEE80211_AC_BE; } if (skb->protocol == sdata->control_port_protocol) { skb->priority = 7; goto downgrade; } /* use the data classifier to determine what 802.1d tag the * data frame has */ qos_map = rcu_dereference(sdata->qos_map); skb->priority = cfg80211_classify8021d(skb, qos_map ? &qos_map->qos_map : NULL); downgrade: return ieee80211_downgrade_queue(sdata, sta, skb); } /** * ieee80211_set_qos_hdr - Fill in the QoS header if there is one. * * @sdata: local subif * @skb: packet to be updated */ void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; u8 flags; u8 *p; if (!ieee80211_is_data_qos(hdr->frame_control)) return; p = ieee80211_get_qos_ctl(hdr); /* don't overwrite the QoS field of injected frames */ if (info->flags & IEEE80211_TX_CTL_INJECTED) { /* do take into account Ack policy of injected frames */ if (*p & IEEE80211_QOS_CTL_ACK_POLICY_NOACK) info->flags |= IEEE80211_TX_CTL_NO_ACK; return; } /* set up the first byte */ /* * preserve everything but the TID and ACK policy * (which we both write here) */ flags = *p & ~(IEEE80211_QOS_CTL_TID_MASK | IEEE80211_QOS_CTL_ACK_POLICY_MASK); if (is_multicast_ether_addr(hdr->addr1) || sdata->noack_map & BIT(tid)) { flags |= IEEE80211_QOS_CTL_ACK_POLICY_NOACK; info->flags |= IEEE80211_TX_CTL_NO_ACK; } *p = flags | tid; /* set up the second byte */ p++; if (ieee80211_vif_is_mesh(&sdata->vif)) { /* preserve RSPI and Mesh PS Level bit */ *p &= ((IEEE80211_QOS_CTL_RSPI | IEEE80211_QOS_CTL_MESH_PS_LEVEL) >> 8); /* Nulls don't have a mesh header (frame body) */ if (!ieee80211_is_qos_nullfunc(hdr->frame_control)) *p |= (IEEE80211_QOS_CTL_MESH_CONTROL_PRESENT >> 8); } else { *p = 0; } }
4279 4271 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM workqueue #if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WORKQUEUE_H #include <linux/tracepoint.h> #include <linux/workqueue.h> struct pool_workqueue; /** * workqueue_queue_work - called when a work gets queued * @req_cpu: the requested cpu * @pwq: pointer to struct pool_workqueue * @work: pointer to struct work_struct * * This event occurs when a work is queued immediately or once a * delayed work is actually queued on a workqueue (ie: once the delay * has been reached). */ TRACE_EVENT(workqueue_queue_work, TP_PROTO(int req_cpu, struct pool_workqueue *pwq, struct work_struct *work), TP_ARGS(req_cpu, pwq, work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) __string( workqueue, pwq->wq->name) __field( int, req_cpu ) __field( int, cpu ) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; __assign_str(workqueue); __entry->req_cpu = req_cpu; __entry->cpu = pwq->pool->cpu; ), TP_printk("work struct=%p function=%ps workqueue=%s req_cpu=%d cpu=%d", __entry->work, __entry->function, __get_str(workqueue), __entry->req_cpu, __entry->cpu) ); /** * workqueue_activate_work - called when a work gets activated * @work: pointer to struct work_struct * * This event occurs when a queued work is put on the active queue, * which happens immediately after queueing unless @max_active limit * is reached. */ TRACE_EVENT(workqueue_activate_work, TP_PROTO(struct work_struct *work), TP_ARGS(work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; ), TP_printk("work struct %p function=%ps ", __entry->work, __entry->function) ); /** * workqueue_execute_start - called immediately before the workqueue callback * @work: pointer to struct work_struct * * Allows to track workqueue execution. */ TRACE_EVENT(workqueue_execute_start, TP_PROTO(struct work_struct *work), TP_ARGS(work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /** * workqueue_execute_end - called immediately after the workqueue callback * @work: pointer to struct work_struct * @function: pointer to worker function * * Allows to track workqueue execution. */ TRACE_EVENT(workqueue_execute_end, TP_PROTO(struct work_struct *work, work_func_t function), TP_ARGS(work, function), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = function; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); #endif /* _TRACE_WORKQUEUE_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 22 1 21 21 20 21 21 11 21 10 7 3 3 9 8 8 8 8 8 1 2 1 8 4 12 5 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> #include <net/inet_sock.h> #include <net/raw.h> #include <net/rawv6.h> #ifdef pr_fmt # undef pr_fmt #endif #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt static struct raw_hashinfo * raw_get_hashinfo(const struct inet_diag_req_v2 *r) { if (r->sdiag_family == AF_INET) { return &raw_v4_hashinfo; #if IS_ENABLED(CONFIG_IPV6) } else if (r->sdiag_family == AF_INET6) { return &raw_v6_hashinfo; #endif } else { return ERR_PTR(-EINVAL); } } /* * Due to requirement of not breaking user API we can't simply * rename @pad field in inet_diag_req_v2 structure, instead * use helper to figure it out. */ static bool raw_lookup(struct net *net, const struct sock *sk, const struct inet_diag_req_v2 *req) { struct inet_diag_req_raw *r = (void *)req; if (r->sdiag_family == AF_INET) return raw_v4_match(net, sk, r->sdiag_raw_protocol, r->id.idiag_dst[0], r->id.idiag_src[0], r->id.idiag_if, 0); #if IS_ENABLED(CONFIG_IPV6) else return raw_v6_match(net, sk, r->sdiag_raw_protocol, (const struct in6_addr *)r->id.idiag_src, (const struct in6_addr *)r->id.idiag_dst, r->id.idiag_if, 0); #endif return false; } static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) { struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct hlist_head *hlist; struct sock *sk; int slot; if (IS_ERR(hashinfo)) return ERR_CAST(hashinfo); rcu_read_lock(); for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { hlist = &hashinfo->ht[slot]; sk_for_each_rcu(sk, hlist) { if (raw_lookup(net, sk, r)) { /* * Grab it and keep until we fill * diag message to be reported, so * caller should call sock_put then. */ if (refcount_inc_not_zero(&sk->sk_refcnt)) goto out_unlock; } } } sk = ERR_PTR(-ENOENT); out_unlock: rcu_read_unlock(); return sk; } static int raw_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { struct sk_buff *in_skb = cb->skb; struct sk_buff *rep; struct sock *sk; struct net *net; int err; net = sock_net(in_skb->sk); sk = raw_sock_get(net, r); if (IS_ERR(sk)) return PTR_ERR(sk); rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) { sock_put(sk); return -ENOMEM; } err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); sock_put(sk); if (err < 0) { kfree_skb(rep); return err; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); return err; } static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin); } static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct hlist_head *hlist; struct sock *sk = NULL; struct nlattr *bc; if (IS_ERR(hashinfo)) return; cb_data = cb->data; bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; rcu_read_lock(); for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) { num = 0; hlist = &hashinfo->ht[slot]; sk_for_each_rcu(sk, hlist) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; if (sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) goto out_unlock; next: num++; } } out_unlock: rcu_read_unlock(); cb->args[0] = slot; cb->args[1] = num; } static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { r->idiag_rqueue = sk_rmem_alloc_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } #ifdef CONFIG_INET_DIAG_DESTROY static int raw_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *r) { struct net *net = sock_net(in_skb->sk); struct sock *sk; int err; sk = raw_sock_get(net, r); if (IS_ERR(sk)) return PTR_ERR(sk); err = sock_diag_destroy(sk, ECONNABORTED); sock_put(sk); return err; } #endif static const struct inet_diag_handler raw_diag_handler = { .owner = THIS_MODULE, .dump = raw_diag_dump, .dump_one = raw_diag_dump_one, .idiag_get_info = raw_diag_get_info, .idiag_type = IPPROTO_RAW, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = raw_diag_destroy, #endif }; static void __always_unused __check_inet_diag_req_raw(void) { /* * Make sure the two structures are identical, * except the @pad field. */ #define __offset_mismatch(m1, m2) \ (offsetof(struct inet_diag_req_v2, m1) != \ offsetof(struct inet_diag_req_raw, m2)) BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) != sizeof(struct inet_diag_req_raw)); BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family)); BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol)); BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext)); BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol)); BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states)); BUILD_BUG_ON(__offset_mismatch(id, id)); #undef __offset_mismatch } static int __init raw_diag_init(void) { return inet_diag_register(&raw_diag_handler); } static void __exit raw_diag_exit(void) { inet_diag_unregister(&raw_diag_handler); } module_init(raw_diag_init); module_exit(raw_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("RAW socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
33 32 33 31 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 // SPDX-License-Identifier: GPL-2.0-only /* * HT handling * * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright 2017 Intel Deutschland GmbH * Copyright(c) 2020-2024 Intel Corporation */ #include <linux/ieee80211.h> #include <linux/export.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "rate.h" static void __check_htcap_disable(struct ieee80211_ht_cap *ht_capa, struct ieee80211_ht_cap *ht_capa_mask, struct ieee80211_sta_ht_cap *ht_cap, u16 flag) { __le16 le_flag = cpu_to_le16(flag); if (ht_capa_mask->cap_info & le_flag) { if (!(ht_capa->cap_info & le_flag)) ht_cap->cap &= ~flag; } } static void __check_htcap_enable(struct ieee80211_ht_cap *ht_capa, struct ieee80211_ht_cap *ht_capa_mask, struct ieee80211_sta_ht_cap *ht_cap, u16 flag) { __le16 le_flag = cpu_to_le16(flag); if ((ht_capa_mask->cap_info & le_flag) && (ht_capa->cap_info & le_flag)) ht_cap->cap |= flag; } void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_ht_cap *ht_cap) { struct ieee80211_ht_cap *ht_capa, *ht_capa_mask; u8 *scaps, *smask; int i; if (!ht_cap->ht_supported) return; switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: ht_capa = &sdata->u.mgd.ht_capa; ht_capa_mask = &sdata->u.mgd.ht_capa_mask; break; case NL80211_IFTYPE_ADHOC: ht_capa = &sdata->u.ibss.ht_capa; ht_capa_mask = &sdata->u.ibss.ht_capa_mask; break; default: WARN_ON_ONCE(1); return; } scaps = (u8 *)(&ht_capa->mcs.rx_mask); smask = (u8 *)(&ht_capa_mask->mcs.rx_mask); /* NOTE: If you add more over-rides here, update register_hw * ht_capa_mod_mask logic in main.c as well. * And, if this method can ever change ht_cap.ht_supported, fix * the check in ieee80211_add_ht_ie. */ /* check for HT over-rides, MCS rates first. */ for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) { u8 m = smask[i]; ht_cap->mcs.rx_mask[i] &= ~m; /* turn off all masked bits */ /* Add back rates that are supported */ ht_cap->mcs.rx_mask[i] |= (m & scaps[i]); } /* Force removal of HT-40 capabilities? */ __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_SUP_WIDTH_20_40); __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_SGI_40); /* Allow user to disable SGI-20 (SGI-40 is handled above) */ __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_SGI_20); /* Allow user to disable the max-AMSDU bit. */ __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_MAX_AMSDU); /* Allow user to disable LDPC */ __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_LDPC_CODING); /* Allow user to enable 40 MHz intolerant bit. */ __check_htcap_enable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_40MHZ_INTOLERANT); /* Allow user to enable TX STBC bit */ __check_htcap_enable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_TX_STBC); /* Allow user to configure RX STBC bits */ if (ht_capa_mask->cap_info & cpu_to_le16(IEEE80211_HT_CAP_RX_STBC)) ht_cap->cap |= le16_to_cpu(ht_capa->cap_info) & IEEE80211_HT_CAP_RX_STBC; /* Allow user to decrease AMPDU factor */ if (ht_capa_mask->ampdu_params_info & IEEE80211_HT_AMPDU_PARM_FACTOR) { u8 n = ht_capa->ampdu_params_info & IEEE80211_HT_AMPDU_PARM_FACTOR; if (n < ht_cap->ampdu_factor) ht_cap->ampdu_factor = n; } /* Allow the user to increase AMPDU density. */ if (ht_capa_mask->ampdu_params_info & IEEE80211_HT_AMPDU_PARM_DENSITY) { u8 n = (ht_capa->ampdu_params_info & IEEE80211_HT_AMPDU_PARM_DENSITY) >> IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT; if (n > ht_cap->ampdu_density) ht_cap->ampdu_density = n; } } bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_ht_cap *ht_cap_ie, struct link_sta_info *link_sta) { struct ieee80211_bss_conf *link_conf; struct sta_info *sta = link_sta->sta; struct ieee80211_sta_ht_cap ht_cap, own_cap; u8 ampdu_info, tx_mcs_set_cap; int i, max_tx_streams; bool changed; enum ieee80211_sta_rx_bandwidth bw; enum nl80211_chan_width width; memset(&ht_cap, 0, sizeof(ht_cap)); if (!ht_cap_ie || !sband->ht_cap.ht_supported) goto apply; ht_cap.ht_supported = true; own_cap = sband->ht_cap; /* * If user has specified capability over-rides, take care * of that if the station we're setting up is the AP or TDLS peer that * we advertised a restricted capability set to. Override * our own capabilities and then use those below. */ if (sdata->vif.type == NL80211_IFTYPE_STATION || sdata->vif.type == NL80211_IFTYPE_ADHOC) ieee80211_apply_htcap_overrides(sdata, &own_cap); /* * The bits listed in this expression should be * the same for the peer and us, if the station * advertises more then we can't use those thus * we mask them out. */ ht_cap.cap = le16_to_cpu(ht_cap_ie->cap_info) & (own_cap.cap | ~(IEEE80211_HT_CAP_LDPC_CODING | IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_GRN_FLD | IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_DSSSCCK40)); /* * The STBC bits are asymmetric -- if we don't have * TX then mask out the peer's RX and vice versa. */ if (!(own_cap.cap & IEEE80211_HT_CAP_TX_STBC)) ht_cap.cap &= ~IEEE80211_HT_CAP_RX_STBC; if (!(own_cap.cap & IEEE80211_HT_CAP_RX_STBC)) ht_cap.cap &= ~IEEE80211_HT_CAP_TX_STBC; ampdu_info = ht_cap_ie->ampdu_params_info; ht_cap.ampdu_factor = ampdu_info & IEEE80211_HT_AMPDU_PARM_FACTOR; ht_cap.ampdu_density = (ampdu_info & IEEE80211_HT_AMPDU_PARM_DENSITY) >> 2; /* own MCS TX capabilities */ tx_mcs_set_cap = own_cap.mcs.tx_params; /* Copy peer MCS TX capabilities, the driver might need them. */ ht_cap.mcs.tx_params = ht_cap_ie->mcs.tx_params; /* can we TX with MCS rates? */ if (!(tx_mcs_set_cap & IEEE80211_HT_MCS_TX_DEFINED)) goto apply; /* Counting from 0, therefore +1 */ if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_RX_DIFF) max_tx_streams = ((tx_mcs_set_cap & IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK) >> IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT) + 1; else max_tx_streams = IEEE80211_HT_MCS_TX_MAX_STREAMS; /* * 802.11n-2009 20.3.5 / 20.6 says: * - indices 0 to 7 and 32 are single spatial stream * - 8 to 31 are multiple spatial streams using equal modulation * [8..15 for two streams, 16..23 for three and 24..31 for four] * - remainder are multiple spatial streams using unequal modulation */ for (i = 0; i < max_tx_streams; i++) ht_cap.mcs.rx_mask[i] = own_cap.mcs.rx_mask[i] & ht_cap_ie->mcs.rx_mask[i]; if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION) for (i = IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE; i < IEEE80211_HT_MCS_MASK_LEN; i++) ht_cap.mcs.rx_mask[i] = own_cap.mcs.rx_mask[i] & ht_cap_ie->mcs.rx_mask[i]; /* handle MCS rate 32 too */ if (own_cap.mcs.rx_mask[32/8] & ht_cap_ie->mcs.rx_mask[32/8] & 1) ht_cap.mcs.rx_mask[32/8] |= 1; /* set Rx highest rate */ ht_cap.mcs.rx_highest = ht_cap_ie->mcs.rx_highest; if (ht_cap.cap & IEEE80211_HT_CAP_MAX_AMSDU) link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_7935; else link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_3839; ieee80211_sta_recalc_aggregates(&sta->sta); apply: changed = memcmp(&link_sta->pub->ht_cap, &ht_cap, sizeof(ht_cap)); memcpy(&link_sta->pub->ht_cap, &ht_cap, sizeof(ht_cap)); rcu_read_lock(); link_conf = rcu_dereference(sdata->vif.link_conf[link_sta->link_id]); if (WARN_ON(!link_conf)) width = NL80211_CHAN_WIDTH_20_NOHT; else width = link_conf->chanreq.oper.width; switch (width) { default: WARN_ON_ONCE(1); fallthrough; case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: bw = IEEE80211_STA_RX_BW_20; break; case NL80211_CHAN_WIDTH_40: case NL80211_CHAN_WIDTH_80: case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_160: case NL80211_CHAN_WIDTH_320: bw = ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; break; } rcu_read_unlock(); link_sta->pub->bandwidth = bw; link_sta->cur_max_bandwidth = ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { enum ieee80211_smps_mode smps_mode; switch ((ht_cap.cap & IEEE80211_HT_CAP_SM_PS) >> IEEE80211_HT_CAP_SM_PS_SHIFT) { case WLAN_HT_CAP_SM_PS_INVALID: case WLAN_HT_CAP_SM_PS_STATIC: smps_mode = IEEE80211_SMPS_STATIC; break; case WLAN_HT_CAP_SM_PS_DYNAMIC: smps_mode = IEEE80211_SMPS_DYNAMIC; break; case WLAN_HT_CAP_SM_PS_DISABLED: smps_mode = IEEE80211_SMPS_OFF; break; } if (smps_mode != link_sta->pub->smps_mode) changed = true; link_sta->pub->smps_mode = smps_mode; } else { link_sta->pub->smps_mode = IEEE80211_SMPS_OFF; } return changed; } void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason) { int i; lockdep_assert_wiphy(sta->local->hw.wiphy); for (i = 0; i < IEEE80211_NUM_TIDS; i++) __ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, WLAN_REASON_QSTA_LEAVE_QBSS, reason != AGG_STOP_DESTROY_STA && reason != AGG_STOP_PEER_REQUEST); for (i = 0; i < IEEE80211_NUM_TIDS; i++) __ieee80211_stop_tx_ba_session(sta, i, reason); /* * In case the tear down is part of a reconfigure due to HW restart * request, it is possible that the low level driver requested to stop * the BA session, so handle it to properly clean tid_tx data. */ if(reason == AGG_STOP_DESTROY_STA) { wiphy_work_cancel(sta->local->hw.wiphy, &sta->ampdu_mlme.work); for (i = 0; i < IEEE80211_NUM_TIDS; i++) { struct tid_ampdu_tx *tid_tx = rcu_dereference_protected_tid_tx(sta, i); if (!tid_tx) continue; if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state)) ieee80211_stop_tx_ba_cb(sta, i, tid_tx); } } } void ieee80211_ba_session_work(struct wiphy *wiphy, struct wiphy_work *work) { struct sta_info *sta = container_of(work, struct sta_info, ampdu_mlme.work); struct tid_ampdu_tx *tid_tx; bool blocked; int tid; lockdep_assert_wiphy(sta->local->hw.wiphy); /* When this flag is set, new sessions should be blocked. */ blocked = test_sta_flag(sta, WLAN_STA_BLOCK_BA); for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired)) __ieee80211_stop_rx_ba_session( sta, tid, WLAN_BACK_RECIPIENT, WLAN_REASON_QSTA_TIMEOUT, true); if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_stop_requested)) __ieee80211_stop_rx_ba_session( sta, tid, WLAN_BACK_RECIPIENT, WLAN_REASON_UNSPECIFIED, true); if (!blocked && test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl)) __ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, IEEE80211_MAX_AMPDU_BUF_HT, false, true, 0); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, sta->ampdu_mlme.tid_rx_manage_offl)) __ieee80211_stop_rx_ba_session( sta, tid, WLAN_BACK_RECIPIENT, 0, false); spin_lock_bh(&sta->lock); tid_tx = sta->ampdu_mlme.tid_start_tx[tid]; if (!blocked && tid_tx) { struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); struct ieee80211_sub_if_data *sdata = vif_to_sdata(txqi->txq.vif); struct fq *fq = &sdata->local->fq; spin_lock_bh(&fq->lock); /* Allow only frags to be dequeued */ set_bit(IEEE80211_TXQ_STOP, &txqi->flags); if (!skb_queue_empty(&txqi->frags)) { /* Fragmented Tx is ongoing, wait for it to * finish. Reschedule worker to retry later. */ spin_unlock_bh(&fq->lock); spin_unlock_bh(&sta->lock); /* Give the task working on the txq a chance * to send out the queued frags */ synchronize_net(); wiphy_work_queue(sdata->local->hw.wiphy, work); return; } spin_unlock_bh(&fq->lock); /* * Assign it over to the normal tid_tx array * where it "goes live". */ sta->ampdu_mlme.tid_start_tx[tid] = NULL; /* could there be a race? */ if (sta->ampdu_mlme.tid_tx[tid]) kfree(tid_tx); else ieee80211_assign_tid_tx(sta, tid, tid_tx); spin_unlock_bh(&sta->lock); ieee80211_tx_ba_session_handle_start(sta, tid); continue; } spin_unlock_bh(&sta->lock); tid_tx = rcu_dereference_protected_tid_tx(sta, tid); if (!tid_tx) continue; if (!blocked && test_and_clear_bit(HT_AGG_STATE_START_CB, &tid_tx->state)) ieee80211_start_tx_ba_cb(sta, tid, tid_tx); if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state)) __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_LOCAL_REQUEST); if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state)) ieee80211_stop_tx_ba_cb(sta, tid, tid_tx); } } void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, u16 initiator, u16 reason_code) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u16 params; skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = ieee80211_mgmt_ba(skb, da, sdata); skb_put(skb, 1 + sizeof(mgmt->u.action.u.delba)); mgmt->u.action.category = WLAN_CATEGORY_BACK; mgmt->u.action.u.delba.action_code = WLAN_ACTION_DELBA; params = (u16)(initiator << 11); /* bit 11 initiator */ params |= (u16)(tid << 12); /* bit 15:12 TID number */ mgmt->u.action.u.delba.params = cpu_to_le16(params); mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code); ieee80211_tx_skb(sdata, skb); } void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len) { u16 tid, params; u16 initiator; params = le16_to_cpu(mgmt->u.action.u.delba.params); tid = (params & IEEE80211_DELBA_PARAM_TID_MASK) >> 12; initiator = (params & IEEE80211_DELBA_PARAM_INITIATOR_MASK) >> 11; ht_dbg_ratelimited(sdata, "delba from %pM (%s) tid %d reason code %d\n", mgmt->sa, initiator ? "initiator" : "recipient", tid, le16_to_cpu(mgmt->u.action.u.delba.reason_code)); if (initiator == WLAN_BACK_INITIATOR) __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0, true); else __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_PEER_REQUEST); } enum nl80211_smps_mode ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps) { switch (smps) { case IEEE80211_SMPS_OFF: return NL80211_SMPS_OFF; case IEEE80211_SMPS_STATIC: return NL80211_SMPS_STATIC; case IEEE80211_SMPS_DYNAMIC: return NL80211_SMPS_DYNAMIC; default: return NL80211_SMPS_OFF; } } int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps, const u8 *da, const u8 *bssid, int link_id) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *action_frame; struct ieee80211_tx_info *info; u8 status_link_id = link_id < 0 ? 0 : link_id; /* 27 = header + category + action + smps mode */ skb = dev_alloc_skb(27 + local->hw.extra_tx_headroom); if (!skb) return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom); action_frame = skb_put(skb, 27); memcpy(action_frame->da, da, ETH_ALEN); memcpy(action_frame->sa, sdata->dev->dev_addr, ETH_ALEN); memcpy(action_frame->bssid, bssid, ETH_ALEN); action_frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); action_frame->u.action.category = WLAN_CATEGORY_HT; action_frame->u.action.u.ht_smps.action = WLAN_HT_ACTION_SMPS; switch (smps) { case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); smps = IEEE80211_SMPS_OFF; fallthrough; case IEEE80211_SMPS_OFF: action_frame->u.action.u.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DISABLED; break; case IEEE80211_SMPS_STATIC: action_frame->u.action.u.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_STATIC; break; case IEEE80211_SMPS_DYNAMIC: action_frame->u.action.u.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DYNAMIC; break; } /* we'll do more on status of this frame */ info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; /* we have 13 bits, and need 6: link_id 4, smps 2 */ info->status_data = IEEE80211_STATUS_TYPE_SMPS | u16_encode_bits(status_link_id << 2 | smps, IEEE80211_STATUS_SUBDATA_MASK); ieee80211_tx_skb_tid(sdata, skb, 7, link_id); return 0; } void ieee80211_request_smps(struct ieee80211_vif *vif, unsigned int link_id, enum ieee80211_smps_mode smps_mode) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link; if (WARN_ON_ONCE(vif->type != NL80211_IFTYPE_STATION)) return; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) goto out; trace_api_request_smps(sdata->local, sdata, link, smps_mode); if (link->u.mgd.driver_smps_mode == smps_mode) goto out; link->u.mgd.driver_smps_mode = smps_mode; wiphy_work_queue(sdata->local->hw.wiphy, &link->u.mgd.request_smps_work); out: rcu_read_unlock(); } /* this might change ... don't want non-open drivers using it */ EXPORT_SYMBOL_GPL(ieee80211_request_smps);
65 65 65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> * (C) 2011 Intra2net AG <https://www.intra2net.com> */ #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/errno.h> #include <net/netlink.h> #include <net/sock.h> #include <net/netns/generic.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_acct.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure"); struct nf_acct { atomic64_t pkts; atomic64_t bytes; unsigned long flags; struct list_head head; refcount_t refcnt; char name[NFACCT_NAME_MAX]; struct rcu_head rcu_head; char data[]; }; struct nfacct_filter { u32 value; u32 mask; }; struct nfnl_acct_net { struct list_head nfnl_acct_list; }; static unsigned int nfnl_acct_net_id __read_mostly; static inline struct nfnl_acct_net *nfnl_acct_pernet(struct net *net) { return net_generic(net, nfnl_acct_net_id); } #define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) #define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */ static int nfnl_acct_new(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); struct nf_acct *nfacct, *matching = NULL; unsigned int size = 0; char *acct_name; u32 flags = 0; if (!tb[NFACCT_NAME]) return -EINVAL; acct_name = nla_data(tb[NFACCT_NAME]); if (strlen(acct_name) == 0) return -EINVAL; list_for_each_entry(nfacct, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) continue; if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; matching = nfacct; break; } if (matching) { if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { /* reset counters if you request a replacement. */ atomic64_set(&matching->pkts, 0); atomic64_set(&matching->bytes, 0); smp_mb__before_atomic(); /* reset overquota flag if quota is enabled. */ if ((matching->flags & NFACCT_F_QUOTA)) clear_bit(NFACCT_OVERQUOTA_BIT, &matching->flags); return 0; } return -EBUSY; } if (tb[NFACCT_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFACCT_FLAGS])); if (flags & ~NFACCT_F_QUOTA) return -EOPNOTSUPP; if ((flags & NFACCT_F_QUOTA) == NFACCT_F_QUOTA) return -EINVAL; if (flags & NFACCT_F_OVERQUOTA) return -EINVAL; if ((flags & NFACCT_F_QUOTA) && !tb[NFACCT_QUOTA]) return -EINVAL; size += sizeof(u64); } nfacct = kzalloc(sizeof(struct nf_acct) + size, GFP_KERNEL); if (nfacct == NULL) return -ENOMEM; if (flags & NFACCT_F_QUOTA) { u64 *quota = (u64 *)nfacct->data; *quota = be64_to_cpu(nla_get_be64(tb[NFACCT_QUOTA])); nfacct->flags = flags; } nla_strscpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX); if (tb[NFACCT_BYTES]) { atomic64_set(&nfacct->bytes, be64_to_cpu(nla_get_be64(tb[NFACCT_BYTES]))); } if (tb[NFACCT_PKTS]) { atomic64_set(&nfacct->pkts, be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS]))); } refcount_set(&nfacct->refcnt, 1); list_add_tail_rcu(&nfacct->head, &nfnl_acct_net->nfnl_acct_list); return 0; } static int nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct nf_acct *acct) { struct nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0; u64 pkts, bytes; u32 old_flags; event = nfnl_msg_type(NFNL_SUBSYS_ACCT, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_string(skb, NFACCT_NAME, acct->name)) goto nla_put_failure; old_flags = acct->flags; if (type == NFNL_MSG_ACCT_GET_CTRZERO) { pkts = atomic64_xchg(&acct->pkts, 0); bytes = atomic64_xchg(&acct->bytes, 0); smp_mb__before_atomic(); if (acct->flags & NFACCT_F_QUOTA) clear_bit(NFACCT_OVERQUOTA_BIT, &acct->flags); } else { pkts = atomic64_read(&acct->pkts); bytes = atomic64_read(&acct->bytes); } if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts), NFACCT_PAD) || nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes), NFACCT_PAD) || nla_put_be32(skb, NFACCT_USE, htonl(refcount_read(&acct->refcnt)))) goto nla_put_failure; if (acct->flags & NFACCT_F_QUOTA) { u64 *quota = (u64 *)acct->data; if (nla_put_be32(skb, NFACCT_FLAGS, htonl(old_flags)) || nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota), NFACCT_PAD)) goto nla_put_failure; } nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *last; const struct nfacct_filter *filter = cb->data; if (cb->args[2]) return 0; last = (struct nf_acct *)cb->args[1]; if (cb->args[1]) cb->args[1] = 0; rcu_read_lock(); list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (last) { if (cur != last) continue; last = NULL; } if (filter && (cur->flags & filter->mask) != filter->value) continue; if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), NFNL_MSG_ACCT_NEW, cur) < 0) { cb->args[1] = (unsigned long)cur; break; } } if (!cb->args[1]) cb->args[2] = 1; rcu_read_unlock(); return skb->len; } static int nfnl_acct_done(struct netlink_callback *cb) { kfree(cb->data); return 0; } static const struct nla_policy filter_policy[NFACCT_FILTER_MAX + 1] = { [NFACCT_FILTER_MASK] = { .type = NLA_U32 }, [NFACCT_FILTER_VALUE] = { .type = NLA_U32 }, }; static int nfnl_acct_start(struct netlink_callback *cb) { const struct nlattr *const attr = cb->data; struct nlattr *tb[NFACCT_FILTER_MAX + 1]; struct nfacct_filter *filter; int err; if (!attr) return 0; err = nla_parse_nested_deprecated(tb, NFACCT_FILTER_MAX, attr, filter_policy, NULL); if (err < 0) return err; if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE]) return -EINVAL; filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL); if (!filter) return -ENOMEM; filter->mask = ntohl(nla_get_be32(tb[NFACCT_FILTER_MASK])); filter->value = ntohl(nla_get_be32(tb[NFACCT_FILTER_VALUE])); cb->data = filter; return 0; } static int nfnl_acct_get(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); int ret = -ENOENT; struct nf_acct *cur; char *acct_name; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_acct_dump, .start = nfnl_acct_start, .done = nfnl_acct_done, .data = (void *)tb[NFACCT_FILTER], }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (!tb[NFACCT_NAME]) return -EINVAL; acct_name = nla_data(tb[NFACCT_NAME]); list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) continue; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { ret = -ENOMEM; break; } ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_ACCT_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); break; } return ret; } /* try to delete object, fail if it is still in use. */ static int nfnl_acct_try_del(struct nf_acct *cur) { int ret = 0; /* We want to avoid races with nfnl_acct_put. So only when the current * refcnt is 1, we decrease it to 0. */ if (refcount_dec_if_one(&cur->refcnt)) { /* We are protected by nfnl mutex. */ list_del_rcu(&cur->head); kfree_rcu(cur, rcu_head); } else { ret = -EBUSY; } return ret; } static int nfnl_acct_del(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); struct nf_acct *cur, *tmp; int ret = -ENOENT; char *acct_name; if (!tb[NFACCT_NAME]) { list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head) nfnl_acct_try_del(cur); return 0; } acct_name = nla_data(tb[NFACCT_NAME]); list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0) continue; ret = nfnl_acct_try_del(cur); if (ret < 0) return ret; break; } return ret; } static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { [NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 }, [NFACCT_BYTES] = { .type = NLA_U64 }, [NFACCT_PKTS] = { .type = NLA_U64 }, [NFACCT_FLAGS] = { .type = NLA_U32 }, [NFACCT_QUOTA] = { .type = NLA_U64 }, [NFACCT_FILTER] = {.type = NLA_NESTED }, }; static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new, .type = NFNL_CB_MUTEX, .attr_count = NFACCT_MAX, .policy = nfnl_acct_policy }, [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get, .type = NFNL_CB_MUTEX, .attr_count = NFACCT_MAX, .policy = nfnl_acct_policy }, [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get, .type = NFNL_CB_MUTEX, .attr_count = NFACCT_MAX, .policy = nfnl_acct_policy }, [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del, .type = NFNL_CB_MUTEX, .attr_count = NFACCT_MAX, .policy = nfnl_acct_policy }, }; static const struct nfnetlink_subsystem nfnl_acct_subsys = { .name = "acct", .subsys_id = NFNL_SUBSYS_ACCT, .cb_count = NFNL_MSG_ACCT_MAX, .cb = nfnl_acct_cb, }; MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT); struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name) { struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *acct = NULL; rcu_read_lock(); list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) continue; if (!try_module_get(THIS_MODULE)) goto err; if (!refcount_inc_not_zero(&cur->refcnt)) { module_put(THIS_MODULE); goto err; } acct = cur; break; } err: rcu_read_unlock(); return acct; } EXPORT_SYMBOL_GPL(nfnl_acct_find_get); void nfnl_acct_put(struct nf_acct *acct) { if (refcount_dec_and_test(&acct->refcnt)) kfree_rcu(acct, rcu_head); module_put(THIS_MODULE); } EXPORT_SYMBOL_GPL(nfnl_acct_put); void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct) { atomic64_inc(&nfacct->pkts); atomic64_add(skb->len, &nfacct->bytes); } EXPORT_SYMBOL_GPL(nfnl_acct_update); static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct) { int ret; struct sk_buff *skb; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (skb == NULL) return; ret = nfnl_acct_fill_info(skb, 0, 0, NFNL_MSG_ACCT_OVERQUOTA, 0, nfacct); if (ret <= 0) { kfree_skb(skb); return; } nfnetlink_broadcast(net, skb, 0, NFNLGRP_ACCT_QUOTA, GFP_ATOMIC); } int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct) { u64 now; u64 *quota; int ret = NFACCT_UNDERQUOTA; /* no place here if we don't have a quota */ if (!(nfacct->flags & NFACCT_F_QUOTA)) return NFACCT_NO_QUOTA; quota = (u64 *)nfacct->data; now = (nfacct->flags & NFACCT_F_QUOTA_PKTS) ? atomic64_read(&nfacct->pkts) : atomic64_read(&nfacct->bytes); ret = now > *quota; if (now >= *quota && !test_and_set_bit(NFACCT_OVERQUOTA_BIT, &nfacct->flags)) { nfnl_overquota_report(net, nfacct); } return ret; } EXPORT_SYMBOL_GPL(nfnl_acct_overquota); static int __net_init nfnl_acct_net_init(struct net *net) { INIT_LIST_HEAD(&nfnl_acct_pernet(net)->nfnl_acct_list); return 0; } static void __net_exit nfnl_acct_net_exit(struct net *net) { struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *tmp; list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head) { list_del_rcu(&cur->head); if (refcount_dec_and_test(&cur->refcnt)) kfree_rcu(cur, rcu_head); } } static struct pernet_operations nfnl_acct_ops = { .init = nfnl_acct_net_init, .exit = nfnl_acct_net_exit, .id = &nfnl_acct_net_id, .size = sizeof(struct nfnl_acct_net), }; static int __init nfnl_acct_init(void) { int ret; ret = register_pernet_subsys(&nfnl_acct_ops); if (ret < 0) { pr_err("nfnl_acct_init: failed to register pernet ops\n"); goto err_out; } ret = nfnetlink_subsys_register(&nfnl_acct_subsys); if (ret < 0) { pr_err("nfnl_acct_init: cannot register with nfnetlink.\n"); goto cleanup_pernet; } return 0; cleanup_pernet: unregister_pernet_subsys(&nfnl_acct_ops); err_out: return ret; } static void __exit nfnl_acct_exit(void) { nfnetlink_subsys_unregister(&nfnl_acct_subsys); unregister_pernet_subsys(&nfnl_acct_ops); } module_init(nfnl_acct_init); module_exit(nfnl_acct_exit);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 // SPDX-License-Identifier: GPL-2.0-or-later /* * sctp_offload - GRO/GSO Offloading for SCTP * * Copyright (C) 2015, Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/kprobes.h> #include <linux/socket.h> #include <linux/sctp.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/module.h> #include <linux/kfifo.h> #include <linux/time.h> #include <net/net_namespace.h> #include <linux/skbuff.h> #include <net/sctp/sctp.h> #include <net/sctp/checksum.h> #include <net/protocol.h> #include <net/gso.h> static __le32 sctp_gso_make_checksum(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; skb->csum_not_inet = 0; /* csum and csum_start in GSO CB may be needed to do the UDP * checksum when it's a UDP tunneling packet. */ SKB_GSO_CB(skb)->csum = (__force __wsum)~0; SKB_GSO_CB(skb)->csum_start = skb_headroom(skb) + skb->len; return sctp_compute_cksum(skb, skb_transport_offset(skb)); } static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct sctphdr *sh; if (!skb_is_gso_sctp(skb)) goto out; sh = sctp_hdr(skb); if (!pskb_may_pull(skb, sizeof(*sh))) goto out; __skb_pull(skb, sizeof(*sh)); if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ struct skb_shared_info *pinfo = skb_shinfo(skb); struct sk_buff *frag_iter; pinfo->gso_segs = 0; if (skb->len != skb->data_len) { /* Means we have chunks in here too */ pinfo->gso_segs++; } skb_walk_frags(skb, frag_iter) pinfo->gso_segs++; segs = NULL; goto out; } segs = skb_segment(skb, (features | NETIF_F_HW_CSUM) & ~NETIF_F_SG); if (IS_ERR(segs)) goto out; /* All that is left is update SCTP CRC if necessary */ if (!(features & NETIF_F_SCTP_CRC)) { for (skb = segs; skb; skb = skb->next) { if (skb->ip_summed == CHECKSUM_PARTIAL) { sh = sctp_hdr(skb); sh->checksum = sctp_gso_make_checksum(skb); } } } out: return segs; } static const struct net_offload sctp_offload = { .callbacks = { .gso_segment = sctp_gso_segment, }, }; static const struct net_offload sctp6_offload = { .callbacks = { .gso_segment = sctp_gso_segment, }, }; int __init sctp_offload_init(void) { int ret; ret = inet_add_offload(&sctp_offload, IPPROTO_SCTP); if (ret) goto out; ret = inet6_add_offload(&sctp6_offload, IPPROTO_SCTP); if (ret) goto ipv4; crc32c_csum_stub = &sctp_csum_ops; return ret; ipv4: inet_del_offload(&sctp_offload, IPPROTO_SCTP); out: return ret; }
8057 8057 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_COMPAT_H #define _ASM_X86_COMPAT_H /* * Architecture specific compatibility types */ #include <linux/types.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <asm/processor.h> #include <asm/user32.h> #include <asm/unistd.h> #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; #define __compat_uid_t __compat_uid_t typedef u16 __compat_uid_t; typedef u16 __compat_gid_t; #define compat_dev_t compat_dev_t typedef u16 compat_dev_t; #define compat_ipc_pid_t compat_ipc_pid_t typedef u16 compat_ipc_pid_t; #define compat_statfs compat_statfs #include <asm-generic/compat.h> #define COMPAT_UTS_MACHINE "i686\0\0" typedef u16 compat_nlink_t; struct compat_stat { u32 st_dev; compat_ino_t st_ino; compat_mode_t st_mode; compat_nlink_t st_nlink; __compat_uid_t st_uid; __compat_gid_t st_gid; u32 st_rdev; u32 st_size; u32 st_blksize; u32 st_blocks; u32 st_atime; u32 st_atime_nsec; u32 st_mtime; u32 st_mtime_nsec; u32 st_ctime; u32 st_ctime_nsec; u32 __unused4; u32 __unused5; }; /* * IA32 uses 4 byte alignment for 64 bit quantities, so we need to pack the * compat flock64 structure. */ #define __ARCH_NEED_COMPAT_FLOCK64_PACKED struct compat_statfs { int f_type; int f_bsize; int f_blocks; int f_bfree; int f_bavail; int f_files; int f_ffree; compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. */ int f_frsize; int f_flags; int f_spare[4]; }; #ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) #endif static inline bool in_x32_syscall(void) { #ifdef CONFIG_X86_X32_ABI if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) return true; #endif return false; } static inline bool in_32bit_syscall(void) { return in_ia32_syscall() || in_x32_syscall(); } #ifdef CONFIG_COMPAT static inline bool in_compat_syscall(void) { return in_32bit_syscall(); } #define in_compat_syscall in_compat_syscall /* override the generic impl */ #define compat_need_64bit_alignment_fixup in_ia32_syscall #endif struct compat_siginfo; #ifdef CONFIG_X86_X32_ABI int copy_siginfo_to_user32(struct compat_siginfo __user *to, const kernel_siginfo_t *from); #define copy_siginfo_to_user32 copy_siginfo_to_user32 #endif /* CONFIG_X86_X32_ABI */ #endif /* _ASM_X86_COMPAT_H */
106 106 14 25 25 25 21 4 25 105 106 106 85 85 40 8 85 85 49 49 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook */ #include "percpu_freelist.h" int pcpu_freelist_init(struct pcpu_freelist *s) { int cpu; s->freelist = alloc_percpu(struct pcpu_freelist_head); if (!s->freelist) return -ENOMEM; for_each_possible_cpu(cpu) { struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); raw_spin_lock_init(&head->lock); head->first = NULL; } raw_spin_lock_init(&s->extralist.lock); s->extralist.first = NULL; return 0; } void pcpu_freelist_destroy(struct pcpu_freelist *s) { free_percpu(s->freelist); } static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { node->next = head->first; WRITE_ONCE(head->first, node); } static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { raw_spin_lock(&head->lock); pcpu_freelist_push_node(head, node); raw_spin_unlock(&head->lock); } static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { if (!raw_spin_trylock(&s->extralist.lock)) return false; pcpu_freelist_push_node(&s->extralist, node); raw_spin_unlock(&s->extralist.lock); return true; } static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { int cpu, orig_cpu; orig_cpu = raw_smp_processor_id(); while (1) { for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { struct pcpu_freelist_head *head; head = per_cpu_ptr(s->freelist, cpu); if (raw_spin_trylock(&head->lock)) { pcpu_freelist_push_node(head, node); raw_spin_unlock(&head->lock); return; } } /* cannot lock any per cpu lock, try extralist */ if (pcpu_freelist_try_push_extra(s, node)) return; } } void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { if (in_nmi()) ___pcpu_freelist_push_nmi(s, node); else ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); } void pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { unsigned long flags; local_irq_save(flags); __pcpu_freelist_push(s, node); local_irq_restore(flags); } void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems) { struct pcpu_freelist_head *head; unsigned int cpu, cpu_idx, i, j, n, m; n = nr_elems / num_possible_cpus(); m = nr_elems % num_possible_cpus(); cpu_idx = 0; for_each_possible_cpu(cpu) { head = per_cpu_ptr(s->freelist, cpu); j = n + (cpu_idx < m ? 1 : 0); for (i = 0; i < j; i++) { /* No locking required as this is not visible yet. */ pcpu_freelist_push_node(head, buf); buf += elem_size; } cpu_idx++; } } static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; int cpu; for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) continue; raw_spin_lock(&head->lock); node = head->first; if (node) { WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); } /* per cpu lists are all empty, try extralist */ if (!READ_ONCE(s->extralist.first)) return NULL; raw_spin_lock(&s->extralist.lock); node = s->extralist.first; if (node) WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } static struct pcpu_freelist_node * ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; int cpu; for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) continue; if (raw_spin_trylock(&head->lock)) { node = head->first; if (node) { WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); } } /* cannot pop from per cpu lists, try extralist */ if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) return NULL; node = s->extralist.first; if (node) WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { if (in_nmi()) return ___pcpu_freelist_pop_nmi(s); return ___pcpu_freelist_pop(s); } struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_node *ret; unsigned long flags; local_irq_save(flags); ret = __pcpu_freelist_pop(s); local_irq_restore(flags); return ret; }
626 627 629 627 629 629 130 625 627 627 225 70 125 125 125 100 7 52 130 16 125 130 130 130 70 70 70 70 628 617 629 629 130 629 629 629 629 629 629 629 629 629 115 590 629 627 629 629 629 629 629 628 629 151 627 627 627 627 629 151 609 70 130 629 626 151 629 626 151 628 129 629 130 1 629 628 629 629 629 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 // SPDX-License-Identifier: GPL-2.0 /* * Generic ring buffer * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ #include <linux/trace_recursion.h> #include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> #include <linux/sched/clock.h> #include <linux/cacheflush.h> #include <linux/trace_seq.h> #include <linux/spinlock.h> #include <linux/irq_work.h> #include <linux/security.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kthread.h> /* for self test */ #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> #include <linux/delay.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/list.h> #include <linux/cpu.h> #include <linux/oom.h> #include <linux/mm.h> #include <asm/local64.h> #include <asm/local.h> #include "trace.h" /* * The "absolute" timestamp in the buffer is only 59 bits. * If a clock has the 5 MSBs set, it needs to be saved and * reinserted. */ #define TS_MSB (0xf8ULL << 56) #define ABS_TS_MASK (~TS_MSB) static void update_pages_handler(struct work_struct *work); #define RING_BUFFER_META_MAGIC 0xBADFEED struct ring_buffer_meta { int magic; int struct_size; unsigned long text_addr; unsigned long data_addr; unsigned long first_buffer; unsigned long head_buffer; unsigned long commit_buffer; __u32 subbuf_size; __u32 nr_subbufs; int buffers[]; }; /* * The ring buffer header is special. We must manually up keep it. */ int ring_buffer_print_entry_header(struct trace_seq *s) { trace_seq_puts(s, "# compressed entry header\n"); trace_seq_puts(s, "\ttype_len : 5 bits\n"); trace_seq_puts(s, "\ttime_delta : 27 bits\n"); trace_seq_puts(s, "\tarray : 32 bits\n"); trace_seq_putc(s, '\n'); trace_seq_printf(s, "\tpadding : type == %d\n", RINGBUF_TYPE_PADDING); trace_seq_printf(s, "\ttime_extend : type == %d\n", RINGBUF_TYPE_TIME_EXTEND); trace_seq_printf(s, "\ttime_stamp : type == %d\n", RINGBUF_TYPE_TIME_STAMP); trace_seq_printf(s, "\tdata max type_len == %d\n", RINGBUF_TYPE_DATA_TYPE_LEN_MAX); return !trace_seq_has_overflowed(s); } /* * The ring buffer is made up of a list of pages. A separate list of pages is * allocated for each CPU. A writer may only write to a buffer that is * associated with the CPU it is currently executing on. A reader may read * from any per cpu buffer. * * The reader is special. For each per cpu buffer, the reader has its own * reader page. When a reader has read the entire reader page, this reader * page is swapped with another page in the ring buffer. * * Now, as long as the writer is off the reader page, the reader can do what * ever it wants with that page. The writer will never write to that page * again (as long as it is out of the ring buffer). * * Here's some silly ASCII art. * * +------+ * |reader| RING BUFFER * |page | * +------+ +---+ +---+ +---+ * | |-->| |-->| | * +---+ +---+ +---+ * ^ | * | | * +---------------+ * * * +------+ * |reader| RING BUFFER * |page |------------------v * +------+ +---+ +---+ +---+ * | |-->| |-->| | * +---+ +---+ +---+ * ^ | * | | * +---------------+ * * * +------+ * |reader| RING BUFFER * |page |------------------v * +------+ +---+ +---+ +---+ * ^ | |-->| |-->| | * | +---+ +---+ +---+ * | | * | | * +------------------------------+ * * * +------+ * |buffer| RING BUFFER * |page |------------------v * +------+ +---+ +---+ +---+ * ^ | | | |-->| | * | New +---+ +---+ +---+ * | Reader------^ | * | page | * +------------------------------+ * * * After we make this swap, the reader can hand this page off to the splice * code and be done with it. It can even allocate a new page if it needs to * and swap that into the ring buffer. * * We will be using cmpxchg soon to make all this lockless. * */ /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) #define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ #ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS # define RB_FORCE_8BYTE_ALIGNMENT 0 # define RB_ARCH_ALIGNMENT RB_ALIGNMENT #else # define RB_FORCE_8BYTE_ALIGNMENT 1 # define RB_ARCH_ALIGNMENT 8U #endif #define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX enum { RB_LEN_TIME_EXTEND = 8, RB_LEN_TIME_STAMP = 8, }; #define skip_time_extend(event) \ ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) #define extended_time(event) \ (event->type_len >= RINGBUF_TYPE_TIME_EXTEND) static inline bool rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { /* padding has a NULL time_delta */ event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } static unsigned rb_event_data_length(struct ring_buffer_event *event) { unsigned length; if (event->type_len) length = event->type_len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; } /* * Return the length of the given event. Will return * the length of the time extend if the event is a * time extend. */ static inline unsigned rb_event_length(struct ring_buffer_event *event) { switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) /* undefined */ return -1; return event->array[0] + RB_EVNT_HDR_SIZE; case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; case RINGBUF_TYPE_TIME_STAMP: return RB_LEN_TIME_STAMP; case RINGBUF_TYPE_DATA: return rb_event_data_length(event); default: WARN_ON_ONCE(1); } /* not hit */ return 0; } /* * Return total length of time extend and data, * or just the event length for all other events. */ static inline unsigned rb_event_ts_length(struct ring_buffer_event *event) { unsigned len = 0; if (extended_time(event)) { /* time extends include the data event after it */ len = RB_LEN_TIME_EXTEND; event = skip_time_extend(event); } return len + rb_event_length(event); } /** * ring_buffer_event_length - return the length of the event * @event: the event to get the length of * * Returns the size of the data load of a data event. * If the event is something other than a data event, it * returns the size of the event itself. With the exception * of a TIME EXTEND, where it still returns the size of the * data load of the data event after it. */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length; if (extended_time(event)) event = skip_time_extend(event); length = rb_event_length(event); if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) length -= sizeof(event->array[0]); return length; } EXPORT_SYMBOL_GPL(ring_buffer_event_length); /* inline for ring buffer fast paths */ static __always_inline void * rb_event_data(struct ring_buffer_event *event) { if (extended_time(event)) event = skip_time_extend(event); WARN_ON_ONCE(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ if (event->type_len) return (void *)&event->array[0]; /* Otherwise length is in array[0] and array[1] has the data */ return (void *)&event->array[1]; } /** * ring_buffer_event_data - return the data of the event * @event: the event to get the data from */ void *ring_buffer_event_data(struct ring_buffer_event *event) { return rb_event_data(event); } EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_buffer_cpu(buffer, cpu) \ for_each_cpu(cpu, buffer->cpumask) #define for_each_online_buffer_cpu(buffer, cpu) \ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) #define TS_SHIFT 27 #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; ts = event->array[0]; ts <<= TS_SHIFT; ts += event->time_delta; return ts; } /* Flag when events were overwritten */ #define RB_MISSED_EVENTS (1 << 31) /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) #define RB_MISSED_MASK (3 << 30) struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ }; struct buffer_data_read_page { unsigned order; /* order of the page */ struct buffer_data_page *data; /* actual data, stored in this page */ }; /* * Note, the buffer_page list must be first. The buffer pages * are allocated in cache lines, which means that each buffer * page will be at the beginning of a cache line, and thus * the least significant bits will be zero. We use this to * add flags in the list struct pointers, to make the ring buffer * lockless. */ struct buffer_page { struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ unsigned read; /* index for next read */ local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ unsigned order; /* order of the page */ u32 id:30; /* ID for external mapping */ u32 range:1; /* Mapped via a range */ struct buffer_data_page *page; /* Actual data page */ }; /* * The buffer page counters, write and entries, must be reset * atomically when crossing page boundaries. To synchronize this * update, two counters are inserted into the number. One is * the actual counter for the write position or count on the page. * * The other is a counter of updaters. Before an update happens * the update partition of the counter is incremented. This will * allow the updater to update the counter atomically. * * The counter is 20 bits, and the state data is 12. */ #define RB_WRITE_MASK 0xfffff #define RB_WRITE_INTCNT (1 << 20) static void rb_init_page(struct buffer_data_page *bpage) { local_set(&bpage->commit, 0); } static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) { return local_read(&bpage->page->commit); } static void free_buffer_page(struct buffer_page *bpage) { /* Range pages are not to be freed */ if (!bpage->range) free_pages((unsigned long)bpage->page, bpage->order); kfree(bpage); } /* * We need to fit the time_stamp delta into 27 bits. */ static inline bool test_time_stamp(u64 delta) { return !!(delta & TS_DELTA_TEST); } struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; atomic_t seq; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; }; /* * Structure to hold event state and handle nested events. */ struct rb_event_info { u64 ts; u64 delta; u64 before; u64 after; unsigned long length; struct buffer_page *tail_page; int add_timestamp; }; /* * Used for the add_timestamp * NONE * EXTEND - wants a time extend * ABSOLUTE - the buffer requests all events to have absolute time stamps * FORCE - force a full time stamp. */ enum { RB_ADD_STAMP_NONE = 0, RB_ADD_STAMP_EXTEND = BIT(1), RB_ADD_STAMP_ABSOLUTE = BIT(2), RB_ADD_STAMP_FORCE = BIT(3) }; /* * Used for which event context the event is in. * TRANSITION = 0 * NMI = 1 * IRQ = 2 * SOFTIRQ = 3 * NORMAL = 4 * * See trace_recursive_lock() comment below for more details. */ enum { RB_CTX_TRANSITION, RB_CTX_NMI, RB_CTX_IRQ, RB_CTX_SOFTIRQ, RB_CTX_NORMAL, RB_CTX_MAX }; struct rb_time_struct { local64_t time; }; typedef struct rb_time_struct rb_time_t; #define MAX_NEST 5 /* * head_page == tail_page && head == tail then buffer is empty. */ struct ring_buffer_per_cpu { int cpu; atomic_t record_disabled; atomic_t resize_disabled; struct trace_buffer *buffer; raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; struct buffer_data_page *free_page; unsigned long nr_pages; unsigned int current_context; struct list_head *pages; /* pages generation counter, incremented when the list changes */ unsigned long cnt; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; unsigned long lost_events; unsigned long last_overrun; unsigned long nest; local_t entries_bytes; local_t entries; local_t overrun; local_t commit_overrun; local_t dropped_events; local_t committing; local_t commits; local_t pages_touched; local_t pages_lost; local_t pages_read; long last_pages_touch; size_t shortest_full; unsigned long read; unsigned long read_bytes; rb_time_t write_stamp; rb_time_t before_stamp; u64 event_stamp[MAX_NEST]; u64 read_stamp; /* pages removed since last reset */ unsigned long pages_removed; unsigned int mapped; unsigned int user_mapped; /* user space mapping */ struct mutex mapping_lock; unsigned long *subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; struct ring_buffer_meta *ring_meta; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ struct work_struct update_pages_work; struct completion update_done; struct rb_irq_work irq_work; }; struct trace_buffer { unsigned flags; int cpus; atomic_t record_disabled; atomic_t resizing; cpumask_var_t cpumask; struct lock_class_key *reader_lock_key; struct mutex mutex; struct ring_buffer_per_cpu **buffers; struct hlist_node node; u64 (*clock)(void); struct rb_irq_work irq_work; bool time_stamp_abs; unsigned long range_addr_start; unsigned long range_addr_end; long last_text_delta; long last_data_delta; unsigned int subbuf_size; unsigned int subbuf_order; unsigned int max_data_size; }; struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; unsigned long next_event; struct buffer_page *head_page; struct buffer_page *cache_reader_page; unsigned long cache_read; unsigned long cache_pages_removed; u64 read_stamp; u64 page_stamp; struct ring_buffer_event *event; size_t event_size; int missed_events; }; int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s) { struct buffer_data_page field; trace_seq_printf(s, "\tfield: u64 timestamp;\t" "offset:0;\tsize:%u;\tsigned:%u;\n", (unsigned int)sizeof(field.time_stamp), (unsigned int)is_signed_type(u64)); trace_seq_printf(s, "\tfield: local_t commit;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), (unsigned int)sizeof(field.commit), (unsigned int)is_signed_type(long)); trace_seq_printf(s, "\tfield: int overwrite;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), 1, (unsigned int)is_signed_type(long)); trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), (unsigned int)buffer->subbuf_size, (unsigned int)is_signed_type(char)); return !trace_seq_has_overflowed(s); } static inline void rb_time_read(rb_time_t *t, u64 *ret) { *ret = local64_read(&t->time); } static void rb_time_set(rb_time_t *t, u64 val) { local64_set(&t->time, val); } /* * Enable this to make sure that the event passed to * ring_buffer_event_time_stamp() is not committed and also * is on the buffer that it passed in. */ //#define RB_VERIFY_EVENT #ifdef RB_VERIFY_EVENT static struct list_head *rb_list_head(struct list_head *list); static void verify_event(struct ring_buffer_per_cpu *cpu_buffer, void *event) { struct buffer_page *page = cpu_buffer->commit_page; struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page); struct list_head *next; long commit, write; unsigned long addr = (unsigned long)event; bool done = false; int stop = 0; /* Make sure the event exists and is not committed yet */ do { if (page == tail_page || WARN_ON_ONCE(stop++ > 100)) done = true; commit = local_read(&page->page->commit); write = local_read(&page->write); if (addr >= (unsigned long)&page->page->data[commit] && addr < (unsigned long)&page->page->data[write]) return; next = rb_list_head(page->list.next); page = list_entry(next, struct buffer_page, list); } while (!done); WARN_ON_ONCE(1); } #else static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer, void *event) { } #endif /* * The absolute time stamp drops the 5 MSBs and some clocks may * require them. The rb_fix_abs_ts() will take a previous full * time stamp, and add the 5 MSB of that time stamp on to the * saved absolute time stamp. Then they are compared in case of * the unlikely event that the latest time stamp incremented * the 5 MSB. */ static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts) { if (save_ts & TS_MSB) { abs |= save_ts & TS_MSB; /* Check for overflow */ if (unlikely(abs < save_ts)) abs += 1ULL << 59; } return abs; } static inline u64 rb_time_stamp(struct trace_buffer *buffer); /** * ring_buffer_event_time_stamp - return the event's current time stamp * @buffer: The buffer that the event is on * @event: the event to get the time stamp of * * Note, this must be called after @event is reserved, and before it is * committed to the ring buffer. And must be called from the same * context where the event was reserved (normal, softirq, irq, etc). * * Returns the time stamp associated with the current event. * If the event has an extended time stamp, then that is used as * the time stamp to return. * In the highly unlikely case that the event was nested more than * the max nesting, then the write_stamp of the buffer is returned, * otherwise current time is returned, but that really neither of * the last two cases should ever happen. */ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()]; unsigned int nest; u64 ts; /* If the event includes an absolute time, then just use that */ if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { ts = rb_event_time_stamp(event); return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp); } nest = local_read(&cpu_buffer->committing); verify_event(cpu_buffer, event); if (WARN_ON_ONCE(!nest)) goto fail; /* Read the current saved nesting level time stamp */ if (likely(--nest < MAX_NEST)) return cpu_buffer->event_stamp[nest]; /* Shouldn't happen, warn if it does */ WARN_ONCE(1, "nest (%d) greater than max", nest); fail: rb_time_read(&cpu_buffer->write_stamp, &ts); return ts; } /** * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from * @cpu: The cpu of the ring_buffer to get the number of pages from * * Returns the number of pages that have content in the ring buffer. */ size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu) { size_t read; size_t lost; size_t cnt; read = local_read(&buffer->buffers[cpu]->pages_read); lost = local_read(&buffer->buffers[cpu]->pages_lost); cnt = local_read(&buffer->buffers[cpu]->pages_touched); if (WARN_ON_ONCE(cnt < lost)) return 0; cnt -= lost; /* The reader can read an empty page, but not more than that */ if (cnt < read) { WARN_ON_ONCE(read > cnt + 1); return 0; } return cnt - read; } static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; size_t nr_pages; size_t dirty; nr_pages = cpu_buffer->nr_pages; if (!nr_pages || !full) return true; /* * Add one as dirty will never equal nr_pages, as the sub-buffer * that the writer is on is not counted as dirty. * This is needed if "buffer_percent" is set to 100. */ dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1; return (dirty * 100) >= (full * nr_pages); } /* * rb_wake_up_waiters - wake up tasks waiting for ring buffer input * * Schedules a delayed work to wake up any task that is blocked on the * ring buffer waiters queue. */ static void rb_wake_up_waiters(struct irq_work *work) { struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); /* For waiters waiting for the first wake up */ (void)atomic_fetch_inc_release(&rbwork->seq); wake_up_all(&rbwork->waiters); if (rbwork->full_waiters_pending || rbwork->wakeup_full) { /* Only cpu_buffer sets the above flags */ struct ring_buffer_per_cpu *cpu_buffer = container_of(rbwork, struct ring_buffer_per_cpu, irq_work); /* Called from interrupt context */ raw_spin_lock(&cpu_buffer->reader_lock); rbwork->wakeup_full = false; rbwork->full_waiters_pending = false; /* Waking up all waiters, they will reset the shortest full */ cpu_buffer->shortest_full = 0; raw_spin_unlock(&cpu_buffer->reader_lock); wake_up_all(&rbwork->full_waiters); } } /** * ring_buffer_wake_waiters - wake up any waiters on this ring buffer * @buffer: The ring buffer to wake waiters on * @cpu: The CPU buffer to wake waiters on * * In the case of a file that represents a ring buffer is closing, * it is prudent to wake up any waiters that are on this. */ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct rb_irq_work *rbwork; if (!buffer) return; if (cpu == RING_BUFFER_ALL_CPUS) { /* Wake up individual ones too. One level recursion */ for_each_buffer_cpu(buffer, cpu) ring_buffer_wake_waiters(buffer, cpu); rbwork = &buffer->irq_work; } else { if (WARN_ON_ONCE(!buffer->buffers)) return; if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) return; cpu_buffer = buffer->buffers[cpu]; /* The CPU buffer may not have been initialized yet */ if (!cpu_buffer) return; rbwork = &cpu_buffer->irq_work; } /* This can be called in any context */ irq_work_queue(&rbwork->work); } static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer; bool ret = false; /* Reads of all CPUs always waits for any data */ if (cpu == RING_BUFFER_ALL_CPUS) return !ring_buffer_empty(buffer); cpu_buffer = buffer->buffers[cpu]; if (!ring_buffer_empty_cpu(buffer, cpu)) { unsigned long flags; bool pagebusy; if (!full) return true; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; ret = !pagebusy && full_hit(buffer, cpu, full); if (!ret && (!cpu_buffer->shortest_full || cpu_buffer->shortest_full > full)) { cpu_buffer->shortest_full = full; } raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } return ret; } static inline bool rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer, int cpu, int full, ring_buffer_cond_fn cond, void *data) { if (rb_watermark_hit(buffer, cpu, full)) return true; if (cond(data)) return true; /* * The events can happen in critical sections where * checking a work queue can cause deadlocks. * After adding a task to the queue, this flag is set * only to notify events to try to wake up the queue * using irq_work. * * We don't clear it even if the buffer is no longer * empty. The flag only causes the next event to run * irq_work to do the work queue wake up. The worse * that can happen if we race with !trace_empty() is that * an event will cause an irq_work to try to wake up * an empty queue. * * There's no reason to protect this flag either, as * the work queue and irq_work logic will do the necessary * synchronization for the wake ups. The only thing * that is necessary is that the wake up happens after * a task has been queued. It's OK for spurious wake ups. */ if (full) rbwork->full_waiters_pending = true; else rbwork->waiters_pending = true; return false; } struct rb_wait_data { struct rb_irq_work *irq_work; int seq; }; /* * The default wait condition for ring_buffer_wait() is to just to exit the * wait loop the first time it is woken up. */ static bool rb_wait_once(void *data) { struct rb_wait_data *rdata = data; struct rb_irq_work *rbwork = rdata->irq_work; return atomic_read_acquire(&rbwork->seq) != rdata->seq; } /** * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * @cond: condition function to break out of wait (NULL to run once) * @data: the data to pass to @cond. * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full, ring_buffer_cond_fn cond, void *data) { struct ring_buffer_per_cpu *cpu_buffer; struct wait_queue_head *waitq; struct rb_irq_work *rbwork; struct rb_wait_data rdata; int ret = 0; /* * Depending on what the caller is waiting for, either any * data in any cpu buffer, or a specific buffer, put the * caller on the appropriate wait queue. */ if (cpu == RING_BUFFER_ALL_CPUS) { rbwork = &buffer->irq_work; /* Full only makes sense on per cpu reads */ full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -ENODEV; cpu_buffer = buffer->buffers[cpu]; rbwork = &cpu_buffer->irq_work; } if (full) waitq = &rbwork->full_waiters; else waitq = &rbwork->waiters; /* Set up to exit loop as soon as it is woken */ if (!cond) { cond = rb_wait_once; rdata.irq_work = rbwork; rdata.seq = atomic_read_acquire(&rbwork->seq); data = &rdata; } ret = wait_event_interruptible((*waitq), rb_wait_cond(rbwork, buffer, cpu, full, cond, data)); return ret; } /** * ring_buffer_poll_wait - poll on buffer input * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on * @filp: the file descriptor * @poll_table: The poll descriptor * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. * * Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers, * zero otherwise. */ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table, int full) { struct ring_buffer_per_cpu *cpu_buffer; struct rb_irq_work *rbwork; if (cpu == RING_BUFFER_ALL_CPUS) { rbwork = &buffer->irq_work; full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return EPOLLERR; cpu_buffer = buffer->buffers[cpu]; rbwork = &cpu_buffer->irq_work; } if (full) { poll_wait(filp, &rbwork->full_waiters, poll_table); if (rb_watermark_hit(buffer, cpu, full)) return EPOLLIN | EPOLLRDNORM; /* * Only allow full_waiters_pending update to be seen after * the shortest_full is set (in rb_watermark_hit). If the * writer sees the full_waiters_pending flag set, it will * compare the amount in the ring buffer to shortest_full. * If the amount in the ring buffer is greater than the * shortest_full percent, it will call the irq_work handler * to wake up this list. The irq_handler will reset shortest_full * back to zero. That's done under the reader_lock, but * the below smp_mb() makes sure that the update to * full_waiters_pending doesn't leak up into the above. */ smp_mb(); rbwork->full_waiters_pending = true; return 0; } poll_wait(filp, &rbwork->waiters, poll_table); rbwork->waiters_pending = true; /* * There's a tight race between setting the waiters_pending and * checking if the ring buffer is empty. Once the waiters_pending bit * is set, the next event will wake the task up, but we can get stuck * if there's only a single event in. * * FIXME: Ideally, we need a memory barrier on the writer side as well, * but adding a memory barrier to all events will cause too much of a * performance hit in the fast path. We only need a memory barrier when * the buffer goes from empty to having content. But as this race is * extremely small, and it's not a problem if another event comes in, we * will fix it later. */ smp_mb(); if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) return EPOLLIN | EPOLLRDNORM; return 0; } /* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(b, cond) \ ({ \ int _____ret = unlikely(cond); \ if (_____ret) { \ if (__same_type(*(b), struct ring_buffer_per_cpu)) { \ struct ring_buffer_per_cpu *__b = \ (void *)b; \ atomic_inc(&__b->buffer->record_disabled); \ } else \ atomic_inc(&b->record_disabled); \ WARN_ON(1); \ } \ _____ret; \ }) /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 static inline u64 rb_time_stamp(struct trace_buffer *buffer) { u64 ts; /* Skip retpolines :-( */ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && likely(buffer->clock == trace_clock_local)) ts = trace_clock_local(); else ts = buffer->clock(); /* shift to debug/test normalization and TIME_EXTENTS */ return ts << DEBUG_SHIFT; } u64 ring_buffer_time_stamp(struct trace_buffer *buffer) { u64 time; preempt_disable_notrace(); time = rb_time_stamp(buffer); preempt_enable_notrace(); return time; } EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer, int cpu, u64 *ts) { /* Just stupid testing the normalize function and deltas */ *ts >>= DEBUG_SHIFT; } EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); /* * Making the ring buffer lockless makes things tricky. * Although writes only happen on the CPU that they are on, * and they only need to worry about interrupts. Reads can * happen on any CPU. * * The reader page is always off the ring buffer, but when the * reader finishes with a page, it needs to swap its page with * a new one from the buffer. The reader needs to take from * the head (writes go to the tail). But if a writer is in overwrite * mode and wraps, it must push the head page forward. * * Here lies the problem. * * The reader must be careful to replace only the head page, and * not another one. As described at the top of the file in the * ASCII art, the reader sets its old page to point to the next * page after head. It then sets the page after head to point to * the old reader page. But if the writer moves the head page * during this operation, the reader could end up with the tail. * * We use cmpxchg to help prevent this race. We also do something * special with the page before head. We set the LSB to 1. * * When the writer must push the page forward, it will clear the * bit that points to the head page, move the head, and then set * the bit that points to the new head page. * * We also don't want an interrupt coming in and moving the head * page on another writer. Thus we use the second LSB to catch * that too. Thus: * * head->list->prev->next bit 1 bit 0 * ------- ------- * Normal page 0 0 * Points to head page 0 1 * New head page 1 0 * * Note we can not trust the prev pointer of the head page, because: * * +----+ +-----+ +-----+ * | |------>| T |---X--->| N | * | |<------| | | | * +----+ +-----+ +-----+ * ^ ^ | * | +-----+ | | * +----------| R |----------+ | * | |<-----------+ * +-----+ * * Key: ---X--> HEAD flag set in pointer * T Tail page * R Reader page * N Next page * * (see __rb_reserve_next() to see where this happens) * * What the above shows is that the reader just swapped out * the reader page with a page in the buffer, but before it * could make the new header point back to the new page added * it was preempted by a writer. The writer moved forward onto * the new page added by the reader and is about to move forward * again. * * You can see, it is legitimate for the previous pointer of * the head (or any page) not to point back to itself. But only * temporarily. */ #define RB_PAGE_NORMAL 0UL #define RB_PAGE_HEAD 1UL #define RB_PAGE_UPDATE 2UL #define RB_FLAG_MASK 3UL /* PAGE_MOVED is not part of the mask */ #define RB_PAGE_MOVED 4UL /* * rb_list_head - remove any bit */ static struct list_head *rb_list_head(struct list_head *list) { unsigned long val = (unsigned long)list; return (struct list_head *)(val & ~RB_FLAG_MASK); } /* * rb_is_head_page - test if the given page is the head page * * Because the reader may move the head_page pointer, we can * not trust what the head page is (it may be pointing to * the reader page). But if the next page is a header page, * its flags will be non zero. */ static inline int rb_is_head_page(struct buffer_page *page, struct list_head *list) { unsigned long val; val = (unsigned long)list->next; if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) return RB_PAGE_MOVED; return val & RB_FLAG_MASK; } /* * rb_is_reader_page * * The unique thing about the reader page, is that, if the * writer is ever on it, the previous pointer never points * back to the reader page. */ static bool rb_is_reader_page(struct buffer_page *page) { struct list_head *list = page->list.prev; return rb_list_head(list->next) != &page->list; } /* * rb_set_list_to_head - set a list_head to be pointing to head. */ static void rb_set_list_to_head(struct list_head *list) { unsigned long *ptr; ptr = (unsigned long *)&list->next; *ptr |= RB_PAGE_HEAD; *ptr &= ~RB_PAGE_UPDATE; } /* * rb_head_page_activate - sets up head page */ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *head; head = cpu_buffer->head_page; if (!head) return; /* * Set the previous list pointer to have the HEAD flag. */ rb_set_list_to_head(head->list.prev); if (cpu_buffer->ring_meta) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; meta->head_buffer = (unsigned long)head->page; } } static void rb_list_head_clear(struct list_head *list) { unsigned long *ptr = (unsigned long *)&list->next; *ptr &= ~RB_FLAG_MASK; } /* * rb_head_page_deactivate - clears head page ptr (for free list) */ static void rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *hd; /* Go through the whole list and clear any pointers found. */ rb_list_head_clear(cpu_buffer->pages); list_for_each(hd, cpu_buffer->pages) rb_list_head_clear(hd); } static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag, int new_flag) { struct list_head *list; unsigned long val = (unsigned long)&head->list; unsigned long ret; list = &prev->list; val &= ~RB_FLAG_MASK; ret = cmpxchg((unsigned long *)&list->next, val | old_flag, val | new_flag); /* check if the reader took the page */ if ((ret & ~RB_FLAG_MASK) != val) return RB_PAGE_MOVED; return ret & RB_FLAG_MASK; } static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag) { return rb_head_page_set(cpu_buffer, head, prev, old_flag, RB_PAGE_UPDATE); } static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag) { return rb_head_page_set(cpu_buffer, head, prev, old_flag, RB_PAGE_HEAD); } static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag) { return rb_head_page_set(cpu_buffer, head, prev, old_flag, RB_PAGE_NORMAL); } static inline void rb_inc_page(struct buffer_page **bpage) { struct list_head *p = rb_list_head((*bpage)->list.next); *bpage = list_entry(p, struct buffer_page, list); } static struct buffer_page * rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *head; struct buffer_page *page; struct list_head *list; int i; if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) return NULL; /* sanity check */ list = cpu_buffer->pages; if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) return NULL; page = head = cpu_buffer->head_page; /* * It is possible that the writer moves the header behind * where we started, and we miss in one loop. * A second loop should grab the header, but we'll do * three loops just because I'm paranoid. */ for (i = 0; i < 3; i++) { do { if (rb_is_head_page(page, page->list.prev)) { cpu_buffer->head_page = page; return page; } rb_inc_page(&page); } while (page != head); } RB_WARN_ON(cpu_buffer, 1); return NULL; } static bool rb_head_page_replace(struct buffer_page *old, struct buffer_page *new) { unsigned long *ptr = (unsigned long *)&old->list.prev->next; unsigned long val; val = *ptr & ~RB_FLAG_MASK; val |= RB_PAGE_HEAD; return try_cmpxchg(ptr, &val, (unsigned long)&new->list); } /* * rb_tail_page_update - move the tail page forward */ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page, struct buffer_page *next_page) { unsigned long old_entries; unsigned long old_write; /* * The tail page now needs to be moved forward. * * We need to reset the tail page, but without messing * with possible erasing of data brought in by interrupts * that have moved the tail page and are currently on it. * * We add a counter to the write field to denote this. */ old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); /* * Just make sure we have seen our old_write and synchronize * with any interrupts that come in. */ barrier(); /* * If the tail page is still the same as what we think * it is, then it is up to us to update the tail * pointer. */ if (tail_page == READ_ONCE(cpu_buffer->tail_page)) { /* Zero the write counter */ unsigned long val = old_write & ~RB_WRITE_MASK; unsigned long eval = old_entries & ~RB_WRITE_MASK; /* * This will only succeed if an interrupt did * not come in and change it. In which case, we * do not want to modify it. * * We add (void) to let the compiler know that we do not care * about the return value of these functions. We use the * cmpxchg to only update if an interrupt did not already * do it for us. If the cmpxchg fails, we don't care. */ (void)local_cmpxchg(&next_page->write, old_write, val); (void)local_cmpxchg(&next_page->entries, old_entries, eval); /* * No need to worry about races with clearing out the commit. * it only can increment when a commit takes place. But that * only happens in the outer most nested commit. */ local_set(&next_page->page->commit, 0); /* Either we update tail_page or an interrupt does */ if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page)) local_inc(&cpu_buffer->pages_touched); } } static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { unsigned long val = (unsigned long)bpage; RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK); } static bool rb_check_links(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *list) { if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(list->next)->prev) != list)) return false; if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(list->prev)->next) != list)) return false; return true; } /** * rb_check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * * As a safety measure we check to make sure the data pages have not * been corrupted. */ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head, *tmp; unsigned long buffer_cnt; unsigned long flags; int nr_loops = 0; /* * Walk the linked list underpinning the ring buffer and validate all * its next and prev links. * * The check acquires the reader_lock to avoid concurrent processing * with code that could be modifying the list. However, the lock cannot * be held for the entire duration of the walk, as this would make the * time when interrupts are disabled non-deterministic, dependent on the * ring buffer size. Therefore, the code releases and re-acquires the * lock after checking each page. The ring_buffer_per_cpu.cnt variable * is then used to detect if the list was modified while the lock was * not held, in which case the check needs to be restarted. * * The code attempts to perform the check at most three times before * giving up. This is acceptable because this is only a self-validation * to detect problems early on. In practice, the list modification * operations are fairly spaced, and so this check typically succeeds at * most on the second try. */ again: if (++nr_loops > 3) return; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); head = rb_list_head(cpu_buffer->pages); if (!rb_check_links(cpu_buffer, head)) goto out_locked; buffer_cnt = cpu_buffer->cnt; tmp = head; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); while (true) { raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (buffer_cnt != cpu_buffer->cnt) { /* The list was updated, try again. */ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); goto again; } tmp = rb_list_head(tmp->next); if (tmp == head) /* The iteration circled back, all is done. */ goto out_locked; if (!rb_check_links(cpu_buffer, tmp)) goto out_locked; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } out_locked: raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /* * Take an address, add the meta data size as well as the array of * array subbuffer indexes, then align it to a subbuffer size. * * This is used to help find the next per cpu subbuffer within a mapped range. */ static unsigned long rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) { addr += sizeof(struct ring_buffer_meta) + sizeof(int) * nr_subbufs; return ALIGN(addr, subbuf_size); } /* * Return the ring_buffer_meta for a given @cpu. */ static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) { int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE; unsigned long ptr = buffer->range_addr_start; struct ring_buffer_meta *meta; int nr_subbufs; if (!ptr) return NULL; /* When nr_pages passed in is zero, the first meta has already been initialized */ if (!nr_pages) { meta = (struct ring_buffer_meta *)ptr; nr_subbufs = meta->nr_subbufs; } else { meta = NULL; /* Include the reader page */ nr_subbufs = nr_pages + 1; } /* * The first chunk may not be subbuffer aligned, where as * the rest of the chunks are. */ if (cpu) { ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); ptr += subbuf_size * nr_subbufs; /* We can use multiplication to find chunks greater than 1 */ if (cpu > 1) { unsigned long size; unsigned long p; /* Save the beginning of this CPU chunk */ p = ptr; ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); ptr += subbuf_size * nr_subbufs; /* Now all chunks after this are the same size */ size = ptr - p; ptr += size * (cpu - 2); } } return (void *)ptr; } /* Return the start of subbufs given the meta pointer */ static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) { int subbuf_size = meta->subbuf_size; unsigned long ptr; ptr = (unsigned long)meta; ptr = rb_range_align_subbuf(ptr, subbuf_size, meta->nr_subbufs); return (void *)ptr; } /* * Return a specific sub-buffer for a given @cpu defined by @idx. */ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) { struct ring_buffer_meta *meta; unsigned long ptr; int subbuf_size; meta = rb_range_meta(cpu_buffer->buffer, 0, cpu_buffer->cpu); if (!meta) return NULL; if (WARN_ON_ONCE(idx >= meta->nr_subbufs)) return NULL; subbuf_size = meta->subbuf_size; /* Map this buffer to the order that's in meta->buffers[] */ idx = meta->buffers[idx]; ptr = (unsigned long)rb_subbufs_from_meta(meta); ptr += subbuf_size * idx; if (ptr + subbuf_size > cpu_buffer->buffer->range_addr_end) return NULL; return (void *)ptr; } /* * See if the existing memory contains valid ring buffer data. * As the previous kernel must be the same as this kernel, all * the calculations (size of buffers and number of buffers) * must be the same. */ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, struct trace_buffer *buffer, int nr_pages) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; unsigned long buffers_start; unsigned long buffers_end; int i; /* Check the meta magic and meta struct size */ if (meta->magic != RING_BUFFER_META_MAGIC || meta->struct_size != sizeof(*meta)) { pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu); return false; } /* The subbuffer's size and number of subbuffers must match */ if (meta->subbuf_size != subbuf_size || meta->nr_subbufs != nr_pages + 1) { pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu); return false; } buffers_start = meta->first_buffer; buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs); /* Is the head and commit buffers within the range of buffers? */ if (meta->head_buffer < buffers_start || meta->head_buffer >= buffers_end) { pr_info("Ring buffer boot meta [%d] head buffer out of range\n", cpu); return false; } if (meta->commit_buffer < buffers_start || meta->commit_buffer >= buffers_end) { pr_info("Ring buffer boot meta [%d] commit buffer out of range\n", cpu); return false; } subbuf = rb_subbufs_from_meta(meta); /* Is the meta buffers and the subbufs themselves have correct data? */ for (i = 0; i < meta->nr_subbufs; i++) { if (meta->buffers[i] < 0 || meta->buffers[i] >= meta->nr_subbufs) { pr_info("Ring buffer boot meta [%d] array out of range\n", cpu); return false; } if ((unsigned)local_read(&subbuf->commit) > subbuf_size) { pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu); return false; } subbuf = (void *)subbuf + subbuf_size; } return true; } static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf); static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu, unsigned long long *timestamp, u64 *delta_ptr) { struct ring_buffer_event *event; u64 ts, delta; int events = 0; int e; *delta_ptr = 0; *timestamp = 0; ts = dpage->time_stamp; for (e = 0; e < tail; e += rb_event_length(event)) { event = (struct ring_buffer_event *)(dpage->data + e); switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); ts += delta; break; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); delta = rb_fix_abs_ts(delta, ts); if (delta < ts) { *delta_ptr = delta; *timestamp = ts; return -1; } ts = delta; break; case RINGBUF_TYPE_PADDING: if (event->time_delta == 1) break; fallthrough; case RINGBUF_TYPE_DATA: events++; ts += event->time_delta; break; default: return -1; } } *timestamp = ts; return events; } static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) { unsigned long long ts; u64 delta; int tail; tail = local_read(&dpage->commit); return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta); } /* If the meta data has been validated, now validate the events */ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; struct buffer_page *head_page; unsigned long entry_bytes = 0; unsigned long entries = 0; int ret; int i; if (!meta || !meta->head_buffer) return; /* Do the reader page first */ ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu); if (ret < 0) { pr_info("Ring buffer reader page is invalid\n"); goto invalid; } entries += ret; entry_bytes += local_read(&cpu_buffer->reader_page->page->commit); local_set(&cpu_buffer->reader_page->entries, ret); head_page = cpu_buffer->head_page; /* If both the head and commit are on the reader_page then we are done. */ if (head_page == cpu_buffer->reader_page && head_page == cpu_buffer->commit_page) goto done; /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { /* Reader page has already been done */ if (head_page == cpu_buffer->reader_page) continue; ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); if (ret < 0) { pr_info("Ring buffer meta [%d] invalid buffer page\n", cpu_buffer->cpu); goto invalid; } entries += ret; entry_bytes += local_read(&head_page->page->commit); local_set(&cpu_buffer->head_page->entries, ret); if (head_page == cpu_buffer->commit_page) break; } if (head_page != cpu_buffer->commit_page) { pr_info("Ring buffer meta [%d] commit page not found\n", cpu_buffer->cpu); goto invalid; } done: local_set(&cpu_buffer->entries, entries); local_set(&cpu_buffer->entries_bytes, entry_bytes); pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu); return; invalid: /* The content of the buffers are invalid, reset the meta data */ meta->head_buffer = 0; meta->commit_buffer = 0; /* Reset the reader page */ local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); /* Reset all the subbuffers */ for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) { local_set(&head_page->entries, 0); local_set(&head_page->page->commit, 0); } } /* Used to calculate data delta */ static char rb_data_ptr[] = ""; #define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr) #define THIS_DATA_PTR ((unsigned long)rb_data_ptr) static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) { meta->text_addr = THIS_TEXT_PTR; meta->data_addr = THIS_DATA_PTR; } static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) { struct ring_buffer_meta *meta; unsigned long delta; void *subbuf; int cpu; int i; for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); if (rb_meta_valid(meta, cpu, buffer, nr_pages)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - meta->first_buffer; meta->first_buffer += delta; meta->head_buffer += delta; meta->commit_buffer += delta; buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr; buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr; continue; } if (cpu < nr_cpu_ids - 1) next_meta = rb_range_meta(buffer, nr_pages, cpu + 1); else next_meta = (void *)buffer->range_addr_end; memset(meta, 0, next_meta - (void *)meta); meta->magic = RING_BUFFER_META_MAGIC; meta->struct_size = sizeof(*meta); meta->nr_subbufs = nr_pages + 1; meta->subbuf_size = PAGE_SIZE; subbuf = rb_subbufs_from_meta(meta); meta->first_buffer = (unsigned long)subbuf; rb_meta_init_text_addr(meta); /* * The buffers[] array holds the order of the sub-buffers * that are after the meta data. The sub-buffers may * be swapped out when read and inserted into a different * location of the ring buffer. Although their addresses * remain the same, the buffers[] array contains the * index into the sub-buffers holding their actual order. */ for (i = 0; i < meta->nr_subbufs; i++) { meta->buffers[i] = i; rb_init_page(subbuf); subbuf += meta->subbuf_size; } } } static void *rbm_start(struct seq_file *m, loff_t *pos) { struct ring_buffer_per_cpu *cpu_buffer = m->private; struct ring_buffer_meta *meta = cpu_buffer->ring_meta; unsigned long val; if (!meta) return NULL; if (*pos > meta->nr_subbufs) return NULL; val = *pos; val++; return (void *)val; } static void *rbm_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return rbm_start(m, pos); } static int rbm_show(struct seq_file *m, void *v) { struct ring_buffer_per_cpu *cpu_buffer = m->private; struct ring_buffer_meta *meta = cpu_buffer->ring_meta; unsigned long val = (unsigned long)v; if (val == 1) { seq_printf(m, "head_buffer: %d\n", rb_meta_subbuf_idx(meta, (void *)meta->head_buffer)); seq_printf(m, "commit_buffer: %d\n", rb_meta_subbuf_idx(meta, (void *)meta->commit_buffer)); seq_printf(m, "subbuf_size: %d\n", meta->subbuf_size); seq_printf(m, "nr_subbufs: %d\n", meta->nr_subbufs); return 0; } val -= 2; seq_printf(m, "buffer[%ld]: %d\n", val, meta->buffers[val]); return 0; } static void rbm_stop(struct seq_file *m, void *p) { } static const struct seq_operations rb_meta_seq_ops = { .start = rbm_start, .next = rbm_next, .show = rbm_show, .stop = rbm_stop, }; int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu) { struct seq_file *m; int ret; ret = seq_open(file, &rb_meta_seq_ops); if (ret) return ret; m = file->private_data; m->private = buffer->buffers[cpu]; return 0; } /* Map the buffer_pages to the previous head and commit pages */ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; if (meta->head_buffer == (unsigned long)bpage->page) cpu_buffer->head_page = bpage; if (meta->commit_buffer == (unsigned long)bpage->page) { cpu_buffer->commit_page = bpage; cpu_buffer->tail_page = bpage; } } static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { struct trace_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; gfp_t mflags; long i; /* * Check if the available memory is there first. * Note, si_mem_available() only gives us a rough estimate of available * memory. It may not be accurate. But we don't care, we just want * to prevent doing any allocation when it is obvious that it is * not going to succeed. */ i = si_mem_available(); if (i < nr_pages) return -ENOMEM; /* * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails * gracefully without invoking oom-killer and the system is not * destabilized. */ mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL; /* * If a user thread allocates too much, and si_mem_available() * reports there's enough memory, even though there is not. * Make sure the OOM killer kills this thread. This can happen * even with RETRY_MAYFAIL because another task may be doing * an allocation after this task has taken all memory. * This is the task the OOM killer needs to take out during this * loop, even if it was triggered by an allocation somewhere else. */ if (user_thread) set_current_oom_origin(); if (buffer->range_addr_start) meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); for (i = 0; i < nr_pages; i++) { struct page *page; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), mflags, cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; rb_check_bpage(cpu_buffer, bpage); /* * Append the pages as for mapped buffers we want to keep * the order */ list_add_tail(&bpage->list, pages); if (meta) { /* A range was given. Use that for the buffer page */ bpage->page = rb_range_buffer(cpu_buffer, i + 1); if (!bpage->page) goto free_pages; /* If this is valid from a previous boot */ if (meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; bpage->id = i + 1; } else { page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto free_pages; bpage->page = page_address(page); rb_init_page(bpage->page); } bpage->order = cpu_buffer->buffer->subbuf_order; if (user_thread && fatal_signal_pending(current)) goto free_pages; } if (user_thread) clear_current_oom_origin(); return 0; free_pages: list_for_each_entry_safe(bpage, tmp, pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } if (user_thread) clear_current_oom_origin(); return -ENOMEM; } static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) { LIST_HEAD(pages); WARN_ON(!nr_pages); if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages)) return -ENOMEM; /* * The ring buffer page list is a circular list that does not * start and end with a list head. All page list items point to * other pages. */ cpu_buffer->pages = pages.next; list_del(&pages); cpu_buffer->nr_pages = nr_pages; rb_check_pages(cpu_buffer); return 0; } static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_meta *meta; struct buffer_page *bpage; struct page *page; int ret; cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!cpu_buffer) return NULL; cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; raw_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); init_completion(&cpu_buffer->update_done); init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); mutex_init(&cpu_buffer->mapping_lock); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!bpage) goto fail_free_buffer; rb_check_bpage(cpu_buffer, bpage); cpu_buffer->reader_page = bpage; if (buffer->range_addr_start) { /* * Range mapped buffers have the same restrictions as memory * mapped ones do. */ cpu_buffer->mapped = 1; cpu_buffer->ring_meta = rb_range_meta(buffer, nr_pages, cpu); bpage->page = rb_range_buffer(cpu_buffer, 0); if (!bpage->page) goto fail_free_reader; if (cpu_buffer->ring_meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; } else { page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto fail_free_reader; bpage->page = page_address(page); rb_init_page(bpage->page); } INIT_LIST_HEAD(&cpu_buffer->reader_page->list); INIT_LIST_HEAD(&cpu_buffer->new_pages); ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) goto fail_free_reader; rb_meta_validate_events(cpu_buffer); /* If the boot meta was valid then this has already been updated */ meta = cpu_buffer->ring_meta; if (!meta || !meta->head_buffer || !cpu_buffer->head_page || !cpu_buffer->commit_page || !cpu_buffer->tail_page) { if (meta && meta->head_buffer && (cpu_buffer->head_page || cpu_buffer->commit_page || cpu_buffer->tail_page)) { pr_warn("Ring buffer meta buffers not all mapped\n"); if (!cpu_buffer->head_page) pr_warn(" Missing head_page\n"); if (!cpu_buffer->commit_page) pr_warn(" Missing commit_page\n"); if (!cpu_buffer->tail_page) pr_warn(" Missing tail_page\n"); } cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; rb_head_page_activate(cpu_buffer); if (cpu_buffer->ring_meta) meta->commit_buffer = meta->head_buffer; } else { /* The valid meta buffer still needs to activate the head page */ rb_head_page_activate(cpu_buffer); } return cpu_buffer; fail_free_reader: free_buffer_page(cpu_buffer->reader_page); fail_free_buffer: kfree(cpu_buffer); return NULL; } static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; irq_work_sync(&cpu_buffer->irq_work.work); free_buffer_page(cpu_buffer->reader_page); if (head) { rb_head_page_deactivate(cpu_buffer); list_for_each_entry_safe(bpage, tmp, head, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } bpage = list_entry(head, struct buffer_page, list); free_buffer_page(bpage); } free_page((unsigned long)cpu_buffer->free_page); kfree(cpu_buffer); } static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, struct lock_class_key *key) { struct trace_buffer *buffer; long nr_pages; int subbuf_size; int bsize; int cpu; int ret; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), GFP_KERNEL); if (!buffer) return NULL; if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; buffer->subbuf_order = order; subbuf_size = (PAGE_SIZE << order); buffer->subbuf_size = subbuf_size - BUF_PAGE_HDR_SIZE; /* Max payload is buffer page size - header (8bytes) */ buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&buffer->irq_work.waiters); buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), GFP_KERNEL); if (!buffer->buffers) goto fail_free_cpumask; /* If start/end are specified, then that overrides size */ if (start && end) { unsigned long ptr; int n; size = end - start; size = size / nr_cpu_ids; /* * The number of sub-buffers (nr_pages) is determined by the * total size allocated minus the meta data size. * Then that is divided by the number of per CPU buffers * needed, plus account for the integer array index that * will be appended to the meta data. */ nr_pages = (size - sizeof(struct ring_buffer_meta)) / (subbuf_size + sizeof(int)); /* Need at least two pages plus the reader page */ if (nr_pages < 3) goto fail_free_buffers; again: /* Make sure that the size fits aligned */ for (n = 0, ptr = start; n < nr_cpu_ids; n++) { ptr += sizeof(struct ring_buffer_meta) + sizeof(int) * nr_pages; ptr = ALIGN(ptr, subbuf_size); ptr += subbuf_size * nr_pages; } if (ptr > end) { if (nr_pages <= 3) goto fail_free_buffers; nr_pages--; goto again; } /* nr_pages should not count the reader page */ nr_pages--; buffer->range_addr_start = start; buffer->range_addr_end = end; rb_range_meta_init(buffer, nr_pages); } else { /* need at least two pages */ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); if (nr_pages < 2) nr_pages = 2; } cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) goto fail_free_buffers; ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); if (ret < 0) goto fail_free_buffers; mutex_init(&buffer->mutex); return buffer; fail_free_buffers: for_each_buffer_cpu(buffer, cpu) { if (buffer->buffers[cpu]) rb_free_cpu_buffer(buffer->buffers[cpu]); } kfree(buffer->buffers); fail_free_cpumask: free_cpumask_var(buffer->cpumask); fail_free_buffer: kfree(buffer); return NULL; } /** * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer. * @key: ring buffer reader_lock_key. * * Currently the only flag that is available is the RB_FL_OVERWRITE * flag. This flag means that the buffer will overwrite old data * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ return alloc_buffer(size, flags, 0, 0, 0,key); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); /** * __ring_buffer_alloc_range - allocate a new ring_buffer from existing memory * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer. * @order: sub-buffer order * @start: start of allocated range * @range_size: size of allocated range * @key: ring buffer reader_lock_key. * * Currently the only flag that is available is the RB_FL_OVERWRITE * flag. This flag means that the buffer will overwrite old data * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long range_size, struct lock_class_key *key) { return alloc_buffer(size, flags, order, start, start + range_size, key); } /** * ring_buffer_last_boot_delta - return the delta offset from last boot * @buffer: The buffer to return the delta from * @text: Return text delta * @data: Return data delta * * Returns: The true if the delta is non zero */ bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, long *data) { if (!buffer) return false; if (!buffer->last_text_delta) return false; *text = buffer->last_text_delta; *data = buffer->last_data_delta; return true; } /** * ring_buffer_free - free a ring buffer. * @buffer: the buffer to free. */ void ring_buffer_free(struct trace_buffer *buffer) { int cpu; cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); irq_work_sync(&buffer->irq_work.work); for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); kfree(buffer); } EXPORT_SYMBOL_GPL(ring_buffer_free); void ring_buffer_set_clock(struct trace_buffer *buffer, u64 (*clock)(void)) { buffer->clock = clock; } void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs) { buffer->time_stamp_abs = abs; } bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) { return buffer->time_stamp_abs; } static inline unsigned long rb_page_entries(struct buffer_page *bpage) { return local_read(&bpage->entries) & RB_WRITE_MASK; } static inline unsigned long rb_page_write(struct buffer_page *bpage) { return local_read(&bpage->write) & RB_WRITE_MASK; } static bool rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) { struct list_head *tail_page, *to_remove, *next_page; struct buffer_page *to_remove_page, *tmp_iter_page; struct buffer_page *last_page, *first_page; unsigned long nr_removed; unsigned long head_bit; int page_entries; head_bit = 0; raw_spin_lock_irq(&cpu_buffer->reader_lock); atomic_inc(&cpu_buffer->record_disabled); /* * We don't race with the readers since we have acquired the reader * lock. We also don't race with writers after disabling recording. * This makes it easy to figure out the first and the last page to be * removed from the list. We unlink all the pages in between including * the first and last pages. This is done in a busy loop so that we * lose the least number of traces. * The pages are freed after we restart recording and unlock readers. */ tail_page = &cpu_buffer->tail_page->list; /* * tail page might be on reader page, we remove the next page * from the ring buffer */ if (cpu_buffer->tail_page == cpu_buffer->reader_page) tail_page = rb_list_head(tail_page->next); to_remove = tail_page; /* start of pages to remove */ first_page = list_entry(rb_list_head(to_remove->next), struct buffer_page, list); for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { to_remove = rb_list_head(to_remove)->next; head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; } /* Read iterators need to reset themselves when some pages removed */ cpu_buffer->pages_removed += nr_removed; next_page = rb_list_head(to_remove)->next; /* * Now we remove all pages between tail_page and next_page. * Make sure that we have head_bit value preserved for the * next page */ tail_page->next = (struct list_head *)((unsigned long)next_page | head_bit); next_page = rb_list_head(next_page); next_page->prev = tail_page; /* make sure pages points to a valid page in the ring buffer */ cpu_buffer->pages = next_page; cpu_buffer->cnt++; /* update head page */ if (head_bit) cpu_buffer->head_page = list_entry(next_page, struct buffer_page, list); /* pages are removed, resume tracing and then free the pages */ atomic_dec(&cpu_buffer->record_disabled); raw_spin_unlock_irq(&cpu_buffer->reader_lock); RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); /* last buffer page to remove */ last_page = list_entry(rb_list_head(to_remove), struct buffer_page, list); tmp_iter_page = first_page; do { cond_resched(); to_remove_page = tmp_iter_page; rb_inc_page(&tmp_iter_page); /* update the counters */ page_entries = rb_page_entries(to_remove_page); if (page_entries) { /* * If something was added to this page, it was full * since it is not the tail page. So we deduct the * bytes consumed in ring buffer from here. * Increment overrun to account for the lost events. */ local_add(page_entries, &cpu_buffer->overrun); local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); } /* * We have already removed references to this list item, just * free up the buffer_page and its page */ free_buffer_page(to_remove_page); nr_removed--; } while (to_remove_page != last_page); RB_WARN_ON(cpu_buffer, nr_removed); return nr_removed == 0; } static bool rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *pages = &cpu_buffer->new_pages; unsigned long flags; bool success; int retries; /* Can be called at early boot up, where interrupts must not been enabled */ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* * We are holding the reader lock, so the reader page won't be swapped * in the ring buffer. Now we are racing with the writer trying to * move head page and the tail page. * We are going to adapt the reader page update process where: * 1. We first splice the start and end of list of new pages between * the head page and its previous page. * 2. We cmpxchg the prev_page->next to point from head page to the * start of new pages list. * 3. Finally, we update the head->prev to the end of new list. * * We will try this process 10 times, to make sure that we don't keep * spinning. */ retries = 10; success = false; while (retries--) { struct list_head *head_page, *prev_page; struct list_head *last_page, *first_page; struct list_head *head_page_with_bit; struct buffer_page *hpage = rb_set_head_page(cpu_buffer); if (!hpage) break; head_page = &hpage->list; prev_page = head_page->prev; first_page = pages->next; last_page = pages->prev; head_page_with_bit = (struct list_head *) ((unsigned long)head_page | RB_PAGE_HEAD); last_page->next = head_page_with_bit; first_page->prev = prev_page; /* caution: head_page_with_bit gets updated on cmpxchg failure */ if (try_cmpxchg(&prev_page->next, &head_page_with_bit, first_page)) { /* * yay, we replaced the page pointer to our new list, * now, we just have to update to head page's prev * pointer to point to end of list */ head_page->prev = last_page; cpu_buffer->cnt++; success = true; break; } } if (success) INIT_LIST_HEAD(pages); /* * If we weren't successful in adding in new pages, warn and stop * tracing */ RB_WARN_ON(cpu_buffer, !success); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); /* free pages if they weren't inserted */ if (!success) { struct buffer_page *bpage, *tmp; list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } } return success; } static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) { bool success; if (cpu_buffer->nr_pages_to_update > 0) success = rb_insert_pages(cpu_buffer); else success = rb_remove_pages(cpu_buffer, -cpu_buffer->nr_pages_to_update); if (success) cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; } static void update_pages_handler(struct work_struct *work) { struct ring_buffer_per_cpu *cpu_buffer = container_of(work, struct ring_buffer_per_cpu, update_pages_work); rb_update_pages(cpu_buffer); complete(&cpu_buffer->update_done); } /** * ring_buffer_resize - resize the ring buffer * @buffer: the buffer to resize. * @size: the new size. * @cpu_id: the cpu buffer to resize * * Minimum size is 2 * buffer->subbuf_size. * * Returns 0 on success and < 0 on failure. */ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, int cpu_id) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long nr_pages; int cpu, err; /* * Always succeed at resizing a non-existent buffer: */ if (!buffer) return 0; /* Make sure the requested buffer exists */ if (cpu_id != RING_BUFFER_ALL_CPUS && !cpumask_test_cpu(cpu_id, buffer->cpumask)) return 0; nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); /* we need a minimum of two pages */ if (nr_pages < 2) nr_pages = 2; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->resizing); if (cpu_id == RING_BUFFER_ALL_CPUS) { /* * Don't succeed if resizing is disabled, as a reader might be * manipulating the ring buffer and is expecting a sane state while * this is true. */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->resize_disabled)) { err = -EBUSY; goto out_err_unlock; } } /* calculate the pages to update */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; cpu_buffer->nr_pages_to_update = nr_pages - cpu_buffer->nr_pages; /* * nothing more to do for removing pages or no update */ if (cpu_buffer->nr_pages_to_update <= 0) continue; /* * to add pages, make sure all new pages can be * allocated without receiving ENOMEM */ INIT_LIST_HEAD(&cpu_buffer->new_pages); if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, &cpu_buffer->new_pages)) { /* not enough memory for new pages */ err = -ENOMEM; goto out_err; } cond_resched(); } cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary * since we can change their buffer sizes without any race. */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!cpu_buffer->nr_pages_to_update) continue; /* Can't run something on an offline CPU. */ if (!cpu_online(cpu)) { rb_update_pages(cpu_buffer); cpu_buffer->nr_pages_to_update = 0; } else { /* Run directly if possible. */ migrate_disable(); if (cpu != smp_processor_id()) { migrate_enable(); schedule_work_on(cpu, &cpu_buffer->update_pages_work); } else { update_pages_handler(&cpu_buffer->update_pages_work); migrate_enable(); } } } /* wait for all the updates to complete */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!cpu_buffer->nr_pages_to_update) continue; if (cpu_online(cpu)) wait_for_completion(&cpu_buffer->update_done); cpu_buffer->nr_pages_to_update = 0; } cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; if (nr_pages == cpu_buffer->nr_pages) goto out; /* * Don't succeed if resizing is disabled, as a reader might be * manipulating the ring buffer and is expecting a sane state while * this is true. */ if (atomic_read(&cpu_buffer->resize_disabled)) { err = -EBUSY; goto out_err_unlock; } cpu_buffer->nr_pages_to_update = nr_pages - cpu_buffer->nr_pages; INIT_LIST_HEAD(&cpu_buffer->new_pages); if (cpu_buffer->nr_pages_to_update > 0 && __rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, &cpu_buffer->new_pages)) { err = -ENOMEM; goto out_err; } cpus_read_lock(); /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); else { /* Run directly if possible. */ migrate_disable(); if (cpu_id == smp_processor_id()) { rb_update_pages(cpu_buffer); migrate_enable(); } else { migrate_enable(); schedule_work_on(cpu_id, &cpu_buffer->update_pages_work); wait_for_completion(&cpu_buffer->update_done); } } cpu_buffer->nr_pages_to_update = 0; cpus_read_unlock(); } out: /* * The ring buffer resize can happen with the ring buffer * enabled, so that the update disturbs the tracing as little * as possible. But if the buffer is disabled, we do not need * to worry about that, and we can take the time to verify * that the buffer is not corrupt. */ if (atomic_read(&buffer->record_disabled)) { atomic_inc(&buffer->record_disabled); /* * Even though the buffer was disabled, we must make sure * that it is truly disabled before calling rb_check_pages. * There could have been a race between checking * record_disable and incrementing it. */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; rb_check_pages(cpu_buffer); } atomic_dec(&buffer->record_disabled); } atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return 0; out_err: for_each_buffer_cpu(buffer, cpu) { struct buffer_page *bpage, *tmp; cpu_buffer = buffer->buffers[cpu]; cpu_buffer->nr_pages_to_update = 0; if (list_empty(&cpu_buffer->new_pages)) continue; list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } } out_err_unlock: atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return err; } EXPORT_SYMBOL_GPL(ring_buffer_resize); void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val) { mutex_lock(&buffer->mutex); if (val) buffer->flags |= RB_FL_OVERWRITE; else buffer->flags &= ~RB_FL_OVERWRITE; mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { return bpage->page->data + index; } static __always_inline struct ring_buffer_event * rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) { return __rb_page_index(cpu_buffer->reader_page, cpu_buffer->reader_page->read); } static struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { struct ring_buffer_event *event; struct buffer_page *iter_head_page = iter->head_page; unsigned long commit; unsigned length; if (iter->head != iter->next_event) return iter->event; /* * When the writer goes across pages, it issues a cmpxchg which * is a mb(), which will synchronize with the rmb here. * (see rb_tail_page_update() and __rb_reserve_next()) */ commit = rb_page_commit(iter_head_page); smp_rmb(); /* An event needs to be at least 8 bytes in size */ if (iter->head > commit - 8) goto reset; event = __rb_page_index(iter_head_page, iter->head); length = rb_event_length(event); /* * READ_ONCE() doesn't work on functions and we don't want the * compiler doing any crazy optimizations with length. */ barrier(); if ((iter->head + length) > commit || length > iter->event_size) /* Writer corrupted the read? */ goto reset; memcpy(iter->event, event, length); /* * If the page stamp is still the same after this rmb() then the * event was safely copied without the writer entering the page. */ smp_rmb(); /* Make sure the page didn't change since we read this */ if (iter->page_stamp != iter_head_page->page->time_stamp || commit > rb_page_commit(iter_head_page)) goto reset; iter->next_event = iter->head + length; return iter->event; reset: /* Reset to the beginning */ iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; iter->next_event = 0; iter->missed_events = 1; return NULL; } /* Size is determined by what has been committed */ static __always_inline unsigned rb_page_size(struct buffer_page *bpage) { return rb_page_commit(bpage) & ~RB_MISSED_MASK; } static __always_inline unsigned rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) { return rb_page_commit(cpu_buffer->commit_page); } static __always_inline unsigned rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1; return addr - BUF_PAGE_HDR_SIZE; } static void rb_inc_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* * The iterator could be on the reader page (it starts there). * But the head could have moved, since the reader was * found. Check for this case and assign the iterator * to the head page instead of next. */ if (iter->head_page == cpu_buffer->reader_page) iter->head_page = rb_set_head_page(cpu_buffer); else rb_inc_page(&iter->head_page); iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; iter->next_event = 0; } /* Return the index into the sub-buffers for a given sub-buffer */ static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) { void *subbuf_array; subbuf_array = (void *)meta + sizeof(int) * meta->nr_subbufs; subbuf_array = (void *)ALIGN((unsigned long)subbuf_array, meta->subbuf_size); return (subbuf - subbuf_array) / meta->subbuf_size; } static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; unsigned long old_head = (unsigned long)next_page->page; unsigned long new_head; rb_inc_page(&next_page); new_head = (unsigned long)next_page->page; /* * Only move it forward once, if something else came in and * moved it forward, then we don't want to touch it. */ (void)cmpxchg(&meta->head_buffer, old_head, new_head); } static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *reader) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; void *old_reader = cpu_buffer->reader_page->page; void *new_reader = reader->page; int id; id = reader->id; cpu_buffer->reader_page->id = id; reader->id = 0; meta->buffers[0] = rb_meta_subbuf_idx(meta, new_reader); meta->buffers[id] = rb_meta_subbuf_idx(meta, old_reader); /* The head pointer is the one after the reader */ rb_update_meta_head(cpu_buffer, reader); } /* * rb_handle_head_page - writer hit the head page * * Returns: +1 to retry page * 0 to continue * -1 on error */ static int rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page, struct buffer_page *next_page) { struct buffer_page *new_head; int entries; int type; int ret; entries = rb_page_entries(next_page); /* * The hard part is here. We need to move the head * forward, and protect against both readers on * other CPUs and writers coming in via interrupts. */ type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, RB_PAGE_HEAD); /* * type can be one of four: * NORMAL - an interrupt already moved it for us * HEAD - we are the first to get here. * UPDATE - we are the interrupt interrupting * a current move. * MOVED - a reader on another CPU moved the next * pointer to its reader page. Give up * and try again. */ switch (type) { case RB_PAGE_HEAD: /* * We changed the head to UPDATE, thus * it is our responsibility to update * the counters. */ local_add(entries, &cpu_buffer->overrun); local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); if (cpu_buffer->ring_meta) rb_update_meta_head(cpu_buffer, next_page); /* * The entries will be zeroed out when we move the * tail page. */ /* still more to do */ break; case RB_PAGE_UPDATE: /* * This is an interrupt that interrupt the * previous update. Still more to do. */ break; case RB_PAGE_NORMAL: /* * An interrupt came in before the update * and processed this for us. * Nothing left to do. */ return 1; case RB_PAGE_MOVED: /* * The reader is on another CPU and just did * a swap with our next_page. * Try again. */ return 1; default: RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ return -1; } /* * Now that we are here, the old head pointer is * set to UPDATE. This will keep the reader from * swapping the head page with the reader page. * The reader (on another CPU) will spin till * we are finished. * * We just need to protect against interrupts * doing the job. We will set the next pointer * to HEAD. After that, we set the old pointer * to NORMAL, but only if it was HEAD before. * otherwise we are an interrupt, and only * want the outer most commit to reset it. */ new_head = next_page; rb_inc_page(&new_head); ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, RB_PAGE_NORMAL); /* * Valid returns are: * HEAD - an interrupt came in and already set it. * NORMAL - One of two things: * 1) We really set it. * 2) A bunch of interrupts came in and moved * the page forward again. */ switch (ret) { case RB_PAGE_HEAD: case RB_PAGE_NORMAL: /* OK */ break; default: RB_WARN_ON(cpu_buffer, 1); return -1; } /* * It is possible that an interrupt came in, * set the head up, then more interrupts came in * and moved it again. When we get back here, * the page would have been set to NORMAL but we * just set it back to HEAD. * * How do you detect this? Well, if that happened * the tail page would have moved. */ if (ret == RB_PAGE_NORMAL) { struct buffer_page *buffer_tail_page; buffer_tail_page = READ_ONCE(cpu_buffer->tail_page); /* * If the tail had moved passed next, then we need * to reset the pointer. */ if (buffer_tail_page != tail_page && buffer_tail_page != next_page) rb_head_page_set_normal(cpu_buffer, new_head, next_page, RB_PAGE_HEAD); } /* * If this was the outer most commit (the one that * changed the original pointer from HEAD to UPDATE), * then it is up to us to reset it to NORMAL. */ if (type == RB_PAGE_HEAD) { ret = rb_head_page_set_normal(cpu_buffer, next_page, tail_page, RB_PAGE_UPDATE); if (RB_WARN_ON(cpu_buffer, ret != RB_PAGE_UPDATE)) return -1; } return 0; } static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) { unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; unsigned long length = info->length; /* * Only the event that crossed the page boundary * must fill the old tail_page with padding. */ if (tail >= bsize) { /* * If the page was filled, then we still need * to update the real_end. Reset it to zero * and the reader will ignore it. */ if (tail == bsize) tail_page->real_end = 0; local_sub(length, &tail_page->write); return; } event = __rb_page_index(tail_page, tail); /* * Save the original length to the meta data. * This will be used by the reader to add lost event * counter. */ tail_page->real_end = tail; /* * If this event is bigger than the minimum size, then * we need to be careful that we don't subtract the * write counter enough to allow another writer to slip * in on this page. * We put in a discarded commit instead, to make sure * that this space is not used again, and this space will * not be accounted into 'entries_bytes'. * * If we are less than the minimum size, we don't need to * worry about it. */ if (tail > (bsize - RB_EVNT_MIN_SIZE)) { /* No room for any events */ /* Mark the rest of the page with padding */ rb_event_set_padding(event); /* Make sure the padding is visible before the write update */ smp_wmb(); /* Set the write back to the previous setting */ local_sub(length, &tail_page->write); return; } /* Put in a discarded event */ event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ event->time_delta = 1; /* account for padding bytes */ local_add(bsize - tail, &cpu_buffer->entries_bytes); /* Make sure the padding is visible before the tail_page->write update */ smp_wmb(); /* Set write to end of buffer */ length = (tail + length) - bsize; local_sub(length, &tail_page->write); } static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); /* * This is the slow path, force gcc not to inline it. */ static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) { struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; struct trace_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; next_page = tail_page; rb_inc_page(&next_page); /* * If for some reason, we had an interrupt storm that made * it all the way around the buffer, bail, and warn * about it. */ if (unlikely(next_page == commit_page)) { local_inc(&cpu_buffer->commit_overrun); goto out_reset; } /* * This is where the fun begins! * * We are fighting against races between a reader that * could be on another CPU trying to swap its reader * page with the buffer head. * * We are also fighting against interrupts coming in and * moving the head or tail on us as well. * * If the next page is the head page then we have filled * the buffer, unless the commit page is still on the * reader page. */ if (rb_is_head_page(next_page, &tail_page->list)) { /* * If the commit is not on the reader page, then * move the header page. */ if (!rb_is_reader_page(cpu_buffer->commit_page)) { /* * If we are not in overwrite mode, * this is easy, just stop here. */ if (!(buffer->flags & RB_FL_OVERWRITE)) { local_inc(&cpu_buffer->dropped_events); goto out_reset; } ret = rb_handle_head_page(cpu_buffer, tail_page, next_page); if (ret < 0) goto out_reset; if (ret) goto out_again; } else { /* * We need to be careful here too. The * commit page could still be on the reader * page. We could have a small buffer, and * have filled up the buffer with events * from interrupts and such, and wrapped. * * Note, if the tail page is also on the * reader_page, we let it move out. */ if (unlikely((cpu_buffer->commit_page != cpu_buffer->tail_page) && (cpu_buffer->commit_page == cpu_buffer->reader_page))) { local_inc(&cpu_buffer->commit_overrun); goto out_reset; } } } rb_tail_page_update(cpu_buffer, tail_page, next_page); out_again: rb_reset_tail(cpu_buffer, tail, info); /* Commit what we have for now. */ rb_end_commit(cpu_buffer); /* rb_end_commit() decs committing */ local_inc(&cpu_buffer->committing); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); out_reset: /* reset write */ rb_reset_tail(cpu_buffer, tail, info); return NULL; } /* Slow path */ static struct ring_buffer_event * rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event, u64 delta, bool abs) { if (abs) event->type_len = RINGBUF_TYPE_TIME_STAMP; else event->type_len = RINGBUF_TYPE_TIME_EXTEND; /* Not the first event on the page, or not delta? */ if (abs || rb_event_index(cpu_buffer, event)) { event->time_delta = delta & TS_MASK; event->array[0] = delta >> TS_SHIFT; } else { /* nope, just zero it */ event->time_delta = 0; event->array[0] = 0; } return skip_time_extend(event); } #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static inline bool sched_clock_stable(void) { return true; } #endif static void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { u64 write_stamp; WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s", (unsigned long long)info->delta, (unsigned long long)info->ts, (unsigned long long)info->before, (unsigned long long)info->after, (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/tracing/trace_clock\n" "or add trace_clock=global to the kernel command line\n"); } static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event **event, struct rb_event_info *info, u64 *delta, unsigned int *length) { bool abs = info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); if (unlikely(info->delta > (1ULL << 59))) { /* * Some timers can use more than 59 bits, and when a timestamp * is added to the buffer, it will lose those bits. */ if (abs && (info->ts & TS_MSB)) { info->delta &= ABS_TS_MASK; /* did the clock go backwards */ } else if (info->before == info->after && info->before > info->ts) { /* not interrupted */ static int once; /* * This is possible with a recalibrating of the TSC. * Do not produce a call stack, but just report it. */ if (!once) { once++; pr_warn("Ring buffer clock went backwards: %llu -> %llu\n", info->before, info->ts); } } else rb_check_timestamp(cpu_buffer, info); if (!abs) info->delta = 0; } *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs); *length -= RB_LEN_TIME_EXTEND; *delta = 0; } /** * rb_update_event - update event type and data * @cpu_buffer: The per cpu buffer of the @event * @event: the event to update * @info: The info to update the @event with (contains length and delta) * * Update the type and data fields of the @event. The length * is the actual size that is written to the ring buffer, * and with this, we can determine what to place into the * data field. */ static void rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event, struct rb_event_info *info) { unsigned length = info->length; u64 delta = info->delta; unsigned int nest = local_read(&cpu_buffer->committing) - 1; if (!WARN_ON_ONCE(nest >= MAX_NEST)) cpu_buffer->event_stamp[nest] = info->ts; /* * If we need to add a timestamp, then we * add it to the start of the reserved space. */ if (unlikely(info->add_timestamp)) rb_add_timestamp(cpu_buffer, &event, info, &delta, &length); event->time_delta = delta; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { event->type_len = 0; event->array[0] = length; } else event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); } static unsigned rb_calculate_event_length(unsigned length) { struct ring_buffer_event event; /* Used only for sizeof array */ /* zero length can cause confusions */ if (!length) length++; if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; length = ALIGN(length, RB_ARCH_ALIGNMENT); /* * In case the time delta is larger than the 27 bits for it * in the header, we need to add a timestamp. If another * event comes in when trying to discard this one to increase * the length, then the timestamp will be added in the allocated * space of this event. If length is bigger than the size needed * for the TIME_EXTEND, then padding has to be used. The events * length must be either RB_LEN_TIME_EXTEND, or greater than or equal * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. * As length is a multiple of 4, we only need to worry if it * is 12 (RB_LEN_TIME_EXTEND + 4). */ if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) length += RB_ALIGNMENT; return length; } static inline bool rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long new_index, old_index; struct buffer_page *bpage; unsigned long addr; new_index = rb_event_index(cpu_buffer, event); old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); bpage = READ_ONCE(cpu_buffer->tail_page); /* * Make sure the tail_page is still the same and * the next write location is the end of this event */ if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; unsigned long event_length = rb_event_length(event); /* * For the before_stamp to be different than the write_stamp * to make sure that the next event adds an absolute * value and does not rely on the saved write stamp, which * is now going to be bogus. * * By setting the before_stamp to zero, the next event * is not going to use the write_stamp and will instead * create an absolute timestamp. This means there's no * reason to update the wirte_stamp! */ rb_time_set(&cpu_buffer->before_stamp, 0); /* * If an event were to come in now, it would see that the * write_stamp and the before_stamp are different, and assume * that this event just added itself before updating * the write stamp. The interrupting event will fix the * write stamp for us, and use an absolute timestamp. */ /* * This is on the tail page. It is possible that * a write could come in and move the tail page * and write to the next page. That is fine * because we just shorten what is on this page. */ old_index += write_mask; new_index += write_mask; /* caution: old_index gets updated on cmpxchg failure */ if (local_try_cmpxchg(&bpage->write, &old_index, new_index)) { /* update counters */ local_sub(event_length, &cpu_buffer->entries_bytes); return true; } } /* could not discard */ return false; } static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->committing); local_inc(&cpu_buffer->commits); } static __always_inline void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long max_count; /* * We only race with interrupts and NMIs on this CPU. * If we own the commit event, then we can commit * all others that interrupted us, since the interruptions * are in stack format (they finish before they come * back to us). This allows us to do a simple loop to * assign the commit to the tail. */ again: max_count = cpu_buffer->nr_pages * 100; while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) { if (RB_WARN_ON(cpu_buffer, !(--max_count))) return; if (RB_WARN_ON(cpu_buffer, rb_is_reader_page(cpu_buffer->tail_page))) return; /* * No need for a memory barrier here, as the update * of the tail_page did it for this page. */ local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(&cpu_buffer->commit_page); if (cpu_buffer->ring_meta) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page; } /* add barrier to keep gcc from optimizing too much */ barrier(); } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { /* Make sure the readers see the content of what is committed. */ smp_wmb(); local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->commit_page->page->commit) & ~RB_WRITE_MASK); barrier(); } /* again, keep gcc from optimizing */ barrier(); /* * If an interrupt came in just after the first while loop * and pushed the tail page forward, we will be left with * a dangling commit that will never go forward. */ if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page))) goto again; } static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; if (RB_WARN_ON(cpu_buffer, !local_read(&cpu_buffer->committing))) return; again: commits = local_read(&cpu_buffer->commits); /* synchronize with interrupts */ barrier(); if (local_read(&cpu_buffer->committing) == 1) rb_set_commit_to_write(cpu_buffer); local_dec(&cpu_buffer->committing); /* synchronize with interrupts */ barrier(); /* * Need to account for interrupts coming in between the * updating of the commit page and the clearing of the * committing counter. */ if (unlikely(local_read(&cpu_buffer->commits) != commits) && !local_read(&cpu_buffer->committing)) { local_inc(&cpu_buffer->committing); goto again; } } static inline void rb_event_discard(struct ring_buffer_event *event) { if (extended_time(event)) event = skip_time_extend(event); /* array[0] holds the actual length for the discarded event */ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ if (!event->time_delta) event->time_delta = 1; } static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->entries); rb_end_commit(cpu_buffer); } static __always_inline void rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ irq_work_queue(&buffer->irq_work.work); } if (cpu_buffer->irq_work.waiters_pending) { cpu_buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ irq_work_queue(&cpu_buffer->irq_work.work); } if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched)) return; if (cpu_buffer->reader_page == cpu_buffer->commit_page) return; if (!cpu_buffer->irq_work.full_waiters_pending) return; cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched); if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full)) return; cpu_buffer->irq_work.wakeup_full = true; cpu_buffer->irq_work.full_waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ irq_work_queue(&cpu_buffer->irq_work.work); } #ifdef CONFIG_RING_BUFFER_RECORD_RECURSION # define do_ring_buffer_record_recursion() \ do_ftrace_record_recursion(_THIS_IP_, _RET_IP_) #else # define do_ring_buffer_record_recursion() do { } while (0) #endif /* * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified * by the current task between lock and unlock. But it can * be modified more than once via an interrupt. To pass this * information from the lock to the unlock without having to * access the 'in_interrupt()' functions again (which do show * a bit of overhead in something as critical as function tracing, * we use a bitmask trick. * * bit 1 = NMI context * bit 2 = IRQ context * bit 3 = SoftIRQ context * bit 4 = normal context. * * This works because this is the order of contexts that can * preempt other contexts. A SoftIRQ never preempts an IRQ * context. * * When the context is determined, the corresponding bit is * checked and set (if it was set, then a recursion of that context * happened). * * On unlock, we need to clear this bit. To do so, just subtract * 1 from the current_context and AND it to itself. * * (binary) * 101 - 1 = 100 * 101 & 100 = 100 (clearing bit zero) * * 1010 - 1 = 1001 * 1010 & 1001 = 1000 (clearing bit 1) * * The least significant bit can be cleared this way, and it * just so happens that it is the same bit corresponding to * the current context. * * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit * is set when a recursion is detected at the current context, and if * the TRANSITION bit is already set, it will fail the recursion. * This is needed because there's a lag between the changing of * interrupt context and updating the preempt count. In this case, * a false positive will be found. To handle this, one extra recursion * is allowed, and this is done by the TRANSITION bit. If the TRANSITION * bit is already set, then it is considered a recursion and the function * ends. Otherwise, the TRANSITION bit is set, and that bit is returned. * * On the trace_recursive_unlock(), the TRANSITION bit will be the first * to be cleared. Even if it wasn't the context that set it. That is, * if an interrupt comes in while NORMAL bit is set and the ring buffer * is called before preempt_count() is updated, since the check will * be on the NORMAL bit, the TRANSITION bit will then be set. If an * NMI then comes in, it will set the NMI bit, but when the NMI code * does the trace_recursive_unlock() it will clear the TRANSITION bit * and leave the NMI bit set. But this is fine, because the interrupt * code that set the TRANSITION bit will then clear the NMI bit when it * calls trace_recursive_unlock(). If another NMI comes in, it will * set the TRANSITION bit and continue. * * Note: The TRANSITION bit only handles a single transition between context. */ static __always_inline bool trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { unsigned int val = cpu_buffer->current_context; int bit = interrupt_context_level(); bit = RB_CTX_NORMAL - bit; if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { /* * It is possible that this was called by transitioning * between interrupt context, and preempt_count() has not * been updated yet. In this case, use the TRANSITION bit. */ bit = RB_CTX_TRANSITION; if (val & (1 << (bit + cpu_buffer->nest))) { do_ring_buffer_record_recursion(); return true; } } val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; return false; } static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->current_context &= cpu_buffer->current_context - (1 << cpu_buffer->nest); } /* The recursive locking above uses 5 bits */ #define NESTED_BITS 5 /** * ring_buffer_nest_start - Allow to trace while nested * @buffer: The ring buffer to modify * * The ring buffer has a safety mechanism to prevent recursion. * But there may be a case where a trace needs to be done while * tracing something else. In this case, calling this function * will allow this function to nest within a currently active * ring_buffer_lock_reserve(). * * Call this function before calling another ring_buffer_lock_reserve() and * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit(). */ void ring_buffer_nest_start(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* Enabled by ring_buffer_nest_end() */ preempt_disable_notrace(); cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; /* This is the shift value for the above recursive locking */ cpu_buffer->nest += NESTED_BITS; } /** * ring_buffer_nest_end - Allow to trace while nested * @buffer: The ring buffer to modify * * Must be called after ring_buffer_nest_start() and after the * ring_buffer_unlock_commit(). */ void ring_buffer_nest_end(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* disabled by ring_buffer_nest_start() */ cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; /* This is the shift value for the above recursive locking */ cpu_buffer->nest -= NESTED_BITS; preempt_enable_notrace(); } /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to * * This commits the data to the ring buffer, and releases any locks held. * * Must be paired with ring_buffer_lock_reserve. */ int ring_buffer_unlock_commit(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); trace_recursive_unlock(cpu_buffer); preempt_enable_notrace(); return 0; } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); /* Special value to validate all deltas on a page. */ #define CHECK_FULL_PAGE 1L #ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS static const char *show_irq_str(int bits) { const char *type[] = { ".", // 0 "s", // 1 "h", // 2 "Hs", // 3 "n", // 4 "Ns", // 5 "Nh", // 6 "NHs", // 7 }; return type[bits]; } /* Assume this is a trace event */ static const char *show_flags(struct ring_buffer_event *event) { struct trace_entry *entry; int bits = 0; if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) return "X"; entry = ring_buffer_event_data(event); if (entry->flags & TRACE_FLAG_SOFTIRQ) bits |= 1; if (entry->flags & TRACE_FLAG_HARDIRQ) bits |= 2; if (entry->flags & TRACE_FLAG_NMI) bits |= 4; return show_irq_str(bits); } static const char *show_irq(struct ring_buffer_event *event) { struct trace_entry *entry; if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) return ""; entry = ring_buffer_event_data(event); if (entry->flags & TRACE_FLAG_IRQS_OFF) return "d"; return ""; } static const char *show_interrupt_level(void) { unsigned long pc = preempt_count(); unsigned char level = 0; if (pc & SOFTIRQ_OFFSET) level |= 1; if (pc & HARDIRQ_MASK) level |= 2; if (pc & NMI_MASK) level |= 4; return show_irq_str(level); } static void dump_buffer_page(struct buffer_data_page *bpage, struct rb_event_info *info, unsigned long tail) { struct ring_buffer_event *event; u64 ts, delta; int e; ts = bpage->time_stamp; pr_warn(" [%lld] PAGE TIME STAMP\n", ts); for (e = 0; e < tail; e += rb_event_length(event)) { event = (struct ring_buffer_event *)(bpage->data + e); switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); ts += delta; pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n", e, ts, delta); break; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); ts = rb_fix_abs_ts(delta, ts); pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n", e, ts, delta); break; case RINGBUF_TYPE_PADDING: ts += event->time_delta; pr_warn(" 0x%x: [%lld] delta:%d PADDING\n", e, ts, event->time_delta); break; case RINGBUF_TYPE_DATA: ts += event->time_delta; pr_warn(" 0x%x: [%lld] delta:%d %s%s\n", e, ts, event->time_delta, show_flags(event), show_irq(event)); break; default: break; } } pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e); } static DEFINE_PER_CPU(atomic_t, checking); static atomic_t ts_dump; #define buffer_warn_return(fmt, ...) \ do { \ /* If another report is happening, ignore this one */ \ if (atomic_inc_return(&ts_dump) != 1) { \ atomic_dec(&ts_dump); \ goto out; \ } \ atomic_inc(&cpu_buffer->record_disabled); \ pr_warn(fmt, ##__VA_ARGS__); \ dump_buffer_page(bpage, info, tail); \ atomic_dec(&ts_dump); \ /* There's some cases in boot up that this can happen */ \ if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \ /* Do not re-enable checking */ \ return; \ } while (0) /* * Check if the current event time stamp matches the deltas on * the buffer page. */ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info, unsigned long tail) { struct buffer_data_page *bpage; u64 ts, delta; bool full = false; int ret; bpage = info->tail_page->page; if (tail == CHECK_FULL_PAGE) { full = true; tail = local_read(&bpage->commit); } else if (info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) { /* Ignore events with absolute time stamps */ return; } /* * Do not check the first event (skip possible extends too). * Also do not check if previous events have not been committed. */ if (tail <= 8 || tail > local_read(&bpage->commit)) return; /* * If this interrupted another event, */ if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) goto out; ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta); if (ret < 0) { if (delta < ts) { buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", cpu_buffer->cpu, ts, delta); goto out; } } if ((full && ts > info->ts) || (!full && ts + info->delta != info->ts)) { buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", cpu_buffer->cpu, ts + info->delta, info->ts, info->delta, info->before, info->after, full ? " (full)" : "", show_interrupt_level()); } out: atomic_dec(this_cpu_ptr(&checking)); } #else static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info, unsigned long tail) { } #endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */ static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write, w; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); rb_time_read(&cpu_buffer->before_stamp, &info->before); rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) { info->delta = info->ts; } else { /* * If interrupting an event time update, we may need an * absolute timestamp. * Don't bother if this is the start of a new page (w == 0). */ if (!w) { /* Use the sub-buffer timestamp */ info->delta = 0; } else if (unlikely(info->before != info->after)) { info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } else { info->delta = info->ts - info->after; if (unlikely(test_time_stamp(info->delta))) { info->add_timestamp |= RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } } } /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts); /*C*/ write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ write &= RB_WRITE_MASK; tail = write - info->length; /* See if we shot pass the end of this buffer page */ if (unlikely(write > cpu_buffer->buffer->subbuf_size)) { check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } if (likely(tail == w)) { /* Nothing interrupted us between A and C */ /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts); /* * If something came in between C and D, the write stamp * may now not be in sync. But that's fine as the before_stamp * will be different and then next event will just be forced * to use an absolute timestamp. */ if (likely(!(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ info->delta = info->ts - info->after; else /* Just use full timestamp for interrupting event */ info->delta = info->ts; check_buffer(cpu_buffer, info, tail); } else { u64 ts; /* SLOW PATH - Interrupted between A and C */ /* Save the old before_stamp */ rb_time_read(&cpu_buffer->before_stamp, &info->before); /* * Read a new timestamp and update the before_stamp to make * the next event after this one force using an absolute * timestamp. This is in case an interrupt were to come in * between E and F. */ ts = rb_time_stamp(cpu_buffer->buffer); rb_time_set(&cpu_buffer->before_stamp, ts); barrier(); /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && info->after == info->before && info->after < ts) { /* * Nothing came after this event between C and F, it is * safe to use info->after for the delta as it * matched info->before and is still valid. */ info->delta = ts - info->after; } else { /* * Interrupted between C and F: * Lost the previous events time stamp. Just set the * delta to zero, and this will be the same time as * the event this event interrupted. And the events that * came after this will still be correct (as they would * have built their delta on the previous event. */ info->delta = 0; } info->ts = ts; info->add_timestamp &= ~RB_ADD_STAMP_FORCE; } /* * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ if (unlikely(!tail && !(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) info->delta = 0; /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); /* * If this is the first commit on the page, then update * its timestamp. */ if (unlikely(!tail)) tail_page->page->time_stamp = info->ts; /* account for these added bytes */ local_add(info->length, &cpu_buffer->entries_bytes); return event; } static __always_inline struct ring_buffer_event * rb_reserve_next_event(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) { struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; int add_ts_default; /* * ring buffer does cmpxchg as well as atomic64 operations * (which some archs use locking for atomic64), make sure this * is safe in NMI context */ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || IS_ENABLED(CONFIG_GENERIC_ATOMIC64)) && (unlikely(in_nmi()))) { return NULL; } rb_start_commit(cpu_buffer); /* The commit page can not change after this */ #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /* * Due to the ability to swap a cpu buffer from a buffer * it is possible it was swapped before we committed. * (committing stops a swap). We check for it here and * if it happened, we have to fail the write. */ barrier(); if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) { local_dec(&cpu_buffer->committing); local_dec(&cpu_buffer->commits); return NULL; } #endif info.length = rb_calculate_event_length(length); if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { add_ts_default = RB_ADD_STAMP_ABSOLUTE; info.length += RB_LEN_TIME_EXTEND; if (info.length > cpu_buffer->buffer->max_data_size) goto out_fail; } else { add_ts_default = RB_ADD_STAMP_NONE; } again: info.add_timestamp = add_ts_default; info.delta = 0; /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop * back here. Even with heavy interrupts happening, this * should only happen a few times in a row. If this happens * 1000 times in a row, there must be either an interrupt * storm or we have something buggy. * Bail! */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) { if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND)) info.length -= RB_LEN_TIME_EXTEND; goto again; } if (likely(event)) return event; out_fail: rb_end_commit(cpu_buffer); return NULL; } /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) * * Returns a reserved event on the ring buffer to copy directly to. * The user of this interface will need to get the body to write into * and can use the ring_buffer_event_data() interface. * * The length is the length of the data needed, not the event length * which also includes the event header. * * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. * If NULL is returned, then nothing has been allocated or locked. */ struct ring_buffer_event * ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; int cpu; /* If we are tracing schedule, we don't want to recurse */ preempt_disable_notrace(); if (unlikely(atomic_read(&buffer->record_disabled))) goto out; cpu = raw_smp_processor_id(); if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask))) goto out; cpu_buffer = buffer->buffers[cpu]; if (unlikely(atomic_read(&cpu_buffer->record_disabled))) goto out; if (unlikely(length > buffer->max_data_size)) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) goto out; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out_unlock; return event; out_unlock: trace_recursive_unlock(cpu_buffer); out: preempt_enable_notrace(); return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); /* * Decrement the entries to the page that an event is on. * The event does not even need to exist, only the pointer * to the page it is on. This may only be called before the commit * takes place. */ static inline void rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; struct buffer_page *bpage = cpu_buffer->commit_page; struct buffer_page *start; addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); /* Do the likely case first */ if (likely(bpage->page == (void *)addr)) { local_dec(&bpage->entries); return; } /* * Because the commit page may be on the reader page we * start with the next page and check the end loop there. */ rb_inc_page(&bpage); start = bpage; do { if (bpage->page == (void *)addr) { local_dec(&bpage->entries); return; } rb_inc_page(&bpage); } while (bpage != start); /* commit not part of this buffer?? */ RB_WARN_ON(cpu_buffer, 1); } /** * ring_buffer_discard_commit - discard an event that has not been committed * @buffer: the ring buffer * @event: non committed event to discard * * Sometimes an event that is in the ring buffer needs to be ignored. * This function lets the user discard an event in the ring buffer * and then that event will not be read later. * * This function only works if it is called before the item has been * committed. It will try to free the event from the ring buffer * if another event has not been added behind it. * * If another event has been added behind it, it will set the event * up as discarded, and perform the commit. * * If this function is called, do not call ring_buffer_unlock_commit on * the event. */ void ring_buffer_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* The event is discarded regardless */ rb_event_discard(event); cpu = smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; /* * This must only be called if the event has not been * committed yet. Thus we can assume that preemption * is still disabled. */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); rb_decrement_entry(cpu_buffer, event); if (rb_try_to_discard(cpu_buffer, event)) goto out; out: rb_end_commit(cpu_buffer); trace_recursive_unlock(cpu_buffer); preempt_enable_notrace(); } EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); /** * ring_buffer_write - write data to the buffer without reserving * @buffer: The ring buffer to write to. * @length: The length of the data being written (excluding the event header) * @data: The data to write to the buffer. * * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as * one function. If you already have the data to write to the buffer, it * may be easier to simply call this function. * * Note, like ring_buffer_lock_reserve, the length is the length of the data * and not the length of the event which would hold the header. */ int ring_buffer_write(struct trace_buffer *buffer, unsigned long length, void *data) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; void *body; int ret = -EBUSY; int cpu; preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) goto out; cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->record_disabled)) goto out; if (length > buffer->max_data_size) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) goto out; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out_unlock; body = rb_event_data(event); memcpy(body, data, length); rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); ret = 0; out_unlock: trace_recursive_unlock(cpu_buffer); out: preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_write); /* * The total entries in the ring buffer is the running counter * of entries entered into the ring buffer, minus the sum of * the entries read from the ring buffer and the number of * entries that were overwritten. */ static inline unsigned long rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) { return local_read(&cpu_buffer->entries) - (local_read(&cpu_buffer->overrun) + cpu_buffer->read); } static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { return !rb_num_of_entries(cpu_buffer); } /** * ring_buffer_record_disable - stop all writes into the buffer * @buffer: The ring buffer to stop writes to. * * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable(struct trace_buffer *buffer) { atomic_inc(&buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable); /** * ring_buffer_record_enable - enable writes to the buffer * @buffer: The ring buffer to enable writes * * Note, multiple disables will need the same number of enables * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable(struct trace_buffer *buffer) { atomic_dec(&buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable); /** * ring_buffer_record_off - stop all writes into the buffer * @buffer: The ring buffer to stop writes to. * * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * * This is different than ring_buffer_record_disable() as * it works like an on/off switch, where as the disable() version * must be paired with a enable(). */ void ring_buffer_record_off(struct trace_buffer *buffer) { unsigned int rd; unsigned int new_rd; rd = atomic_read(&buffer->record_disabled); do { new_rd = rd | RB_BUFFER_OFF; } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd)); } EXPORT_SYMBOL_GPL(ring_buffer_record_off); /** * ring_buffer_record_on - restart writes into the buffer * @buffer: The ring buffer to start writes to. * * This enables all writes to the buffer that was disabled by * ring_buffer_record_off(). * * This is different than ring_buffer_record_enable() as * it works like an on/off switch, where as the enable() version * must be paired with a disable(). */ void ring_buffer_record_on(struct trace_buffer *buffer) { unsigned int rd; unsigned int new_rd; rd = atomic_read(&buffer->record_disabled); do { new_rd = rd & ~RB_BUFFER_OFF; } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd)); } EXPORT_SYMBOL_GPL(ring_buffer_record_on); /** * ring_buffer_record_is_on - return true if the ring buffer can write * @buffer: The ring buffer to see if write is enabled * * Returns true if the ring buffer is in a state that it accepts writes. */ bool ring_buffer_record_is_on(struct trace_buffer *buffer) { return !atomic_read(&buffer->record_disabled); } /** * ring_buffer_record_is_set_on - return true if the ring buffer is set writable * @buffer: The ring buffer to see if write is set enabled * * Returns true if the ring buffer is set writable by ring_buffer_record_on(). * Note that this does NOT mean it is in a writable state. * * It may return true when the ring buffer has been disabled by * ring_buffer_record_disable(), as that is a temporary disabling of * the ring buffer. */ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer) { return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); } /** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. * @cpu: The CPU buffer to stop * * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); /** * ring_buffer_record_enable_cpu - enable writes to the buffer * @buffer: The ring buffer to enable writes * @cpu: The CPU to enable. * * Note, multiple disables will need the same number of enables * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; atomic_dec(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); /** * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu) { unsigned long flags; struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; u64 ret = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* * if the tail is on reader_page, oldest time stamp is on the reader * page */ if (cpu_buffer->tail_page == cpu_buffer->reader_page) bpage = cpu_buffer->reader_page; else bpage = rb_set_head_page(cpu_buffer); if (bpage) ret = bpage->page->time_stamp; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); /** * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ unsigned long ring_buffer_bytes_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; return ret; } EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); /** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to get the entries from. */ unsigned long ring_buffer_entries_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; return rb_num_of_entries(cpu_buffer); } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); /** * ring_buffer_overrun_cpu - get the number of overruns caused by the ring * buffer wrapping around (only if RB_FL_OVERWRITE is on). * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long ring_buffer_overrun_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); /** * ring_buffer_commit_overrun_cpu - get the number of overruns caused by * commits failing due to the buffer wrapping around while there are uncommitted * events, such as during an interrupt storm. * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->commit_overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); /** * ring_buffer_dropped_events_cpu - get the number of dropped events caused by * the ring buffer filling up (only if RB_FL_OVERWRITE is off). * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->dropped_events); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); /** * ring_buffer_read_events_cpu - get the number of events successfully read * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of events read */ unsigned long ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; return cpu_buffer->read; } EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); /** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer * * Returns the total number of entries in the ring buffer * (all CPU entries) */ unsigned long ring_buffer_entries(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long entries = 0; int cpu; /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += rb_num_of_entries(cpu_buffer); } return entries; } EXPORT_SYMBOL_GPL(ring_buffer_entries); /** * ring_buffer_overruns - get the number of overruns in buffer * @buffer: The ring buffer * * Returns the total number of overruns in the ring buffer * (all CPU entries) */ unsigned long ring_buffer_overruns(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long overruns = 0; int cpu; /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; overruns += local_read(&cpu_buffer->overrun); } return overruns; } EXPORT_SYMBOL_GPL(ring_buffer_overruns); static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* Iterator usage is expected to have record disabled */ iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; iter->next_event = iter->head; iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; iter->cache_pages_removed = cpu_buffer->pages_removed; if (iter->head) { iter->read_stamp = cpu_buffer->read_stamp; iter->page_stamp = cpu_buffer->reader_page->page->time_stamp; } else { iter->read_stamp = iter->head_page->page->time_stamp; iter->page_stamp = iter->read_stamp; } } /** * ring_buffer_iter_reset - reset an iterator * @iter: The iterator to reset * * Resets the iterator, so that it will start from the beginning * again. */ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; if (!iter) return; cpu_buffer = iter->cpu_buffer; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); /** * ring_buffer_iter_empty - check if an iterator has no more to read * @iter: The iterator to check */ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *reader; struct buffer_page *head_page; struct buffer_page *commit_page; struct buffer_page *curr_commit_page; unsigned commit; u64 curr_commit_ts; u64 commit_ts; cpu_buffer = iter->cpu_buffer; reader = cpu_buffer->reader_page; head_page = cpu_buffer->head_page; commit_page = READ_ONCE(cpu_buffer->commit_page); commit_ts = commit_page->page->time_stamp; /* * When the writer goes across pages, it issues a cmpxchg which * is a mb(), which will synchronize with the rmb here. * (see rb_tail_page_update()) */ smp_rmb(); commit = rb_page_commit(commit_page); /* We want to make sure that the commit page doesn't change */ smp_rmb(); /* Make sure commit page didn't change */ curr_commit_page = READ_ONCE(cpu_buffer->commit_page); curr_commit_ts = READ_ONCE(curr_commit_page->page->time_stamp); /* If the commit page changed, then there's more data */ if (curr_commit_page != commit_page || curr_commit_ts != commit_ts) return 0; /* Still racy, as it may return a false positive, but that's OK */ return ((iter->head_page == commit_page && iter->head >= commit) || (iter->head_page == reader && commit_page == head_page && head_page->read == commit && iter->head == rb_page_size(cpu_buffer->reader_page))); } EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); static void rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { u64 delta; switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); cpu_buffer->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp); cpu_buffer->read_stamp = delta; return; case RINGBUF_TYPE_DATA: cpu_buffer->read_stamp += event->time_delta; return; default: RB_WARN_ON(cpu_buffer, 1); } } static void rb_update_iter_read_stamp(struct ring_buffer_iter *iter, struct ring_buffer_event *event) { u64 delta; switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); iter->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); delta = rb_fix_abs_ts(delta, iter->read_stamp); iter->read_stamp = delta; return; case RINGBUF_TYPE_DATA: iter->read_stamp += event->time_delta; return; default: RB_WARN_ON(iter->cpu_buffer, 1); } } static struct buffer_page * rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); unsigned long overwrite; unsigned long flags; int nr_loops = 0; bool ret; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); again: /* * This should normally only loop twice. But because the * start of the reader inserts an empty page, it causes * a case where we will loop three times. There should be no * reason to loop four times (that I know of). */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { reader = NULL; goto out; } reader = cpu_buffer->reader_page; /* If there's more to read, return this page */ if (cpu_buffer->reader_page->read < rb_page_size(reader)) goto out; /* Never should we have an index greater than the size */ if (RB_WARN_ON(cpu_buffer, cpu_buffer->reader_page->read > rb_page_size(reader))) goto out; /* check if we caught up to the tail */ reader = NULL; if (cpu_buffer->commit_page == cpu_buffer->reader_page) goto out; /* Don't bother swapping if the ring buffer is empty */ if (rb_num_of_entries(cpu_buffer) == 0) goto out; /* * Reset the reader page to size zero. */ local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->real_end = 0; spin: /* * Splice the empty reader page into the list around the head. */ reader = rb_set_head_page(cpu_buffer); if (!reader) goto out; cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); cpu_buffer->reader_page->list.prev = reader->list.prev; /* * cpu_buffer->pages just needs to point to the buffer, it * has no specific buffer page to point to. Lets move it out * of our way so we don't accidentally swap it. */ cpu_buffer->pages = reader->list.prev; /* The reader page will be pointing to the new head */ rb_set_list_to_head(&cpu_buffer->reader_page->list); /* * We want to make sure we read the overruns after we set up our * pointers to the next object. The writer side does a * cmpxchg to cross pages which acts as the mb on the writer * side. Note, the reader will constantly fail the swap * while the writer is updating the pointers, so this * guarantees that the overwrite recorded here is the one we * want to compare with the last_overrun. */ smp_mb(); overwrite = local_read(&(cpu_buffer->overrun)); /* * Here's the tricky part. * * We need to move the pointer past the header page. * But we can only do that if a writer is not currently * moving it. The page before the header page has the * flag bit '1' set if it is pointing to the page we want. * but if the writer is in the process of moving it * than it will be '2' or already moved '0'. */ ret = rb_head_page_replace(reader, cpu_buffer->reader_page); /* * If we did not convert it, then we must try again. */ if (!ret) goto spin; if (cpu_buffer->ring_meta) rb_update_meta_reader(cpu_buffer, reader); /* * Yay! We succeeded in replacing the page. * * Now make the new head point back to the reader page. */ rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; rb_inc_page(&cpu_buffer->head_page); cpu_buffer->cnt++; local_inc(&cpu_buffer->pages_read); /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; cpu_buffer->reader_page->read = 0; if (overwrite != cpu_buffer->last_overrun) { cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; cpu_buffer->last_overrun = overwrite; } goto again; out: /* Update the read_stamp on the first event */ if (reader && reader->read == 0) cpu_buffer->read_stamp = reader->page->time_stamp; arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); /* * The writer has preempt disable, wait for it. But not forever * Although, 1 second is pretty much "forever" */ #define USECS_WAIT 1000000 for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { /* If the write is past the end of page, a writer is still updating it */ if (likely(!reader || rb_page_write(reader) <= bsize)) break; udelay(1); /* Get the latest version of the reader write value */ smp_rmb(); } /* The writer is not moving forward? Something is wrong */ if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT)) reader = NULL; /* * Make sure we see any padding after the write update * (see rb_reset_tail()). * * In addition, a writer may be writing on the reader page * if the page has not been fully filled, so the read barrier * is also needed to make sure we see the content of what is * committed by the writer (see rb_set_commit_to_write()). */ smp_rmb(); return reader; } static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_event *event; struct buffer_page *reader; unsigned length; reader = rb_get_reader_page(cpu_buffer); /* This function should not be called when buffer is empty */ if (RB_WARN_ON(cpu_buffer, !reader)) return; event = rb_reader_event(cpu_buffer); if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX) cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); length = rb_event_length(event); cpu_buffer->reader_page->read += length; cpu_buffer->read_bytes += length; } static void rb_advance_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; cpu_buffer = iter->cpu_buffer; /* If head == next_event then we need to jump to the next event */ if (iter->head == iter->next_event) { /* If the event gets overwritten again, there's nothing to do */ if (rb_iter_head_event(iter) == NULL) return; } iter->head = iter->next_event; /* * Check if we are at the end of the buffer. */ if (iter->next_event >= rb_page_size(iter->head_page)) { /* discarded commits can make the page empty */ if (iter->head_page == cpu_buffer->commit_page) return; rb_inc_iter(iter); return; } rb_update_iter_read_stamp(iter, iter->event); } static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) { return cpu_buffer->lost_events; } static struct ring_buffer_event * rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; struct buffer_page *reader; int nr_loops = 0; if (ts) *ts = 0; again: /* * We repeat when a time extend is encountered. * Since the time extend is always attached to a data event, * we should never loop more than once. * (We never hit the following condition more than twice). */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; reader = rb_get_reader_page(cpu_buffer); if (!reader) return NULL; event = rb_reader_event(cpu_buffer); switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) RB_WARN_ON(cpu_buffer, 1); /* * Because the writer could be discarding every * event it creates (which would probably be bad) * if we were to go back to "again" then we may never * catch up, and will trigger the warn on, or lock * the box. Return the padding, and we will release * the current locks, and try again. */ return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } /* Internal data, OK to advance */ rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_DATA: if (ts && !(*ts)) { *ts = cpu_buffer->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } if (lost_events) *lost_events = rb_lost_events(cpu_buffer); return event; default: RB_WARN_ON(cpu_buffer, 1); } return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_peek); static struct ring_buffer_event * rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) { struct trace_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; int nr_loops = 0; if (ts) *ts = 0; cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; /* * Check if someone performed a consuming read to the buffer * or removed some pages from the buffer. In these cases, * iterator was invalidated and we need to reset it. */ if (unlikely(iter->cache_read != cpu_buffer->read || iter->cache_reader_page != cpu_buffer->reader_page || iter->cache_pages_removed != cpu_buffer->pages_removed)) rb_iter_reset(iter); again: if (ring_buffer_iter_empty(iter)) return NULL; /* * As the writer can mess with what the iterator is trying * to read, just give up if we fail to get an event after * three tries. The iterator is not as reliable when reading * the ring buffer with an active write as the consumer is. * Do not warn if the three failures is reached. */ if (++nr_loops > 3) return NULL; if (rb_per_cpu_empty(cpu_buffer)) return NULL; if (iter->head >= rb_page_size(iter->head_page)) { rb_inc_iter(iter); goto again; } event = rb_iter_head_event(iter); if (!event) goto again; switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) { rb_inc_iter(iter); goto again; } rb_advance_iter(iter); return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ rb_advance_iter(iter); goto again; case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } /* Internal data, OK to advance */ rb_advance_iter(iter); goto again; case RINGBUF_TYPE_DATA: if (ts && !(*ts)) { *ts = iter->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(buffer, cpu_buffer->cpu, ts); } return event; default: RB_WARN_ON(cpu_buffer, 1); } return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); static inline bool rb_reader_lock(struct ring_buffer_per_cpu *cpu_buffer) { if (likely(!in_nmi())) { raw_spin_lock(&cpu_buffer->reader_lock); return true; } /* * If an NMI die dumps out the content of the ring buffer * trylock must be used to prevent a deadlock if the NMI * preempted a task that holds the ring buffer locks. If * we get the lock then all is fine, if not, then continue * to do the read, but this can corrupt the ring buffer, * so it must be permanently disabled from future writes. * Reading from NMI is a oneshot deal. */ if (raw_spin_trylock(&cpu_buffer->reader_lock)) return true; /* Continue without locking, but disable the ring buffer */ atomic_inc(&cpu_buffer->record_disabled); return false; } static inline void rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked) { if (likely(locked)) raw_spin_unlock(&cpu_buffer->reader_lock); } /** * ring_buffer_peek - peek at the next event to be read * @buffer: The ring buffer to read * @cpu: The cpu to peak at * @ts: The timestamp counter of this event. * @lost_events: a variable to store if events were lost (may be NULL) * * This will return the event that will be read next, but does * not consume the data. */ struct ring_buffer_event * ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; unsigned long flags; bool dolock; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; again: local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; return event; } /** ring_buffer_iter_dropped - report if there are dropped events * @iter: The ring buffer iterator * * Returns true if there was dropped events since the last peek. */ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter) { bool ret = iter->missed_events != 0; iter->missed_events = 0; return ret; } EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped); /** * ring_buffer_iter_peek - peek at the next event to be read * @iter: The ring buffer iterator * @ts: The timestamp counter of this event. * * This will return the event that will be read next, but does * not increment the iterator. */ struct ring_buffer_event * ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; struct ring_buffer_event *event; unsigned long flags; again: raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; return event; } /** * ring_buffer_consume - return an event and consume it * @buffer: The ring buffer to get the next event from * @cpu: the cpu to read the buffer from * @ts: a variable to store the timestamp (may be NULL) * @lost_events: a variable to store if events were lost (may be NULL) * * Returns the next event in the ring buffer, and that event is consumed. * Meaning, that sequential reads will keep returning a different event, * and eventually empty the ring buffer if the producer is slower. */ struct ring_buffer_event * ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; unsigned long flags; bool dolock; again: /* might be called in atomic */ preempt_disable(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event) { cpu_buffer->lost_events = 0; rb_advance_reader(cpu_buffer); } rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); out: preempt_enable(); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); /** * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * @flags: gfp flags to use for memory allocation * * This performs the initial preparations necessary to iterate * through the buffer. Memory is allocated, buffer resizing * is disabled, and the iterator pointer is returned to the caller. * * After a sequence of ring_buffer_read_prepare calls, the user is * expected to make at least one call to ring_buffer_read_prepare_sync. * Afterwards, ring_buffer_read_start is invoked to get things going * for real. * * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; iter = kzalloc(sizeof(*iter), flags); if (!iter) return NULL; /* Holds the entire event: data and meta data */ iter->event_size = buffer->subbuf_size; iter->event = kmalloc(iter->event_size, flags); if (!iter->event) { kfree(iter); return NULL; } cpu_buffer = buffer->buffers[cpu]; iter->cpu_buffer = cpu_buffer; atomic_inc(&cpu_buffer->resize_disabled); return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); /** * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls * * All previously invoked ring_buffer_read_prepare calls to prepare * iterators will be synchronized. Afterwards, read_buffer_read_start * calls on those iterators are allowed. */ void ring_buffer_read_prepare_sync(void) { synchronize_rcu(); } EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); /** * ring_buffer_read_start - start a non consuming read of the buffer * @iter: The iterator returned by ring_buffer_read_prepare * * This finalizes the startup of an iteration through the buffer. * The iterator comes from a call to ring_buffer_read_prepare and * an intervening ring_buffer_read_prepare_sync must have been * performed. * * Must be paired with ring_buffer_read_finish. */ void ring_buffer_read_start(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; if (!iter) return; cpu_buffer = iter->cpu_buffer; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_read_start); /** * ring_buffer_read_finish - finish reading the iterator of the buffer * @iter: The iterator retrieved by ring_buffer_start * * This re-enables resizing of the buffer, and frees the iterator. */ void ring_buffer_read_finish(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* Use this opportunity to check the integrity of the ring buffer. */ rb_check_pages(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); kfree(iter->event); kfree(iter); } EXPORT_SYMBOL_GPL(ring_buffer_read_finish); /** * ring_buffer_iter_advance - advance the iterator to the next location * @iter: The ring buffer iterator * * Move the location of the iterator such that the next read will * be the next location of the iterator. */ void ring_buffer_iter_advance(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_advance_iter(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_advance); /** * ring_buffer_size - return the size of the ring buffer (in bytes) * @buffer: The ring buffer. * @cpu: The CPU to get ring buffer size from. */ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages; } EXPORT_SYMBOL_GPL(ring_buffer_size); /** * ring_buffer_max_event_size - return the max data size of an event * @buffer: The ring buffer. * * Returns the maximum size an event can be. */ unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer) { /* If abs timestamp is requested, events have a timestamp too */ if (ring_buffer_time_stamp_abs(buffer)) return buffer->max_data_size - RB_LEN_TIME_EXTEND; return buffer->max_data_size; } EXPORT_SYMBOL_GPL(ring_buffer_max_event_size); static void rb_clear_buffer_page(struct buffer_page *page) { local_set(&page->write, 0); local_set(&page->entries, 0); rb_init_page(page->page); page->read = 0; } static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; if (!meta) return; meta->reader.read = cpu_buffer->reader_page->read; meta->reader.id = cpu_buffer->reader_page->id; meta->reader.lost_events = cpu_buffer->lost_events; meta->entries = local_read(&cpu_buffer->entries); meta->overrun = local_read(&cpu_buffer->overrun); meta->read = cpu_buffer->read; /* Some archs do not have data cache coherency between kernel and user-space */ flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); } static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *page; rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); rb_clear_buffer_page(cpu_buffer->head_page); list_for_each_entry(page, cpu_buffer->pages, list) { rb_clear_buffer_page(page); } cpu_buffer->tail_page = cpu_buffer->head_page; cpu_buffer->commit_page = cpu_buffer->head_page; INIT_LIST_HEAD(&cpu_buffer->reader_page->list); INIT_LIST_HEAD(&cpu_buffer->new_pages); rb_clear_buffer_page(cpu_buffer->reader_page); local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->commit_overrun, 0); local_set(&cpu_buffer->dropped_events, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); local_set(&cpu_buffer->pages_touched, 0); local_set(&cpu_buffer->pages_lost, 0); local_set(&cpu_buffer->pages_read, 0); cpu_buffer->last_pages_touch = 0; cpu_buffer->shortest_full = 0; cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; rb_time_set(&cpu_buffer->write_stamp, 0); rb_time_set(&cpu_buffer->before_stamp, 0); memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp)); cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; rb_head_page_activate(cpu_buffer); cpu_buffer->pages_removed = 0; if (cpu_buffer->mapped) { rb_update_meta_page(cpu_buffer); if (cpu_buffer->ring_meta) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = meta->head_buffer; } } } /* Must have disabled the cpu buffer then done a synchronize_rcu */ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) goto out; arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); arch_spin_unlock(&cpu_buffer->lock); out: raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of * @cpu: The CPU buffer to be reset */ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_meta *meta; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); /* Make sure all commits have finished */ synchronize_rcu(); reset_disabled_cpu_buffer(cpu_buffer); atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); /* Make sure persistent meta now uses this buffer's addresses */ meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); if (meta) rb_meta_init_text_addr(meta); mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /* Flag to ensure proper resetting of atomic variables */ #define RESET_BIT (1 << 30) /** * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of */ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_meta *meta; int cpu; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); for_each_online_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; atomic_add(RESET_BIT, &cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); } /* Make sure all commits have finished */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; /* * If a CPU came online during the synchronize_rcu(), then * ignore it. */ if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT)) continue; reset_disabled_cpu_buffer(cpu_buffer); /* Make sure persistent meta now uses this buffer's addresses */ meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); if (meta) rb_meta_init_text_addr(meta); atomic_dec(&cpu_buffer->record_disabled); atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); } mutex_unlock(&buffer->mutex); } /** * ring_buffer_reset - reset a ring buffer * @buffer: The ring buffer to reset all cpu buffers */ void ring_buffer_reset(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); } /* Make sure all commits have finished */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; reset_disabled_cpu_buffer(cpu_buffer); atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); } mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset); /** * ring_buffer_empty - is the ring buffer empty? * @buffer: The ring buffer to test */ bool ring_buffer_empty(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; bool dolock; bool ret; int cpu; /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); ret = rb_per_cpu_empty(cpu_buffer); rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); if (!ret) return false; } return true; } EXPORT_SYMBOL_GPL(ring_buffer_empty); /** * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? * @buffer: The ring buffer * @cpu: The CPU buffer to test */ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; bool dolock; bool ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return true; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); ret = rb_per_cpu_empty(cpu_buffer); rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers * @buffer_a: One buffer to swap with * @buffer_b: The other buffer to swap with * @cpu: the CPU of the buffers to swap * * This function is useful for tracers that want to take a "snapshot" * of a CPU buffer and has another back up buffer lying around. * it is expected that the tracer handles the cpu buffer not being * used at the moment. */ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, struct trace_buffer *buffer_b, int cpu) { struct ring_buffer_per_cpu *cpu_buffer_a; struct ring_buffer_per_cpu *cpu_buffer_b; int ret = -EINVAL; if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) goto out; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; /* It's up to the callers to not try to swap mapped buffers */ if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { ret = -EBUSY; goto out; } /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; if (buffer_a->subbuf_order != buffer_b->subbuf_order) goto out; ret = -EAGAIN; if (atomic_read(&buffer_a->record_disabled)) goto out; if (atomic_read(&buffer_b->record_disabled)) goto out; if (atomic_read(&cpu_buffer_a->record_disabled)) goto out; if (atomic_read(&cpu_buffer_b->record_disabled)) goto out; /* * We can't do a synchronize_rcu here because this * function can be called in atomic context. * Normally this will be called from the same CPU as cpu. * If not it's up to the caller to protect this. */ atomic_inc(&cpu_buffer_a->record_disabled); atomic_inc(&cpu_buffer_b->record_disabled); ret = -EBUSY; if (local_read(&cpu_buffer_a->committing)) goto out_dec; if (local_read(&cpu_buffer_b->committing)) goto out_dec; /* * When resize is in progress, we cannot swap it because * it will mess the state of the cpu buffer. */ if (atomic_read(&buffer_a->resizing)) goto out_dec; if (atomic_read(&buffer_b->resizing)) goto out_dec; buffer_a->buffers[cpu] = cpu_buffer_b; buffer_b->buffers[cpu] = cpu_buffer_a; cpu_buffer_b->buffer = buffer_a; cpu_buffer_a->buffer = buffer_b; ret = 0; out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); #endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */ /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. * @cpu: the cpu buffer to allocate. * * This function is used in conjunction with ring_buffer_read_page. * When reading a full page from the ring buffer, these functions * can be used to speed up the process. The calling function should * allocate a few pages first with this function. Then when it * needs to get pages from the ring buffer, it passes the result * of this function into ring_buffer_read_page, which will swap * the page that was allocated, with the read page of the buffer. * * Returns: * The page allocated, or ERR_PTR */ struct buffer_data_read_page * ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_read_page *bpage = NULL; unsigned long flags; struct page *page; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-ENODEV); bpage = kzalloc(sizeof(*bpage), GFP_KERNEL); if (!bpage) return ERR_PTR(-ENOMEM); bpage->order = buffer->subbuf_order; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); if (cpu_buffer->free_page) { bpage->data = cpu_buffer->free_page; cpu_buffer->free_page = NULL; } arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); if (bpage->data) goto out; page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) { kfree(bpage); return ERR_PTR(-ENOMEM); } bpage->data = page_address(page); out: rb_init_page(bpage->data); return bpage; } EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for * @cpu: the cpu buffer the page came from * @data_page: the page to free * * Free a page allocated from ring_buffer_alloc_read_page. */ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, struct buffer_data_read_page *data_page) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = data_page->data; struct page *page = virt_to_page(bpage); unsigned long flags; if (!buffer || !buffer->buffers || !buffer->buffers[cpu]) return; cpu_buffer = buffer->buffers[cpu]; /* * If the page is still in use someplace else, or order of the page * is different from the subbuffer order of the buffer - * we can't reuse it */ if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order) goto out; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); if (!cpu_buffer->free_page) { cpu_buffer->free_page = bpage; bpage = NULL; } arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); out: free_pages((unsigned long)bpage, data_page->order); kfree(data_page); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); /** * ring_buffer_read_page - extract a page from the ring buffer * @buffer: buffer to extract from * @data_page: the page to use allocated from ring_buffer_alloc_read_page * @len: amount to extract * @cpu: the cpu of the buffer to extract * @full: should the extraction only happen when the page is full. * * This function will pull out a page from the ring buffer and consume it. * @data_page must be the address of the variable that was returned * from ring_buffer_alloc_read_page. This is because the page might be used * to swap with a page in the ring buffer. * * for example: * rpage = ring_buffer_alloc_read_page(buffer, cpu); * if (IS_ERR(rpage)) * return PTR_ERR(rpage); * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0); * if (ret >= 0) * process_page(ring_buffer_read_page_data(rpage), ret); * ring_buffer_free_read_page(buffer, cpu, rpage); * * When @full is set, the function will not return true unless * the writer is off the reader page. * * Note: it is up to the calling functions to handle sleeps and wakeups. * The ring buffer can be used anywhere in the kernel and can not * blindly call wake_up. The layer that uses the ring buffer must be * responsible for that. * * Returns: * >=0 if data has been transferred, returns the offset of consumed data. * <0 if no data has been transferred. */ int ring_buffer_read_page(struct trace_buffer *buffer, struct buffer_data_read_page *data_page, size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; struct buffer_data_page *bpage; struct buffer_page *reader; unsigned long missed_events; unsigned long flags; unsigned int commit; unsigned int read; u64 save_timestamp; int ret = -1; if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; /* * If len is not big enough to hold the page header, then * we can not copy anything. */ if (len <= BUF_PAGE_HDR_SIZE) goto out; len -= BUF_PAGE_HDR_SIZE; if (!data_page || !data_page->data) goto out; if (data_page->order != buffer->subbuf_order) goto out; bpage = data_page->data; if (!bpage) goto out; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) goto out_unlock; event = rb_reader_event(cpu_buffer); read = reader->read; commit = rb_page_size(reader); /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; /* * If this page has been partially read or * if len is not big enough to read the rest of the page or * a writer is still on the page, then * we must copy the data from the page to the buffer. * Otherwise, we can simply swap the page with the one passed in. */ if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page || cpu_buffer->mapped) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; unsigned int size; /* * If a full page is expected, this can still be returned * if there's been a previous partial read and the * rest of the page can be read and the commit page is off * the reader page. */ if (full && (!read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page)) goto out_unlock; if (len > (commit - read)) len = (commit - read); /* Always keep the time extend and data together */ size = rb_event_ts_length(event); if (len < size) goto out_unlock; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; /* Need to copy one event at a time */ do { /* We need the size of one event, because * rb_advance_reader only advances by one event, * whereas rb_event_ts_length may include the size of * one or two events. * We have already ensured there's enough space if this * is a time extend. */ size = rb_event_length(event); memcpy(bpage->data + pos, rpage->data + rpos, size); len -= size; rb_advance_reader(cpu_buffer); rpos = reader->read; pos += size; if (rpos >= commit) break; event = rb_reader_event(cpu_buffer); /* Always keep the time extend and data together */ size = rb_event_ts_length(event); } while (len >= size); /* update bpage */ local_set(&bpage->commit, pos); bpage->time_stamp = save_timestamp; /* we copied everything to the beginning */ read = 0; } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); cpu_buffer->read_bytes += rb_page_size(reader); /* swap the pages */ rb_init_page(bpage); bpage = reader->page; reader->page = data_page->data; local_set(&reader->write, 0); local_set(&reader->entries, 0); reader->read = 0; data_page->data = bpage; /* * Use the real_end for the data size, * This gives us a chance to store the lost events * on the page. */ if (reader->real_end) local_set(&bpage->commit, reader->real_end); } ret = read; cpu_buffer->lost_events = 0; commit = local_read(&bpage->commit); /* * Set a flag in the commit field if we lost events */ if (missed_events) { /* If there is room at the end of the page to save the * missed events, then record it there. */ if (buffer->subbuf_size - commit >= sizeof(missed_events)) { memcpy(&bpage->data[commit], &missed_events, sizeof(missed_events)); local_add(RB_MISSED_STORED, &bpage->commit); commit += sizeof(missed_events); } local_add(RB_MISSED_EVENTS, &bpage->commit); } /* * This page may be off to user land. Zero it out here. */ if (commit < buffer->subbuf_size) memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); out_unlock: raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_read_page); /** * ring_buffer_read_page_data - get pointer to the data in the page. * @page: the page to get the data from * * Returns pointer to the actual data in this page. */ void *ring_buffer_read_page_data(struct buffer_data_read_page *page) { return page->data; } EXPORT_SYMBOL_GPL(ring_buffer_read_page_data); /** * ring_buffer_subbuf_size_get - get size of the sub buffer. * @buffer: the buffer to get the sub buffer size from * * Returns size of the sub buffer, in bytes. */ int ring_buffer_subbuf_size_get(struct trace_buffer *buffer) { return buffer->subbuf_size + BUF_PAGE_HDR_SIZE; } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get); /** * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page. * @buffer: The ring_buffer to get the system sub page order from * * By default, one ring buffer sub page equals to one system page. This parameter * is configurable, per ring buffer. The size of the ring buffer sub page can be * extended, but must be an order of system page size. * * Returns the order of buffer sub page size, in system pages: * 0 means the sub buffer size is 1 system page and so forth. * In case of an error < 0 is returned. */ int ring_buffer_subbuf_order_get(struct trace_buffer *buffer) { if (!buffer) return -EINVAL; return buffer->subbuf_order; } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get); /** * ring_buffer_subbuf_order_set - set the size of ring buffer sub page. * @buffer: The ring_buffer to set the new page size. * @order: Order of the system pages in one sub buffer page * * By default, one ring buffer pages equals to one system page. This API can be * used to set new size of the ring buffer page. The size must be order of * system page size, that's why the input parameter @order is the order of * system pages that are allocated for one ring buffer page: * 0 - 1 system page * 1 - 2 system pages * 3 - 4 system pages * ... * * Returns 0 on success or < 0 in case of an error. */ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage, *tmp; int old_order, old_size; int nr_pages; int psize; int err; int cpu; if (!buffer || order < 0) return -EINVAL; if (buffer->subbuf_order == order) return 0; psize = (1 << order) * PAGE_SIZE; if (psize <= BUF_PAGE_HDR_SIZE) return -EINVAL; /* Size of a subbuf cannot be greater than the write counter */ if (psize > RB_WRITE_MASK + 1) return -EINVAL; old_order = buffer->subbuf_order; old_size = buffer->subbuf_size; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->record_disabled); /* Make sure all commits have finished */ synchronize_rcu(); buffer->subbuf_order = order; buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE; /* Make sure all new buffers are allocated, before deleting the old ones */ for_each_buffer_cpu(buffer, cpu) { if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; cpu_buffer = buffer->buffers[cpu]; if (cpu_buffer->mapped) { err = -EBUSY; goto error; } /* Update the number of pages to match the new size */ nr_pages = old_size * buffer->buffers[cpu]->nr_pages; nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); /* we need a minimum of two pages */ if (nr_pages < 2) nr_pages = 2; cpu_buffer->nr_pages_to_update = nr_pages; /* Include the reader page */ nr_pages++; /* Allocate the new size buffer */ INIT_LIST_HEAD(&cpu_buffer->new_pages); if (__rb_allocate_pages(cpu_buffer, nr_pages, &cpu_buffer->new_pages)) { /* not enough memory for new pages */ err = -ENOMEM; goto error; } } for_each_buffer_cpu(buffer, cpu) { struct buffer_data_page *old_free_data_page; struct list_head old_pages; unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; cpu_buffer = buffer->buffers[cpu]; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* Clear the head bit to make the link list normal to read */ rb_head_page_deactivate(cpu_buffer); /* * Collect buffers from the cpu_buffer pages list and the * reader_page on old_pages, so they can be freed later when not * under a spinlock. The pages list is a linked list with no * head, adding old_pages turns it into a regular list with * old_pages being the head. */ list_add(&old_pages, cpu_buffer->pages); list_add(&cpu_buffer->reader_page->list, &old_pages); /* One page was allocated for the reader page */ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, struct buffer_page, list); list_del_init(&cpu_buffer->reader_page->list); /* Install the new pages, remove the head from the list */ cpu_buffer->pages = cpu_buffer->new_pages.next; list_del_init(&cpu_buffer->new_pages); cpu_buffer->cnt++; cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; cpu_buffer->nr_pages_to_update = 0; old_free_data_page = cpu_buffer->free_page; cpu_buffer->free_page = NULL; rb_head_page_activate(cpu_buffer); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); /* Free old sub buffers */ list_for_each_entry_safe(bpage, tmp, &old_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } free_pages((unsigned long)old_free_data_page, old_order); rb_check_pages(cpu_buffer); } atomic_dec(&buffer->record_disabled); mutex_unlock(&buffer->mutex); return 0; error: buffer->subbuf_order = old_order; buffer->subbuf_size = old_size; atomic_dec(&buffer->record_disabled); mutex_unlock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!cpu_buffer->nr_pages_to_update) continue; list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } } return err; } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct page *page; if (cpu_buffer->meta_page) return 0; page = alloc_page(GFP_USER | __GFP_ZERO); if (!page) return -ENOMEM; cpu_buffer->meta_page = page_to_virt(page); return 0; } static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long addr = (unsigned long)cpu_buffer->meta_page; free_page(addr); cpu_buffer->meta_page = NULL; } static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, unsigned long *subbuf_ids) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; struct buffer_page *first_subbuf, *subbuf; int id = 0; subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; cpu_buffer->reader_page->id = id++; first_subbuf = subbuf = rb_set_head_page(cpu_buffer); do { if (WARN_ON(id >= nr_subbufs)) break; subbuf_ids[id] = (unsigned long)subbuf->page; subbuf->id = id; rb_inc_page(&subbuf); id++; } while (subbuf != first_subbuf); /* install subbuf ID to kern VA translation */ cpu_buffer->subbuf_ids = subbuf_ids; meta->meta_struct_len = sizeof(*meta); meta->nr_subbufs = nr_subbufs; meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; meta->meta_page_size = meta->subbuf_size; rb_update_meta_page(cpu_buffer); } static struct ring_buffer_per_cpu * rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-EINVAL); cpu_buffer = buffer->buffers[cpu]; mutex_lock(&cpu_buffer->mapping_lock); if (!cpu_buffer->user_mapped) { mutex_unlock(&cpu_buffer->mapping_lock); return ERR_PTR(-ENODEV); } return cpu_buffer; } static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) { mutex_unlock(&cpu_buffer->mapping_lock); } /* * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need * to be set-up or torn-down. */ static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, bool inc) { unsigned long flags; lockdep_assert_held(&cpu_buffer->mapping_lock); /* mapped is always greater or equal to user_mapped */ if (WARN_ON(cpu_buffer->mapped < cpu_buffer->user_mapped)) return -EINVAL; if (inc && cpu_buffer->mapped == UINT_MAX) return -EBUSY; if (WARN_ON(!inc && cpu_buffer->user_mapped == 0)) return -EINVAL; mutex_lock(&cpu_buffer->buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (inc) { cpu_buffer->user_mapped++; cpu_buffer->mapped++; } else { cpu_buffer->user_mapped--; cpu_buffer->mapped--; } raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); mutex_unlock(&cpu_buffer->buffer->mutex); return 0; } /* * +--------------+ pgoff == 0 * | meta page | * +--------------+ pgoff == 1 * | subbuffer 0 | * | | * +--------------+ pgoff == (1 + (1 << subbuf_order)) * | subbuffer 1 | * | | * ... */ #ifdef CONFIG_MMU static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, struct vm_area_struct *vma) { unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff; unsigned int subbuf_pages, subbuf_order; struct page **pages; int p = 0, s = 0; int err; /* Refuse MP_PRIVATE or writable mappings */ if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC || !(vma->vm_flags & VM_MAYSHARE)) return -EPERM; subbuf_order = cpu_buffer->buffer->subbuf_order; subbuf_pages = 1 << subbuf_order; if (subbuf_order && pgoff % subbuf_pages) return -EINVAL; /* * Make sure the mapping cannot become writable later. Also tell the VM * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). */ vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP, VM_MAYWRITE); lockdep_assert_held(&cpu_buffer->mapping_lock); nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ nr_pages = ((nr_subbufs + 1) << subbuf_order); /* + meta-page */ if (nr_pages <= pgoff) return -EINVAL; nr_pages -= pgoff; nr_vma_pages = vma_pages(vma); if (!nr_vma_pages || nr_vma_pages > nr_pages) return -EINVAL; nr_pages = nr_vma_pages; pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); if (!pages) return -ENOMEM; if (!pgoff) { unsigned long meta_page_padding; pages[p++] = virt_to_page(cpu_buffer->meta_page); /* * Pad with the zero-page to align the meta-page with the * sub-buffers. */ meta_page_padding = subbuf_pages - 1; while (meta_page_padding-- && p < nr_pages) { unsigned long __maybe_unused zero_addr = vma->vm_start + (PAGE_SIZE * p); pages[p++] = ZERO_PAGE(zero_addr); } } else { /* Skip the meta-page */ pgoff -= subbuf_pages; s += pgoff / subbuf_pages; } while (p < nr_pages) { struct page *page; int off = 0; if (WARN_ON_ONCE(s >= nr_subbufs)) { err = -EINVAL; goto out; } page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); for (; off < (1 << (subbuf_order)); off++, page++) { if (p >= nr_pages) break; pages[p++] = page; } s++; } err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); out: kfree(pages); return err; } #else static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, struct vm_area_struct *vma) { return -EOPNOTSUPP; } #endif int ring_buffer_map(struct trace_buffer *buffer, int cpu, struct vm_area_struct *vma) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags, *subbuf_ids; int err = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; mutex_lock(&cpu_buffer->mapping_lock); if (cpu_buffer->user_mapped) { err = __rb_map_vma(cpu_buffer, vma); if (!err) err = __rb_inc_dec_mapped(cpu_buffer, true); mutex_unlock(&cpu_buffer->mapping_lock); return err; } /* prevent another thread from changing buffer/sub-buffer sizes */ mutex_lock(&buffer->mutex); err = rb_alloc_meta_page(cpu_buffer); if (err) goto unlock; /* subbuf_ids include the reader while nr_pages does not */ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); if (!subbuf_ids) { rb_free_meta_page(cpu_buffer); err = -ENOMEM; goto unlock; } atomic_inc(&cpu_buffer->resize_disabled); /* * Lock all readers to block any subbuf swap until the subbuf IDs are * assigned. */ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_setup_ids_meta_page(cpu_buffer, subbuf_ids); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); err = __rb_map_vma(cpu_buffer, vma); if (!err) { raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* This is the first time it is mapped by user */ cpu_buffer->mapped++; cpu_buffer->user_mapped = 1; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } else { kfree(cpu_buffer->subbuf_ids); cpu_buffer->subbuf_ids = NULL; rb_free_meta_page(cpu_buffer); } unlock: mutex_unlock(&buffer->mutex); mutex_unlock(&cpu_buffer->mapping_lock); return err; } int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; int err = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; mutex_lock(&cpu_buffer->mapping_lock); if (!cpu_buffer->user_mapped) { err = -ENODEV; goto out; } else if (cpu_buffer->user_mapped > 1) { __rb_inc_dec_mapped(cpu_buffer, false); goto out; } mutex_lock(&buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* This is the last user space mapping */ if (!WARN_ON_ONCE(cpu_buffer->mapped < cpu_buffer->user_mapped)) cpu_buffer->mapped--; cpu_buffer->user_mapped = 0; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); kfree(cpu_buffer->subbuf_ids); cpu_buffer->subbuf_ids = NULL; rb_free_meta_page(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); mutex_unlock(&buffer->mutex); out: mutex_unlock(&cpu_buffer->mapping_lock); return err; } int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *reader; unsigned long missed_events; unsigned long reader_size; unsigned long flags; cpu_buffer = rb_get_mapped_buffer(buffer, cpu); if (IS_ERR(cpu_buffer)) return (int)PTR_ERR(cpu_buffer); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); consume: if (rb_per_cpu_empty(cpu_buffer)) goto out; reader_size = rb_page_size(cpu_buffer->reader_page); /* * There are data to be read on the current reader page, we can * return to the caller. But before that, we assume the latter will read * everything. Let's update the kernel reader accordingly. */ if (cpu_buffer->reader_page->read < reader_size) { while (cpu_buffer->reader_page->read < reader_size) rb_advance_reader(cpu_buffer); goto out; } reader = rb_get_reader_page(cpu_buffer); if (WARN_ON(!reader)) goto out; /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; if (cpu_buffer->reader_page != cpu_buffer->commit_page) { if (missed_events) { struct buffer_data_page *bpage = reader->page; unsigned int commit; /* * Use the real_end for the data size, * This gives us a chance to store the lost events * on the page. */ if (reader->real_end) local_set(&bpage->commit, reader->real_end); /* * If there is room at the end of the page to save the * missed events, then record it there. */ commit = rb_page_size(reader); if (buffer->subbuf_size - commit >= sizeof(missed_events)) { memcpy(&bpage->data[commit], &missed_events, sizeof(missed_events)); local_add(RB_MISSED_STORED, &bpage->commit); } local_add(RB_MISSED_EVENTS, &bpage->commit); } } else { /* * There really shouldn't be any missed events if the commit * is on the reader page. */ WARN_ON_ONCE(missed_events); } cpu_buffer->lost_events = 0; goto consume; out: /* Some archs do not have data cache coherency between kernel and user-space */ flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); rb_update_meta_page(cpu_buffer); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); rb_put_mapped_buffer(cpu_buffer); return 0; } /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in * the buffer. */ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) { struct trace_buffer *buffer; long nr_pages_same; int cpu_i; unsigned long nr_pages; buffer = container_of(node, struct trace_buffer, node); if (cpumask_test_cpu(cpu, buffer->cpumask)) return 0; nr_pages = 0; nr_pages_same = 1; /* check if all cpu sizes are same */ for_each_buffer_cpu(buffer, cpu_i) { /* fill in the size from first enabled cpu */ if (nr_pages == 0) nr_pages = buffer->buffers[cpu_i]->nr_pages; if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { nr_pages_same = 0; break; } } /* allocate minimum pages, user can later expand it */ if (!nr_pages_same) nr_pages = 2; buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) { WARN(1, "failed to allocate ring buffer on CPU %u\n", cpu); return -ENOMEM; } smp_wmb(); cpumask_set_cpu(cpu, buffer->cpumask); return 0; } #ifdef CONFIG_RING_BUFFER_STARTUP_TEST /* * This is a basic integrity check of the ring buffer. * Late in the boot cycle this test will run when configured in. * It will kick off a thread per CPU that will go into a loop * writing to the per cpu ring buffer various sizes of data. * Some of the data will be large items, some small. * * Another thread is created that goes into a spin, sending out * IPIs to the other CPUs to also write into the ring buffer. * this is to test the nesting ability of the buffer. * * Basic stats are recorded and reported. If something in the * ring buffer should happen that's not expected, a big warning * is displayed and all ring buffers are disabled. */ static struct task_struct *rb_threads[NR_CPUS] __initdata; struct rb_test_data { struct trace_buffer *buffer; unsigned long events; unsigned long bytes_written; unsigned long bytes_alloc; unsigned long bytes_dropped; unsigned long events_nested; unsigned long bytes_written_nested; unsigned long bytes_alloc_nested; unsigned long bytes_dropped_nested; int min_size_nested; int max_size_nested; int max_size; int min_size; int cpu; int cnt; }; static struct rb_test_data rb_data[NR_CPUS] __initdata; /* 1 meg per cpu */ #define RB_TEST_BUFFER_SIZE 1048576 static char rb_string[] __initdata = "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\" "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890" "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv"; static bool rb_test_started __initdata; struct rb_item { int size; char str[]; }; static __init int rb_write_something(struct rb_test_data *data, bool nested) { struct ring_buffer_event *event; struct rb_item *item; bool started; int event_len; int size; int len; int cnt; /* Have nested writes different that what is written */ cnt = data->cnt + (nested ? 27 : 0); /* Multiply cnt by ~e, to make some unique increment */ size = (cnt * 68 / 25) % (sizeof(rb_string) - 1); len = size + sizeof(struct rb_item); started = rb_test_started; /* read rb_test_started before checking buffer enabled */ smp_rmb(); event = ring_buffer_lock_reserve(data->buffer, len); if (!event) { /* Ignore dropped events before test starts. */ if (started) { if (nested) data->bytes_dropped += len; else data->bytes_dropped_nested += len; } return len; } event_len = ring_buffer_event_length(event); if (RB_WARN_ON(data->buffer, event_len < len)) goto out; item = ring_buffer_event_data(event); item->size = size; memcpy(item->str, rb_string, size); if (nested) { data->bytes_alloc_nested += event_len; data->bytes_written_nested += len; data->events_nested++; if (!data->min_size_nested || len < data->min_size_nested) data->min_size_nested = len; if (len > data->max_size_nested) data->max_size_nested = len; } else { data->bytes_alloc += event_len; data->bytes_written += len; data->events++; if (!data->min_size || len < data->min_size) data->max_size = len; if (len > data->max_size) data->max_size = len; } out: ring_buffer_unlock_commit(data->buffer); return 0; } static __init int rb_test(void *arg) { struct rb_test_data *data = arg; while (!kthread_should_stop()) { rb_write_something(data, false); data->cnt++; set_current_state(TASK_INTERRUPTIBLE); /* Now sleep between a min of 100-300us and a max of 1ms */ usleep_range(((data->cnt % 3) + 1) * 100, 1000); } return 0; } static __init void rb_ipi(void *ignore) { struct rb_test_data *data; int cpu = smp_processor_id(); data = &rb_data[cpu]; rb_write_something(data, true); } static __init int rb_hammer_test(void *arg) { while (!kthread_should_stop()) { /* Send an IPI to all cpus to write data! */ smp_call_function(rb_ipi, NULL, 1); /* No sleep, but for non preempt, let others run */ schedule(); } return 0; } static __init int test_ringbuffer(void) { struct task_struct *rb_hammer; struct trace_buffer *buffer; int cpu; int ret = 0; if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Lockdown is enabled, skipping ring buffer tests\n"); return 0; } pr_info("Running ring buffer tests...\n"); buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); if (WARN_ON(!buffer)) return 0; /* Disable buffer so that threads can't write to it yet */ ring_buffer_record_off(buffer); for_each_online_cpu(cpu) { rb_data[cpu].buffer = buffer; rb_data[cpu].cpu = cpu; rb_data[cpu].cnt = cpu; rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu], cpu, "rbtester/%u"); if (WARN_ON(IS_ERR(rb_threads[cpu]))) { pr_cont("FAILED\n"); ret = PTR_ERR(rb_threads[cpu]); goto out_free; } } /* Now create the rb hammer! */ rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); if (WARN_ON(IS_ERR(rb_hammer))) { pr_cont("FAILED\n"); ret = PTR_ERR(rb_hammer); goto out_free; } ring_buffer_record_on(buffer); /* * Show buffer is enabled before setting rb_test_started. * Yes there's a small race window where events could be * dropped and the thread wont catch it. But when a ring * buffer gets enabled, there will always be some kind of * delay before other CPUs see it. Thus, we don't care about * those dropped events. We care about events dropped after * the threads see that the buffer is active. */ smp_wmb(); rb_test_started = true; set_current_state(TASK_INTERRUPTIBLE); /* Just run for 10 seconds */; schedule_timeout(10 * HZ); kthread_stop(rb_hammer); out_free: for_each_online_cpu(cpu) { if (!rb_threads[cpu]) break; kthread_stop(rb_threads[cpu]); } if (ret) { ring_buffer_free(buffer); return ret; } /* Report! */ pr_info("finished\n"); for_each_online_cpu(cpu) { struct ring_buffer_event *event; struct rb_test_data *data = &rb_data[cpu]; struct rb_item *item; unsigned long total_events; unsigned long total_dropped; unsigned long total_written; unsigned long total_alloc; unsigned long total_read = 0; unsigned long total_size = 0; unsigned long total_len = 0; unsigned long total_lost = 0; unsigned long lost; int big_event_size; int small_event_size; ret = -1; total_events = data->events + data->events_nested; total_written = data->bytes_written + data->bytes_written_nested; total_alloc = data->bytes_alloc + data->bytes_alloc_nested; total_dropped = data->bytes_dropped + data->bytes_dropped_nested; big_event_size = data->max_size + data->max_size_nested; small_event_size = data->min_size + data->min_size_nested; pr_info("CPU %d:\n", cpu); pr_info(" events: %ld\n", total_events); pr_info(" dropped bytes: %ld\n", total_dropped); pr_info(" alloced bytes: %ld\n", total_alloc); pr_info(" written bytes: %ld\n", total_written); pr_info(" biggest event: %d\n", big_event_size); pr_info(" smallest event: %d\n", small_event_size); if (RB_WARN_ON(buffer, total_dropped)) break; ret = 0; while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { total_lost += lost; item = ring_buffer_event_data(event); total_len += ring_buffer_event_length(event); total_size += item->size + sizeof(struct rb_item); if (memcmp(&item->str[0], rb_string, item->size) != 0) { pr_info("FAILED!\n"); pr_info("buffer had: %.*s\n", item->size, item->str); pr_info("expected: %.*s\n", item->size, rb_string); RB_WARN_ON(buffer, 1); ret = -1; break; } total_read++; } if (ret) break; ret = -1; pr_info(" read events: %ld\n", total_read); pr_info(" lost events: %ld\n", total_lost); pr_info(" total events: %ld\n", total_lost + total_read); pr_info(" recorded len bytes: %ld\n", total_len); pr_info(" recorded size bytes: %ld\n", total_size); if (total_lost) { pr_info(" With dropped events, record len and size may not match\n" " alloced and written from above\n"); } else { if (RB_WARN_ON(buffer, total_len != total_alloc || total_size != total_written)) break; } if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) break; ret = 0; } if (!ret) pr_info("Ring buffer PASSED!\n"); ring_buffer_free(buffer); return 0; } late_initcall(test_ringbuffer); #endif /* CONFIG_RING_BUFFER_STARTUP_TEST */
8 193 6 191 1 6 11 116 217 217 8 8 8 108 210 108 20 12 112 8 8 297 297 10 297 290 10 297 297 8 7 1 17 17 17 17 17 1 17 108 108 108 1 107 90 7 107 106 107 10 108 108 107 3 210 210 210 108 210 210 210 213 1 145 3 67 210 108 108 11 6 5 12 12 23 11 12 8 8 8 210 210 210 65 65 290 283 8 291 319 319 255 65 6 2 2 249 9 240 15 3 230 191 59 50 83 235 76 188 10 8 205 8 200 3 2 206 23 23 180 8 8 8 77 61 137 133 4 93 49 2 3 44 137 44 66 4 53 3 2 116 6 93 3 28 5 44 4 2 31 31 118 117 1 114 3 95 96 207 3 163 1 162 60 117 2 48 30 128 128 128 6 115 7 2 119 6 124 48 67 115 11 6 5 126 76 12 160 5 5 49 37 2 4 3 37 27 14 3 4 7 4 7 2 9 99 35 11 8 2 6 1 8 27 36 36 10 140 129 128 129 61 99 38 7 13 77 24 4 54 5 5 46 3 6 83 12 82 83 2 53 38 16 29 25 3 28 3 23 8 96 97 93 11 150 25 129 14 20 17 9 6 14 1 1 642 605 4 17 5 2 4 3 3 3 66 66 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET4: Implementation of BSD Unix domain sockets. * * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk> * * Fixes: * Linus Torvalds : Assorted bug cures. * Niibe Yutaka : async I/O support. * Carsten Paeth : PF_UNIX check, address fixes. * Alan Cox : Limit size of allocated blocks. * Alan Cox : Fixed the stupid socketpair bug. * Alan Cox : BSD compatibility fine tuning. * Alan Cox : Fixed a bug in connect when interrupted. * Alan Cox : Sorted out a proper draft version of * file descriptor passing hacked up from * Mike Shaver's work. * Marty Leisner : Fixes to fd passing * Nick Nevin : recvmsg bugfix. * Alan Cox : Started proper garbage collector * Heiko EiBfeldt : Missing verify_area check * Alan Cox : Started POSIXisms * Andreas Schwab : Replace inode by dentry for proper * reference counting * Kirk Petersen : Made this a module * Christoph Rohland : Elegant non-blocking accept/connect algorithm. * Lots of bug fixes. * Alexey Kuznetosv : Repaired (I hope) bugs introduces * by above two patches. * Andrea Arcangeli : If possible we block in connect(2) * if the max backlog of the listen socket * is been reached. This won't break * old apps and it will avoid huge amount * of socks hashed (this for unix_gc() * performances reasons). * Security fix that limits the max * number of socks to 2*max_files and * the number of skb queueable in the * dgram receiver. * Artur Skawina : Hash function optimizations * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) * Malcolm Beattie : Set peercred for socketpair * Michal Ostrowski : Module initialization cleanup. * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT, * the core infrastructure is doing that * for all net proto families now (2.5.69+) * * Known differences from reference BSD that was tested: * * [TO FIX] * ECONNREFUSED is not returned from one end of a connected() socket to the * other the moment one end closes. * fstat() doesn't return st_dev=0, and give the blksize as high water mark * and a fake inode identifier (nor the BSD first socket fstat twice bug). * [NOT TO FIX] * accept() returns a path name even if the connecting socket has closed * in the meantime (BSD loses the path and gives up). * accept() returns 0 length path for an unbound connector. BSD returns 16 * and a null first byte in the path (but not for gethost/peername - BSD bug ??) * socketpair(...SOCK_RAW..) doesn't panic the kernel. * BSD af_unix apparently has connect forgetting to block properly. * (need to check this with the POSIX spec in detail) * * Differences from 2.0.0-11-... (ANK) * Bug fixes and improvements. * - client shutdown killed server socket. * - removed all useless cli/sti pairs. * * Semantic changes/extensions. * - generic control message passing. * - SCM_CREDENTIALS control message. * - "Abstract" (not FS based) socket bindings. * Abstract names are sequences of bytes (not zero terminated) * started by 0, so that this name space does not intersect * with BSD names. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/signal.h> #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/fcntl.h> #include <linux/filter.h> #include <linux/termios.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/af_unix.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/scm.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/rtnetlink.h> #include <linux/mount.h> #include <net/checksum.h> #include <linux/security.h> #include <linux/splice.h> #include <linux/freezer.h> #include <linux/file.h> #include <linux/btf_ids.h> #include <linux/bpf-cgroup.h> static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; /* SMP locking strategy: * hash table is protected with spinlock. * each socket state is protected by separate spinlock. */ #ifdef CONFIG_PROVE_LOCKING #define cmp_ptr(l, r) (((l) > (r)) - ((l) < (r))) static int unix_table_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { return cmp_ptr(a, b); } static int unix_state_lock_cmp_fn(const struct lockdep_map *_a, const struct lockdep_map *_b) { const struct unix_sock *a, *b; a = container_of(_a, struct unix_sock, lock.dep_map); b = container_of(_b, struct unix_sock, lock.dep_map); if (a->sk.sk_state == TCP_LISTEN) { /* unix_stream_connect(): Before the 2nd unix_state_lock(), * * 1. a is TCP_LISTEN. * 2. b is not a. * 3. concurrent connect(b -> a) must fail. * * Except for 2. & 3., the b's state can be any possible * value due to concurrent connect() or listen(). * * 2. is detected in debug_spin_lock_before(), and 3. cannot * be expressed as lock_cmp_fn. */ switch (b->sk.sk_state) { case TCP_CLOSE: case TCP_ESTABLISHED: case TCP_LISTEN: return -1; default: /* Invalid case. */ return 0; } } /* Should never happen. Just to be symmetric. */ if (b->sk.sk_state == TCP_LISTEN) { switch (b->sk.sk_state) { case TCP_CLOSE: case TCP_ESTABLISHED: return 1; default: return 0; } } /* unix_state_double_lock(): ascending address order. */ return cmp_ptr(a, b); } static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a, const struct lockdep_map *_b) { const struct sock *a, *b; a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map); b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map); /* unix_collect_skb(): listener -> embryo order. */ if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a) return -1; /* Should never happen. Just to be symmetric. */ if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b) return 1; return 0; } #endif static unsigned int unix_unbound_hash(struct sock *sk) { unsigned long hash = (unsigned long)sk; hash ^= hash >> 16; hash ^= hash >> 8; hash ^= sk->sk_type; return hash & UNIX_HASH_MOD; } static unsigned int unix_bsd_hash(struct inode *i) { return i->i_ino & UNIX_HASH_MOD; } static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, int addr_len, int type) { __wsum csum = csum_partial(sunaddr, addr_len, 0); unsigned int hash; hash = (__force unsigned int)csum_fold(csum); hash ^= hash >> 8; hash ^= type; return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); } static void unix_table_double_lock(struct net *net, unsigned int hash1, unsigned int hash2) { if (hash1 == hash2) { spin_lock(&net->unx.table.locks[hash1]); return; } if (hash1 > hash2) swap(hash1, hash2); spin_lock(&net->unx.table.locks[hash1]); spin_lock(&net->unx.table.locks[hash2]); } static void unix_table_double_unlock(struct net *net, unsigned int hash1, unsigned int hash2) { if (hash1 == hash2) { spin_unlock(&net->unx.table.locks[hash1]); return; } spin_unlock(&net->unx.table.locks[hash1]); spin_unlock(&net->unx.table.locks[hash2]); } #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { UNIXCB(skb).secid = scm->secid; } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { scm->secid = UNIXCB(skb).secid; } static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) { return (scm->secid == UNIXCB(skb).secid); } #else static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) { return true; } #endif /* CONFIG_SECURITY_NETWORK */ static inline int unix_may_send(struct sock *sk, struct sock *osk) { return !unix_peer(osk) || unix_peer(osk) == sk; } static inline int unix_recvq_full_lockless(const struct sock *sk) { return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; } struct sock *unix_peer_get(struct sock *s) { struct sock *peer; unix_state_lock(s); peer = unix_peer(s); if (peer) sock_hold(peer); unix_state_unlock(s); return peer; } EXPORT_SYMBOL_GPL(unix_peer_get); static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, int addr_len) { struct unix_address *addr; addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); if (!addr) return NULL; refcount_set(&addr->refcnt, 1); addr->len = addr_len; memcpy(addr->name, sunaddr, addr_len); return addr; } static inline void unix_release_addr(struct unix_address *addr) { if (refcount_dec_and_test(&addr->refcnt)) kfree(addr); } /* * Check unix socket name: * - should be not zero length. * - if started by not zero, should be NULL terminated (FS object) * - if started by zero, it is abstract name. */ static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) { if (addr_len <= offsetof(struct sockaddr_un, sun_path) || addr_len > sizeof(*sunaddr)) return -EINVAL; if (sunaddr->sun_family != AF_UNIX) return -EINVAL; return 0; } static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) { struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr; short offset = offsetof(struct sockaddr_storage, __data); BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path)); /* This may look like an off by one error but it is a bit more * subtle. 108 is the longest valid AF_UNIX path for a binding. * sun_path[108] doesn't as such exist. However in kernel space * we are guaranteed that it is a valid memory location in our * kernel address buffer because syscall functions always pass * a pointer of struct sockaddr_storage which has a bigger buffer * than 108. Also, we must terminate sun_path for strlen() in * getname_kernel(). */ addr->__data[addr_len - offset] = 0; /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen() * know the actual buffer. */ return strlen(addr->__data) + offset + 1; } static void __unix_remove_socket(struct sock *sk) { sk_del_node_init(sk); } static void __unix_insert_socket(struct net *net, struct sock *sk) { DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); } static void __unix_set_addr_hash(struct net *net, struct sock *sk, struct unix_address *addr, unsigned int hash) { __unix_remove_socket(sk); smp_store_release(&unix_sk(sk)->addr, addr); sk->sk_hash = hash; __unix_insert_socket(net, sk); } static void unix_remove_socket(struct net *net, struct sock *sk) { spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_remove_socket(sk); spin_unlock(&net->unx.table.locks[sk->sk_hash]); } static void unix_insert_unbound_socket(struct net *net, struct sock *sk) { spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_insert_socket(net, sk); spin_unlock(&net->unx.table.locks[sk->sk_hash]); } static void unix_insert_bsd_socket(struct sock *sk) { spin_lock(&bsd_socket_locks[sk->sk_hash]); sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); spin_unlock(&bsd_socket_locks[sk->sk_hash]); } static void unix_remove_bsd_socket(struct sock *sk) { if (!hlist_unhashed(&sk->sk_bind_node)) { spin_lock(&bsd_socket_locks[sk->sk_hash]); __sk_del_bind_node(sk); spin_unlock(&bsd_socket_locks[sk->sk_hash]); sk_node_init(&sk->sk_bind_node); } } static struct sock *__unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, int len, unsigned int hash) { struct sock *s; sk_for_each(s, &net->unx.table.buckets[hash]) { struct unix_sock *u = unix_sk(s); if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) return s; } return NULL; } static inline struct sock *unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, int len, unsigned int hash) { struct sock *s; spin_lock(&net->unx.table.locks[hash]); s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); spin_unlock(&net->unx.table.locks[hash]); return s; } static struct sock *unix_find_socket_byinode(struct inode *i) { unsigned int hash = unix_bsd_hash(i); struct sock *s; spin_lock(&bsd_socket_locks[hash]); sk_for_each_bound(s, &bsd_socket_buckets[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); spin_unlock(&bsd_socket_locks[hash]); return s; } } spin_unlock(&bsd_socket_locks[hash]); return NULL; } /* Support code for asymmetrically connected dgram sockets * * If a datagram socket is connected to a socket not itself connected * to the first socket (eg, /dev/log), clients may only enqueue more * messages if the present receive queue of the server socket is not * "too large". This means there's a second writeability condition * poll and sendmsg need to test. The dgram recv code will do a wake * up on the peer_wait wait queue of a socket upon reception of a * datagram which needs to be propagated to sleeping would-be writers * since these might not have sent anything so far. This can't be * accomplished via poll_wait because the lifetime of the server * socket might be less than that of its clients if these break their * association with it or if the server socket is closed while clients * are still connected to it and there's no way to inform "a polling * implementation" that it should let go of a certain wait queue * * In order to propagate a wake up, a wait_queue_entry_t of the client * socket is enqueued on the peer_wait queue of the server socket * whose wake function does a wake_up on the ordinary client socket * wait queue. This connection is established whenever a write (or * poll for write) hit the flow control condition and broken when the * association to the server socket is dissolved or after a wake up * was relayed. */ static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, void *key) { struct unix_sock *u; wait_queue_head_t *u_sleep; u = container_of(q, struct unix_sock, peer_wake); __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, q); u->peer_wake.private = NULL; /* relaying can only happen while the wq still exists */ u_sleep = sk_sleep(&u->sk); if (u_sleep) wake_up_interruptible_poll(u_sleep, key_to_poll(key)); return 0; } static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) { struct unix_sock *u, *u_other; int rc; u = unix_sk(sk); u_other = unix_sk(other); rc = 0; spin_lock(&u_other->peer_wait.lock); if (!u->peer_wake.private) { u->peer_wake.private = other; __add_wait_queue(&u_other->peer_wait, &u->peer_wake); rc = 1; } spin_unlock(&u_other->peer_wait.lock); return rc; } static void unix_dgram_peer_wake_disconnect(struct sock *sk, struct sock *other) { struct unix_sock *u, *u_other; u = unix_sk(sk); u_other = unix_sk(other); spin_lock(&u_other->peer_wait.lock); if (u->peer_wake.private == other) { __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); u->peer_wake.private = NULL; } spin_unlock(&u_other->peer_wait.lock); } static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, struct sock *other) { unix_dgram_peer_wake_disconnect(sk, other); wake_up_interruptible_poll(sk_sleep(sk), EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); } /* preconditions: * - unix_peer(sk) == other * - association is stable */ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) { int connected; connected = unix_dgram_peer_wake_connect(sk, other); /* If other is SOCK_DEAD, we want to make sure we signal * POLLOUT, such that a subsequent write() can get a * -ECONNREFUSED. Otherwise, if we haven't queued any skbs * to other and its full, we will hang waiting for POLLOUT. */ if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) return 1; if (connected) unix_dgram_peer_wake_disconnect(sk, other); return 0; } static int unix_writable(const struct sock *sk, unsigned char state) { return state != TCP_LISTEN && (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf); } static void unix_write_space(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); if (unix_writable(sk, READ_ONCE(sk->sk_state))) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } /* When dgram socket disconnects (or changes its peer), we clear its receive * queue of packets arrived from previous peer. First, it allows to do * flow control based only on wmem_alloc; second, sk connected to peer * may receive messages only from that peer. */ static void unix_dgram_disconnected(struct sock *sk, struct sock *other) { if (!skb_queue_empty(&sk->sk_receive_queue)) { skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_UNIX_DISCONNECT); wake_up_interruptible_all(&unix_sk(sk)->peer_wait); /* If one link of bidirectional dgram pipe is disconnected, * we signal error. Messages are lost. Do not make this, * when peer was not connected to us. */ if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { WRITE_ONCE(other->sk_err, ECONNRESET); sk_error_report(other); } } } static void unix_sock_destructor(struct sock *sk) { struct unix_sock *u = unix_sk(sk); skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE); DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); if (!sock_flag(sk, SOCK_DEAD)) { pr_info("Attempt to release alive unix socket: %p\n", sk); return; } if (u->addr) unix_release_addr(u->addr); atomic_long_dec(&unix_nr_socks); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); #ifdef UNIX_REFCNT_DEBUG pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, atomic_long_read(&unix_nr_socks)); #endif } static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); struct sock *skpair; struct sk_buff *skb; struct path path; int state; unix_remove_socket(sock_net(sk), sk); unix_remove_bsd_socket(sk); /* Clear state */ unix_state_lock(sk); sock_orphan(sk); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); path = u->path; u->path.dentry = NULL; u->path.mnt = NULL; state = sk->sk_state; WRITE_ONCE(sk->sk_state, TCP_CLOSE); skpair = unix_peer(sk); unix_peer(sk) = NULL; unix_state_unlock(sk); #if IS_ENABLED(CONFIG_AF_UNIX_OOB) u->oob_skb = NULL; #endif wake_up_interruptible_all(&u->peer_wait); if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { unix_state_lock(skpair); /* No more writes */ WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) WRITE_ONCE(skpair->sk_err, ECONNRESET); unix_state_unlock(skpair); skpair->sk_state_change(skpair); sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); } unix_dgram_peer_wake_disconnect(sk, skpair); sock_put(skpair); /* It may now die */ } /* Try to flush out this socket. Throw out buffers at least */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (state == TCP_LISTEN) unix_release_sock(skb->sk, 1); /* passed fds are erased in the kfree_skb hook */ kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); } if (path.dentry) path_put(&path); sock_put(sk); /* ---- Socket is dead now and most probably destroyed ---- */ /* * Fixme: BSD difference: In BSD all sockets connected to us get * ECONNRESET and we die on the spot. In Linux we behave * like files and pipes do and wait for the last * dereference. * * Can't we simply set sock->err? * * What the above comment does talk about? --ANK(980817) */ if (READ_ONCE(unix_tot_inflight)) unix_gc(); /* Garbage collect fds */ } static void init_peercred(struct sock *sk) { sk->sk_peer_pid = get_pid(task_tgid(current)); sk->sk_peer_cred = get_current_cred(); } static void update_peercred(struct sock *sk) { const struct cred *old_cred; struct pid *old_pid; spin_lock(&sk->sk_peer_lock); old_pid = sk->sk_peer_pid; old_cred = sk->sk_peer_cred; init_peercred(sk); spin_unlock(&sk->sk_peer_lock); put_pid(old_pid); put_cred(old_cred); } static void copy_peercred(struct sock *sk, struct sock *peersk) { lockdep_assert_held(&unix_sk(peersk)->lock); spin_lock(&sk->sk_peer_lock); sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); } static int unix_listen(struct socket *sock, int backlog) { int err; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; /* Only stream/seqpacket sockets accept */ err = -EINVAL; if (!READ_ONCE(u->addr)) goto out; /* No listens on an unbound socket */ unix_state_lock(sk); if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) goto out_unlock; if (backlog > sk->sk_max_ack_backlog) wake_up_interruptible_all(&u->peer_wait); sk->sk_max_ack_backlog = backlog; WRITE_ONCE(sk->sk_state, TCP_LISTEN); /* set credentials so connect can copy them */ update_peercred(sk); err = 0; out_unlock: unix_state_unlock(sk); out: return err; } static int unix_release(struct socket *); static int unix_bind(struct socket *, struct sockaddr *, int); static int unix_stream_connect(struct socket *, struct sockaddr *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg); static int unix_getname(struct socket *, struct sockaddr *, int); static __poll_t unix_poll(struct file *, struct socket *, poll_table *); static __poll_t unix_dgram_poll(struct file *, struct socket *, poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); #endif static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, struct pipe_inode_info *, size_t size, unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, int); #ifdef CONFIG_PROC_FS static int unix_count_nr_fds(struct sock *sk) { struct sk_buff *skb; struct unix_sock *u; int nr_fds = 0; spin_lock(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); while (skb) { u = unix_sk(skb->sk); nr_fds += atomic_read(&u->scm_stat.nr_fds); skb = skb_peek_next(skb, &sk->sk_receive_queue); } spin_unlock(&sk->sk_receive_queue.lock); return nr_fds; } static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; unsigned char s_state; struct unix_sock *u; int nr_fds = 0; if (sk) { s_state = READ_ONCE(sk->sk_state); u = unix_sk(sk); /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. * SOCK_DGRAM is ordinary. So, no lock is needed. */ if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) nr_fds = atomic_read(&u->scm_stat.nr_fds); else if (s_state == TCP_LISTEN) nr_fds = unix_count_nr_fds(sk); seq_printf(m, "scm_fds: %u\n", nr_fds); } } #else #define unix_show_fdinfo NULL #endif static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, .bind = unix_bind, .connect = unix_stream_connect, .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, .poll = unix_poll, .ioctl = unix_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = unix_compat_ioctl, #endif .listen = unix_listen, .shutdown = unix_shutdown, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .read_skb = unix_stream_read_skb, .mmap = sock_no_mmap, .splice_read = unix_stream_splice_read, .set_peek_off = sk_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_dgram_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, .bind = unix_bind, .connect = unix_dgram_connect, .socketpair = unix_socketpair, .accept = sock_no_accept, .getname = unix_getname, .poll = unix_dgram_poll, .ioctl = unix_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = unix_compat_ioctl, #endif .listen = sock_no_listen, .shutdown = unix_shutdown, .sendmsg = unix_dgram_sendmsg, .read_skb = unix_read_skb, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, .set_peek_off = sk_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_seqpacket_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, .bind = unix_bind, .connect = unix_stream_connect, .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, .poll = unix_dgram_poll, .ioctl = unix_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = unix_compat_ioctl, #endif .listen = unix_listen, .shutdown = unix_shutdown, .sendmsg = unix_seqpacket_sendmsg, .recvmsg = unix_seqpacket_recvmsg, .mmap = sock_no_mmap, .set_peek_off = sk_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; static void unix_close(struct sock *sk, long timeout) { /* Nothing to do here, unix socket does not need a ->close(). * This is merely for sockmap. */ } static void unix_unhash(struct sock *sk) { /* Nothing to do here, unix socket does not need a ->unhash(). * This is merely for sockmap. */ } static bool unix_bpf_bypass_getsockopt(int level, int optname) { if (level == SOL_SOCKET) { switch (optname) { case SO_PEERPIDFD: return true; default: return false; } } return false; } struct proto unix_dgram_proto = { .name = "UNIX", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_dgram_bpf_update_proto, #endif }; struct proto unix_stream_proto = { .name = "UNIX-STREAM", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, .unhash = unix_unhash, .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_stream_bpf_update_proto, #endif }; static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) { struct unix_sock *u; struct sock *sk; int err; atomic_long_inc(&unix_nr_socks); if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { err = -ENFILE; goto err; } if (type == SOCK_STREAM) sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); else /*dgram and seqpacket */ sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); if (!sk) { err = -ENOMEM; goto err; } sock_init_data(sock, sk); sk->sk_hash = unix_unbound_hash(sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); sk->sk_destruct = unix_sock_destructor; lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL); u = unix_sk(sk); u->listener = NULL; u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_unbound_socket(net, sk); sock_prot_inuse_add(net, sk->sk_prot, 1); return sk; err: atomic_long_dec(&unix_nr_socks); return ERR_PTR(err); } static int unix_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; sock->state = SS_UNCONNECTED; switch (sock->type) { case SOCK_STREAM: sock->ops = &unix_stream_ops; break; /* * Believe it or not BSD has AF_UNIX, SOCK_RAW though * nothing uses it. */ case SOCK_RAW: sock->type = SOCK_DGRAM; fallthrough; case SOCK_DGRAM: sock->ops = &unix_dgram_ops; break; case SOCK_SEQPACKET: sock->ops = &unix_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT; } sk = unix_create1(net, sock, kern, sock->type); if (IS_ERR(sk)) return PTR_ERR(sk); return 0; } static int unix_release(struct socket *sock) { struct sock *sk = sock->sk; if (!sk) return 0; sk->sk_prot->close(sk, 0); unix_release_sock(sk, 0); sock->sk = NULL; return 0; } static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, int type) { struct inode *inode; struct path path; struct sock *sk; int err; unix_mkname_bsd(sunaddr, addr_len); err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); if (err) goto fail; err = path_permission(&path, MAY_WRITE); if (err) goto path_put; err = -ECONNREFUSED; inode = d_backing_inode(path.dentry); if (!S_ISSOCK(inode->i_mode)) goto path_put; sk = unix_find_socket_byinode(inode); if (!sk) goto path_put; err = -EPROTOTYPE; if (sk->sk_type == type) touch_atime(&path); else goto sock_put; path_put(&path); return sk; sock_put: sock_put(sk); path_put: path_put(&path); fail: return ERR_PTR(err); } static struct sock *unix_find_abstract(struct net *net, struct sockaddr_un *sunaddr, int addr_len, int type) { unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); struct dentry *dentry; struct sock *sk; sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); if (!sk) return ERR_PTR(-ECONNREFUSED); dentry = unix_sk(sk)->path.dentry; if (dentry) touch_atime(&unix_sk(sk)->path); return sk; } static struct sock *unix_find_other(struct net *net, struct sockaddr_un *sunaddr, int addr_len, int type) { struct sock *sk; if (sunaddr->sun_path[0]) sk = unix_find_bsd(sunaddr, addr_len, type); else sk = unix_find_abstract(net, sunaddr, addr_len, type); return sk; } static int unix_autobind(struct sock *sk) { struct unix_sock *u = unix_sk(sk); unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct unix_address *addr; u32 lastnum, ordernum; int err; err = mutex_lock_interruptible(&u->bindlock); if (err) return err; if (u->addr) goto out; err = -ENOMEM; addr = kzalloc(sizeof(*addr) + offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); if (!addr) goto out; addr->len = offsetof(struct sockaddr_un, sun_path) + 6; addr->name->sun_family = AF_UNIX; refcount_set(&addr->refcnt, 1); old_hash = sk->sk_hash; ordernum = get_random_u32(); lastnum = ordernum & 0xFFFFF; retry: ordernum = (ordernum + 1) & 0xFFFFF; sprintf(addr->name->sun_path + 1, "%05x", ordernum); new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(net, old_hash, new_hash); if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { unix_table_double_unlock(net, old_hash, new_hash); /* __unix_find_socket_byname() may take long time if many names * are already in use. */ cond_resched(); if (ordernum == lastnum) { /* Give up if all names seems to be in use. */ err = -ENOSPC; unix_release_addr(addr); goto out; } goto retry; } __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); err = 0; out: mutex_unlock(&u->bindlock); return err; } static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); struct unix_sock *u = unix_sk(sk); unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct mnt_idmap *idmap; struct unix_address *addr; struct dentry *dentry; struct path parent; int err; addr_len = unix_mkname_bsd(sunaddr, addr_len); addr = unix_create_addr(sunaddr, addr_len); if (!addr) return -ENOMEM; /* * Get the parent directory, calculate the hash for last * component. */ dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; } /* * All right, let's create it. */ idmap = mnt_idmap(parent.mnt); err = security_path_mknod(&parent, dentry, mode, 0); if (!err) err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); if (err) goto out_path; err = mutex_lock_interruptible(&u->bindlock); if (err) goto out_unlink; if (u->addr) goto out_unlock; old_hash = sk->sk_hash; new_hash = unix_bsd_hash(d_backing_inode(dentry)); unix_table_double_lock(net, old_hash, new_hash); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); unix_insert_bsd_socket(sk); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; out_unlock: mutex_unlock(&u->bindlock); err = -EINVAL; out_unlink: /* failed after successful mknod? unlink what we'd created... */ vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); out_path: done_path_create(&parent, dentry); out: unix_release_addr(addr); return err == -EEXIST ? -EADDRINUSE : err; } static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { struct unix_sock *u = unix_sk(sk); unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct unix_address *addr; int err; addr = unix_create_addr(sunaddr, addr_len); if (!addr) return -ENOMEM; err = mutex_lock_interruptible(&u->bindlock); if (err) goto out; if (u->addr) { err = -EINVAL; goto out_mutex; } old_hash = sk->sk_hash; new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(net, old_hash, new_hash); if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) goto out_spin; __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); mutex_unlock(&u->bindlock); return 0; out_spin: unix_table_double_unlock(net, old_hash, new_hash); err = -EADDRINUSE; out_mutex: mutex_unlock(&u->bindlock); out: unix_release_addr(addr); return err; } static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk; int err; if (addr_len == offsetof(struct sockaddr_un, sun_path) && sunaddr->sun_family == AF_UNIX) return unix_autobind(sk); err = unix_validate_addr(sunaddr, addr_len); if (err) return err; if (sunaddr->sun_path[0]) err = unix_bind_bsd(sk, sunaddr, addr_len); else err = unix_bind_abstract(sk, sunaddr, addr_len); return err; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) { if (unlikely(sk1 == sk2) || !sk2) { unix_state_lock(sk1); return; } if (sk1 > sk2) swap(sk1, sk2); unix_state_lock(sk1); unix_state_lock(sk2); } static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) { if (unlikely(sk1 == sk2) || !sk2) { unix_state_unlock(sk1); return; } unix_state_unlock(sk1); unix_state_unlock(sk2); } static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; struct sock *sk = sock->sk; struct sock *other; int err; err = -EINVAL; if (alen < offsetofend(struct sockaddr, sa_family)) goto out; if (addr->sa_family != AF_UNSPEC) { err = unix_validate_addr(sunaddr, alen); if (err) goto out; err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); if (err) goto out; if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && !READ_ONCE(unix_sk(sk)->addr)) { err = unix_autobind(sk); if (err) goto out; } restart: other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); if (IS_ERR(other)) { err = PTR_ERR(other); goto out; } unix_state_double_lock(sk, other); /* Apparently VFS overslept socket death. Retry. */ if (sock_flag(other, SOCK_DEAD)) { unix_state_double_unlock(sk, other); sock_put(other); goto restart; } err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) goto out_unlock; WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); WRITE_ONCE(other->sk_state, TCP_ESTABLISHED); } else { /* * 1003.1g breaking connected state with AF_UNSPEC */ other = NULL; unix_state_double_lock(sk, other); } /* * If it was connected, reconnect. */ if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); unix_peer(sk) = other; if (!other) WRITE_ONCE(sk->sk_state, TCP_CLOSE); unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); unix_state_double_unlock(sk, other); if (other != old_peer) { unix_dgram_disconnected(sk, old_peer); unix_state_lock(old_peer); if (!unix_peer(old_peer)) WRITE_ONCE(old_peer->sk_state, TCP_CLOSE); unix_state_unlock(old_peer); } sock_put(old_peer); } else { unix_peer(sk) = other; unix_state_double_unlock(sk, other); } return 0; out_unlock: unix_state_double_unlock(sk, other); sock_put(other); out: return err; } static long unix_wait_for_peer(struct sock *other, long timeo) __releases(&unix_sk(other)->lock) { struct unix_sock *u = unix_sk(other); int sched; DEFINE_WAIT(wait); prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); sched = !sock_flag(other, SOCK_DEAD) && !(other->sk_shutdown & RCV_SHUTDOWN) && unix_recvq_full_lockless(other); unix_state_unlock(other); if (sched) timeo = schedule_timeout(timeo); finish_wait(&u->peer_wait, &wait); return timeo; } static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; struct unix_sock *u = unix_sk(sk), *newu, *otheru; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; unsigned char state; long timeo; int err; err = unix_validate_addr(sunaddr, addr_len); if (err) goto out; err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len); if (err) goto out; if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); /* First of all allocate resources. * If we will make it after state is locked, * we will have to recheck all again in any case. */ /* create new sock for complete connection */ newsk = unix_create1(net, NULL, 0, sock->type); if (IS_ERR(newsk)) { err = PTR_ERR(newsk); goto out; } /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (!skb) { err = -ENOMEM; goto out_free_sk; } restart: /* Find listening sock. */ other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); goto out_free_skb; } unix_state_lock(other); /* Apparently VFS overslept socket death. Retry. */ if (sock_flag(other, SOCK_DEAD)) { unix_state_unlock(other); sock_put(other); goto restart; } if (other->sk_state != TCP_LISTEN || other->sk_shutdown & RCV_SHUTDOWN) { err = -ECONNREFUSED; goto out_unlock; } if (unix_recvq_full_lockless(other)) { if (!timeo) { err = -EAGAIN; goto out_unlock; } timeo = unix_wait_for_peer(other, timeo); sock_put(other); err = sock_intr_errno(timeo); if (signal_pending(current)) goto out_free_skb; goto restart; } /* self connect and simultaneous connect are eliminated * by rejecting TCP_LISTEN socket to avoid deadlock. */ state = READ_ONCE(sk->sk_state); if (unlikely(state != TCP_CLOSE)) { err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; goto out_unlock; } unix_state_lock(sk); if (unlikely(sk->sk_state != TCP_CLOSE)) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; unix_state_unlock(sk); goto out_unlock; } err = security_unix_stream_connect(sk, other, newsk); if (err) { unix_state_unlock(sk); goto out_unlock; } /* The way is open! Fastly set all the necessary fields... */ sock_hold(sk); unix_peer(newsk) = sk; newsk->sk_state = TCP_ESTABLISHED; newsk->sk_type = sk->sk_type; init_peercred(newsk); newu = unix_sk(newsk); newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); /* copy address information from listening to new sock * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found * otheru in hash under its lock. Insertion into the * hash chain we'd found it in had been done in an * earlier critical area protected by the chain's lock, * the same one where we'd set *(otheru->addr) contents, * as well as otheru->path and otheru->addr itself. * * Using smp_store_release() here to set newu->addr * is enough to make those stores, as well as stores * to newu->path visible to anyone who gets newu->addr * by smp_load_acquire(). IOW, the same warranties * as for unix_sock instances bound in unix_bind() or * in unix_autobind(). */ if (otheru->path.dentry) { path_get(&otheru->path); newu->path = otheru->path; } refcount_inc(&otheru->addr->refcnt); smp_store_release(&newu->addr, otheru->addr); /* Set credentials */ copy_peercred(sk, other); sock->state = SS_CONNECTED; WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); sock_hold(newsk); smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ unix_peer(sk) = newsk; unix_state_unlock(sk); /* take ten and send info to listening sock */ spin_lock(&other->sk_receive_queue.lock); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); return 0; out_unlock: unix_state_unlock(other); sock_put(other); out_free_skb: consume_skb(skb); out_free_sk: unix_release_sock(newsk, 0); out: return err; } static int unix_socketpair(struct socket *socka, struct socket *sockb) { struct sock *ska = socka->sk, *skb = sockb->sk; /* Join our sockets back to back */ sock_hold(ska); sock_hold(skb); unix_peer(ska) = skb; unix_peer(skb) = ska; init_peercred(ska); init_peercred(skb); ska->sk_state = TCP_ESTABLISHED; skb->sk_state = TCP_ESTABLISHED; socka->state = SS_CONNECTED; sockb->state = SS_CONNECTED; return 0; } static void unix_sock_inherit_flags(const struct socket *old, struct socket *new) { if (test_bit(SOCK_PASSCRED, &old->flags)) set_bit(SOCK_PASSCRED, &new->flags); if (test_bit(SOCK_PASSPIDFD, &old->flags)) set_bit(SOCK_PASSPIDFD, &new->flags); if (test_bit(SOCK_PASSSEC, &old->flags)) set_bit(SOCK_PASSSEC, &new->flags); } static int unix_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sk_buff *skb; struct sock *tsk; arg->err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; arg->err = -EINVAL; if (READ_ONCE(sk->sk_state) != TCP_LISTEN) goto out; /* If socket state is TCP_LISTEN it cannot change (for now...), * so that no locks are necessary. */ skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, &arg->err); if (!skb) { /* This means receive shutdown. */ if (arg->err == 0) arg->err = -EINVAL; goto out; } tsk = skb->sk; skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); /* attach accepted sock to socket */ unix_state_lock(tsk); unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); unix_state_unlock(tsk); return 0; out: return arg->err; } static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct unix_address *addr; DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); int err = 0; if (peer) { sk = unix_peer_get(sk); err = -ENOTCONN; if (!sk) goto out; err = 0; } else { sock_hold(sk); } addr = smp_load_acquire(&unix_sk(sk)->addr); if (!addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; err = offsetof(struct sockaddr_un, sun_path); } else { err = addr->len; memcpy(sunaddr, addr->name, addr->len); if (peer) BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, CGROUP_UNIX_GETPEERNAME); else BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, CGROUP_UNIX_GETSOCKNAME); } sock_put(sk); out: return err; } /* The "user->unix_inflight" variable is protected by the garbage * collection lock, and we just read it locklessly here. If you go * over the limit, there might be a tiny race in actually noticing * it across threads. Tough. */ static inline bool too_many_unix_fds(struct task_struct *p) { struct user_struct *user = current_user(); if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); return false; } static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { if (too_many_unix_fds(current)) return -ETOOMANYREFS; UNIXCB(skb).fp = scm->fp; scm->fp = NULL; if (unix_prepare_fpl(UNIXCB(skb).fp)) return -ENOMEM; return 0; } static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = UNIXCB(skb).fp; UNIXCB(skb).fp = NULL; unix_destroy_fpl(scm->fp); } static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); } static void unix_destruct_scm(struct sk_buff *skb) { struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); scm.pid = UNIXCB(skb).pid; if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); /* Alas, it calls VFS */ /* So fscking what? fput() had been SMP-safe since the last Summer */ scm_destroy(&scm); sock_wfree(skb); } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; UNIXCB(skb).pid = get_pid(scm->pid); UNIXCB(skb).uid = scm->creds.uid; UNIXCB(skb).gid = scm->creds.gid; UNIXCB(skb).fp = NULL; unix_get_secdata(scm, skb); if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); skb->destructor = unix_destruct_scm; return err; } static bool unix_passcred_enabled(const struct socket *sock, const struct sock *other) { return test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags) || !other->sk_socket || test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); } /* * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket * asserted SOCK_PASSCRED. */ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, const struct sock *other) { if (UNIXCB(skb).pid) return; if (unix_passcred_enabled(sock, other)) { UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } } static bool unix_skb_scm_eq(struct sk_buff *skb, struct scm_cookie *scm) { return UNIXCB(skb).pid == scm->pid && uid_eq(UNIXCB(skb).uid, scm->creds.uid) && gid_eq(UNIXCB(skb).gid, scm->creds.gid) && unix_secdata_eq(scm, skb); } static void scm_stat_add(struct sock *sk, struct sk_buff *skb) { struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); if (unlikely(fp && fp->count)) { atomic_add(fp->count, &u->scm_stat.nr_fds); unix_add_edges(fp, u); } } static void scm_stat_del(struct sock *sk, struct sk_buff *skb) { struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); if (unlikely(fp && fp->count)) { atomic_sub(fp->count, &u->scm_stat.nr_fds); unix_del_edges(fp); } } /* * Send AF_UNIX data. */ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk, *other = NULL; struct unix_sock *u = unix_sk(sk); struct scm_cookie scm; struct sk_buff *skb; int data_len = 0; int sk_locked; long timeo; int err; err = scm_send(sock, msg, &scm, false); if (err < 0) return err; wait_for_unix_gc(scm.fp); if (msg->msg_flags & MSG_OOB) { err = -EOPNOTSUPP; goto out; } if (msg->msg_namelen) { err = unix_validate_addr(msg->msg_name, msg->msg_namelen); if (err) goto out; err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, msg->msg_name, &msg->msg_namelen, NULL); if (err) goto out; } if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; } if (len > READ_ONCE(sk->sk_sndbuf) - 32) { err = -EMSGSIZE; goto out; } if (len > SKB_MAX_ALLOC) { data_len = min_t(size_t, len - SKB_MAX_ALLOC, MAX_SKB_FRAGS * PAGE_SIZE); data_len = PAGE_ALIGN(data_len); BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); } skb = sock_alloc_send_pskb(sk, len - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) goto out; err = unix_scm_to_skb(&scm, skb, true); if (err < 0) goto out_free; skb_put(skb, len - data_len); skb->data_len = data_len; skb->len = len; err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); if (err) goto out_free; timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (msg->msg_namelen) { lookup: other = unix_find_other(sock_net(sk), msg->msg_name, msg->msg_namelen, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); goto out_free; } } else { other = unix_peer_get(sk); if (!other) { err = -ENOTCONN; goto out_free; } } if (sk_filter(other, skb) < 0) { /* Toss the packet but do not return any error to the sender */ err = len; goto out_sock_put; } restart: sk_locked = 0; unix_state_lock(other); restart_locked: if (!unix_may_send(sk, other)) { err = -EPERM; goto out_unlock; } if (unlikely(sock_flag(other, SOCK_DEAD))) { /* Check with 1003.1g - what should datagram error */ unix_state_unlock(other); if (sk->sk_type == SOCK_SEQPACKET) { /* We are here only when racing with unix_release_sock() * is clearing @other. Never change state to TCP_CLOSE * unlike SOCK_DGRAM wants. */ err = -EPIPE; goto out_sock_put; } if (!sk_locked) unix_state_lock(sk); if (unix_peer(sk) == other) { unix_peer(sk) = NULL; unix_dgram_peer_wake_disconnect_wakeup(sk, other); WRITE_ONCE(sk->sk_state, TCP_CLOSE); unix_state_unlock(sk); unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; goto out_sock_put; } unix_state_unlock(sk); if (!msg->msg_namelen) { err = -ECONNRESET; goto out_sock_put; } goto lookup; } if (other->sk_shutdown & RCV_SHUTDOWN) { err = -EPIPE; goto out_unlock; } if (sk->sk_type != SOCK_SEQPACKET) { err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) goto out_unlock; } /* other == sk && unix_peer(other) != sk if * - unix_peer(sk) == NULL, destination address bound to sk * - unix_peer(sk) == sk by time of get but disconnected before lock */ if (other != sk && unlikely(unix_peer(other) != sk && unix_recvq_full_lockless(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); err = sock_intr_errno(timeo); if (signal_pending(current)) goto out_sock_put; goto restart; } if (!sk_locked) { unix_state_unlock(other); unix_state_double_lock(sk, other); } if (unix_peer(sk) != other || unix_dgram_peer_wake_me(sk, other)) { err = -EAGAIN; sk_locked = 1; goto out_unlock; } if (!sk_locked) { sk_locked = 1; goto restart_locked; } } if (unlikely(sk_locked)) unix_state_unlock(sk); if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); scm_destroy(&scm); return len; out_unlock: if (sk_locked) unix_state_unlock(sk); unix_state_unlock(other); out_sock_put: sock_put(other); out_free: consume_skb(skb); out: scm_destroy(&scm); return err; } /* We use paged skbs for stream sockets, and limit occupancy to 32768 * bytes, and a minimum of a full page. */ #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) #if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, struct scm_cookie *scm, bool fds_sent) { struct unix_sock *ousk = unix_sk(other); struct sk_buff *skb; int err; skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) return err; err = unix_scm_to_skb(scm, skb, !fds_sent); if (err < 0) goto out; skb_put(skb, 1); err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); if (err) goto out; unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) { unix_state_unlock(other); err = -EPIPE; goto out; } maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); spin_lock(&other->sk_receive_queue.lock); WRITE_ONCE(ousk->oob_skb, skb); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); sk_send_sigurg(other); unix_state_unlock(other); other->sk_data_ready(other); return 0; out: consume_skb(skb); return err; } #endif static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb = NULL; struct sock *other = NULL; struct scm_cookie scm; bool fds_sent = false; int err, sent = 0; err = scm_send(sock, msg, &scm, false); if (err < 0) return err; wait_for_unix_gc(scm.fp); if (msg->msg_flags & MSG_OOB) { err = -EOPNOTSUPP; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (len) len--; else #endif goto out_err; } if (msg->msg_namelen) { err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out_err; } else { other = unix_peer(sk); if (!other) { err = -ENOTCONN; goto out_err; } } if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto out_pipe; while (sent < len) { int size = len - sent; int data_len; if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb = sock_alloc_send_pskb(sk, 0, 0, msg->msg_flags & MSG_DONTWAIT, &err, 0); } else { /* Keep two messages in the pipe so it schedules better */ size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); /* allow fallback to order-0 allocations */ size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); skb = sock_alloc_send_pskb(sk, size - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, get_order(UNIX_SKB_FRAGS_SZ)); } if (!skb) goto out_err; /* Only send the fds in the first buffer */ err = unix_scm_to_skb(&scm, skb, !fds_sent); if (err < 0) goto out_free; fds_sent = true; if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb->ip_summed = CHECKSUM_UNNECESSARY; err = skb_splice_from_iter(skb, &msg->msg_iter, size, sk->sk_allocation); if (err < 0) goto out_free; size = err; refcount_add(size, &sk->sk_wmem_alloc); } else { skb_put(skb, size - data_len); skb->data_len = data_len; skb->len = size; err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) goto out_free; } unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) goto out_pipe_unlock; maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); other->sk_data_ready(other); sent += size; } #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (msg->msg_flags & MSG_OOB) { err = queue_oob(sock, msg, other, &scm, fds_sent); if (err) goto out_err; sent++; } #endif scm_destroy(&scm); return sent; out_pipe_unlock: unix_state_unlock(other); out_pipe: if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; out_free: consume_skb(skb); out_err: scm_destroy(&scm); return sent ? : err; } static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { int err; struct sock *sk = sock->sk; err = sock_error(sk); if (err) return err; if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) return -ENOTCONN; if (msg->msg_namelen) msg->msg_namelen = 0; return unix_dgram_sendmsg(sock, msg, len); } static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) return -ENOTCONN; return unix_dgram_recvmsg(sock, msg, size, flags); } static void unix_copy_addr(struct msghdr *msg, struct sock *sk) { struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); if (addr) { msg->msg_namelen = addr->len; memcpy(msg->msg_name, addr->name, addr->len); } } int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags) { struct scm_cookie scm; struct socket *sock = sk->sk_socket; struct unix_sock *u = unix_sk(sk); struct sk_buff *skb, *last; long timeo; int skip; int err; err = -EOPNOTSUPP; if (flags&MSG_OOB) goto out; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, &skip, &err, &last); if (skb) { if (!(flags & MSG_PEEK)) scm_stat_del(sk, skb); break; } mutex_unlock(&u->iolock); if (err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, &err, &timeo, last)); if (!skb) { /* implies iolock unlocked */ unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && (sk->sk_shutdown & RCV_SHUTDOWN)) err = 0; unix_state_unlock(sk); goto out; } if (wq_has_sleeper(&u->peer_wait)) wake_up_interruptible_sync_poll(&u->peer_wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); if (msg->msg_name) { unix_copy_addr(msg, skb->sk); BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, msg->msg_name, &msg->msg_namelen); } if (size > skb->len - skip) size = skb->len - skip; else if (size < skb->len - skip) msg->msg_flags |= MSG_TRUNC; err = skb_copy_datagram_msg(skb, skip, msg, size); if (err) goto out_free; if (sock_flag(sk, SOCK_RCVTSTAMP)) __sock_recv_timestamp(msg, sk, skb); memset(&scm, 0, sizeof(scm)); scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(&scm, skb); if (!(flags & MSG_PEEK)) { if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); sk_peek_offset_bwd(sk, skb->len); } else { /* It is questionable: on PEEK we could: - do not return fds - good, but too simple 8) - return fds, and do not return them on read (old strategy, apparently wrong) - clone fds (I chose it for now, it is the most universal solution) POSIX 1003.1g does not actually define this clearly at all. POSIX 1003.1g doesn't define a lot of things clearly however! */ sk_peek_offset_fwd(sk, size); if (UNIXCB(skb).fp) unix_peek_fds(&scm, skb); } err = (flags & MSG_TRUNC) ? skb->len - skip : size; scm_recv_unix(sock, msg, &scm, flags); out_free: skb_free_datagram(sk, skb); mutex_unlock(&u->iolock); out: return err; } static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; #ifdef CONFIG_BPF_SYSCALL const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot != &unix_dgram_proto) return prot->recvmsg(sk, msg, size, flags, NULL); #endif return __unix_dgram_recvmsg(sk, msg, size, flags); } static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int err; mutex_lock(&u->iolock); skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); mutex_unlock(&u->iolock); if (!skb) return err; return recv_actor(sk, skb); } /* * Sleep until more data has arrived. But check for races.. */ static long unix_stream_data_wait(struct sock *sk, long timeo, struct sk_buff *last, unsigned int last_len, bool freezable) { unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; struct sk_buff *tail; DEFINE_WAIT(wait); unix_state_lock(sk); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, state); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail != last || (tail && tail->len != last_len) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || !timeo) break; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); timeo = schedule_timeout(timeo); unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) break; sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); unix_state_unlock(sk); return timeo; } static unsigned int unix_skb_len(const struct sk_buff *skb) { return skb->len - UNIXCB(skb).consumed; } struct unix_stream_read_state { int (*recv_actor)(struct sk_buff *, int, int, struct unix_stream_read_state *); struct socket *socket; struct msghdr *msg; struct pipe_inode_info *pipe; size_t size; int flags; unsigned int splice_flags; }; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int unix_stream_recv_urg(struct unix_stream_read_state *state) { struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int chunk = 1; struct sk_buff *oob_skb; mutex_lock(&u->iolock); unix_state_lock(sk); spin_lock(&sk->sk_receive_queue.lock); if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); mutex_unlock(&u->iolock); return -EINVAL; } oob_skb = u->oob_skb; if (!(state->flags & MSG_PEEK)) WRITE_ONCE(u->oob_skb, NULL); spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); chunk = state->recv_actor(oob_skb, 0, chunk, state); if (!(state->flags & MSG_PEEK)) UNIXCB(oob_skb).consumed += 1; mutex_unlock(&u->iolock); if (chunk < 0) return -EFAULT; state->msg->msg_flags |= MSG_OOB; return 1; } static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, int flags, int copied) { struct sk_buff *read_skb = NULL, *unread_skb = NULL; struct unix_sock *u = unix_sk(sk); if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb))) return skb; spin_lock(&sk->sk_receive_queue.lock); if (!unix_skb_len(skb)) { if (copied && (!u->oob_skb || skb == u->oob_skb)) { skb = NULL; } else if (flags & MSG_PEEK) { skb = skb_peek_next(skb, &sk->sk_receive_queue); } else { read_skb = skb; skb = skb_peek_next(skb, &sk->sk_receive_queue); __skb_unlink(read_skb, &sk->sk_receive_queue); } if (!skb) goto unlock; } if (skb != u->oob_skb) goto unlock; if (copied) { skb = NULL; } else if (!(flags & MSG_PEEK)) { WRITE_ONCE(u->oob_skb, NULL); if (!sock_flag(sk, SOCK_URGINLINE)) { __skb_unlink(skb, &sk->sk_receive_queue); unread_skb = skb; skb = skb_peek(&sk->sk_receive_queue); } } else if (!sock_flag(sk, SOCK_URGINLINE)) { skb = skb_peek_next(skb, &sk->sk_receive_queue); } unlock: spin_unlock(&sk->sk_receive_queue.lock); consume_skb(read_skb); kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB); return skb; } #endif static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int err; if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) return -ENOTCONN; mutex_lock(&u->iolock); skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); mutex_unlock(&u->iolock); if (!skb) return err; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (unlikely(skb == READ_ONCE(u->oob_skb))) { bool drop = false; unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { unix_state_unlock(sk); kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); return -ECONNRESET; } spin_lock(&sk->sk_receive_queue.lock); if (likely(skb == u->oob_skb)) { WRITE_ONCE(u->oob_skb, NULL); drop = true; } spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); if (drop) { kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); return -EAGAIN; } } #endif return recv_actor(sk, skb); } static int unix_stream_read_generic(struct unix_stream_read_state *state, bool freezable) { struct scm_cookie scm; struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int copied = 0; int flags = state->flags; int noblock = flags & MSG_DONTWAIT; bool check_creds = false; int target; int err = 0; long timeo; int skip; size_t size = state->size; unsigned int last_len; if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { err = -EINVAL; goto out; } if (unlikely(flags & MSG_OOB)) { err = -EOPNOTSUPP; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) err = unix_stream_recv_urg(state); #endif goto out; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); memset(&scm, 0, sizeof(scm)); /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ mutex_lock(&u->iolock); skip = max(sk_peek_offset(sk, flags), 0); do { struct sk_buff *skb, *last; int chunk; redo: unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; goto unlock; } last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0; again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); if (!skb && copied) { unix_state_unlock(sk); break; } } #endif if (skb == NULL) { if (copied >= target) goto unlock; /* * POSIX 1003.1g mandates this order. */ err = sock_error(sk); if (err) goto unlock; if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; unix_state_unlock(sk); if (!timeo) { err = -EAGAIN; break; } mutex_unlock(&u->iolock); timeo = unix_stream_data_wait(sk, timeo, last, last_len, freezable); if (signal_pending(current)) { err = sock_intr_errno(timeo); scm_destroy(&scm); goto out; } mutex_lock(&u->iolock); goto redo; unlock: unix_state_unlock(sk); break; } while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); last = skb; last_len = skb->len; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; } unix_state_unlock(sk); if (check_creds) { /* Never glue messages from different writers */ if (!unix_skb_scm_eq(skb, &scm)) break; } else if (test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(&scm, skb); check_creds = true; } /* Copy address just once */ if (state->msg && state->msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, state->msg->msg_name); unix_copy_addr(state->msg, skb->sk); BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, state->msg->msg_name, &state->msg->msg_namelen); sunaddr = NULL; } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); chunk = state->recv_actor(skb, skip, chunk, state); if (chunk < 0) { if (copied == 0) copied = -EFAULT; break; } copied += chunk; size -= chunk; /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { UNIXCB(skb).consumed += chunk; sk_peek_offset_bwd(sk, chunk); if (UNIXCB(skb).fp) { scm_stat_del(sk, skb); unix_detach_fds(&scm, skb); } if (unix_skb_len(skb)) break; skb_unlink(skb, &sk->sk_receive_queue); consume_skb(skb); if (scm.fp) break; } else { /* It is questionable, see note in unix_dgram_recvmsg. */ if (UNIXCB(skb).fp) unix_peek_fds(&scm, skb); sk_peek_offset_fwd(sk, chunk); if (UNIXCB(skb).fp) break; skip = 0; last = skb; last_len = skb->len; unix_state_lock(sk); skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb) goto again; unix_state_unlock(sk); break; } } while (size); mutex_unlock(&u->iolock); if (state->msg) scm_recv_unix(sock, state->msg, &scm, flags); else scm_destroy(&scm); out: return copied ? : err; } static int unix_stream_read_actor(struct sk_buff *skb, int skip, int chunk, struct unix_stream_read_state *state) { int ret; ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, state->msg, chunk); return ret ?: chunk; } int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_read_actor, .socket = sk->sk_socket, .msg = msg, .size = size, .flags = flags }; return unix_stream_read_generic(&state, true); } static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_read_actor, .socket = sock, .msg = msg, .size = size, .flags = flags }; #ifdef CONFIG_BPF_SYSCALL struct sock *sk = sock->sk; const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot != &unix_stream_proto) return prot->recvmsg(sk, msg, size, flags, NULL); #endif return unix_stream_read_generic(&state, true); } static int unix_stream_splice_actor(struct sk_buff *skb, int skip, int chunk, struct unix_stream_read_state *state) { return skb_splice_bits(skb, state->socket->sk, UNIXCB(skb).consumed + skip, state->pipe, chunk, state->splice_flags); } static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t size, unsigned int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_splice_actor, .socket = sock, .pipe = pipe, .size = size, .splice_flags = flags, }; if (unlikely(*ppos)) return -ESPIPE; if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK) state.flags = MSG_DONTWAIT; return unix_stream_read_generic(&state, false); } static int unix_shutdown(struct socket *sock, int mode) { struct sock *sk = sock->sk; struct sock *other; if (mode < SHUT_RD || mode > SHUT_RDWR) return -EINVAL; /* This maps: * SHUT_RD (0) -> RCV_SHUTDOWN (1) * SHUT_WR (1) -> SEND_SHUTDOWN (2) * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) */ ++mode; unix_state_lock(sk); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); other = unix_peer(sk); if (other) sock_hold(other); unix_state_unlock(sk); sk->sk_state_change(sk); if (other && (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { int peer_mode = 0; const struct proto *prot = READ_ONCE(other->sk_prot); if (prot->unhash) prot->unhash(other); if (mode&RCV_SHUTDOWN) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) peer_mode |= RCV_SHUTDOWN; unix_state_lock(other); WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); unix_state_unlock(other); other->sk_state_change(other); if (peer_mode == SHUTDOWN_MASK) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); else if (peer_mode & RCV_SHUTDOWN) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); } if (other) sock_put(other); return 0; } long unix_inq_len(struct sock *sk) { struct sk_buff *skb; long amount = 0; if (READ_ONCE(sk->sk_state) == TCP_LISTEN) return -EINVAL; spin_lock(&sk->sk_receive_queue.lock); if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { skb_queue_walk(&sk->sk_receive_queue, skb) amount += unix_skb_len(skb); } else { skb = skb_peek(&sk->sk_receive_queue); if (skb) amount = skb->len; } spin_unlock(&sk->sk_receive_queue.lock); return amount; } EXPORT_SYMBOL_GPL(unix_inq_len); long unix_outq_len(struct sock *sk) { return sk_wmem_alloc_get(sk); } EXPORT_SYMBOL_GPL(unix_outq_len); static int unix_open_file(struct sock *sk) { struct path path; struct file *f; int fd; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!smp_load_acquire(&unix_sk(sk)->addr)) return -ENOENT; path = unix_sk(sk)->path; if (!path.dentry) return -ENOENT; path_get(&path); fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) goto out; f = dentry_open(&path, O_PATH, current_cred()); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); goto out; } fd_install(fd, f); out: path_put(&path); return fd; } static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; long amount = 0; int err; switch (cmd) { case SIOCOUTQ: amount = unix_outq_len(sk); err = put_user(amount, (int __user *)arg); break; case SIOCINQ: amount = unix_inq_len(sk); if (amount < 0) err = amount; else err = put_user(amount, (int __user *)arg); break; case SIOCUNIXFILE: err = unix_open_file(sk); break; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) case SIOCATMARK: { struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int answ = 0; mutex_lock(&u->iolock); skb = skb_peek(&sk->sk_receive_queue); if (skb) { struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); struct sk_buff *next_skb; next_skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb == oob_skb || (!unix_skb_len(skb) && (!oob_skb || next_skb == oob_skb))) answ = 1; } mutex_unlock(&u->iolock); err = put_user(answ, (int __user *)arg); } break; #endif default: err = -ENOIOCTLCMD; break; } return err; } #ifdef CONFIG_COMPAT static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); } #endif static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; unsigned char state; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); state = READ_ONCE(sk->sk_state); /* exceptional events? */ if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (READ_ONCE(unix_sk(sk)->oob_skb)) mask |= EPOLLPRI; #endif /* Connection-based need to check for termination and startup */ if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && state == TCP_CLOSE) mask |= EPOLLHUP; /* * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. */ if (unix_writable(sk, state)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; } static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk, *other; unsigned int writable; unsigned char state; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); state = READ_ONCE(sk->sk_state); /* exceptional events? */ if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) mask |= EPOLLHUP; /* No write status requested, avoid expensive OUT tests. */ if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) return mask; writable = unix_writable(sk, state); if (writable) { unix_state_lock(sk); other = unix_peer(sk); if (other && unix_peer(other) != sk && unix_recvq_full_lockless(other) && unix_dgram_peer_wake_me(sk, other)) writable = 0; unix_state_unlock(sk); } if (writable) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } #ifdef CONFIG_PROC_FS #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) #define get_bucket(x) ((x) >> BUCKET_SPACE) #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) { unsigned long offset = get_offset(*pos); unsigned long bucket = get_bucket(*pos); unsigned long count = 0; struct sock *sk; for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); sk; sk = sk_next(sk)) { if (++count == offset) break; } return sk; } static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) { unsigned long bucket = get_bucket(*pos); struct net *net = seq_file_net(seq); struct sock *sk; while (bucket < UNIX_HASH_SIZE) { spin_lock(&net->unx.table.locks[bucket]); sk = unix_from_bucket(seq, pos); if (sk) return sk; spin_unlock(&net->unx.table.locks[bucket]); *pos = set_bucket_offset(++bucket, 1); } return NULL; } static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, loff_t *pos) { unsigned long bucket = get_bucket(*pos); sk = sk_next(sk); if (sk) return sk; spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); *pos = set_bucket_offset(++bucket, 1); return unix_get_first(seq, pos); } static void *unix_seq_start(struct seq_file *seq, loff_t *pos) { if (!*pos) return SEQ_START_TOKEN; return unix_get_first(seq, pos); } static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; if (v == SEQ_START_TOKEN) return unix_get_first(seq, pos); return unix_get_next(seq, v, pos); } static void unix_seq_stop(struct seq_file *seq, void *v) { struct sock *sk = v; if (sk) spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); } static int unix_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "Num RefCount Protocol Flags Type St " "Inode Path\n"); else { struct sock *s = v; struct unix_sock *u = unix_sk(s); unix_state_lock(s); seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", s, refcount_read(&s->sk_refcnt), 0, s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, s->sk_type, s->sk_socket ? (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), sock_i_ino(s)); if (u->addr) { // under a hash table lock here int i, len; seq_putc(seq, ' '); i = 0; len = u->addr->len - offsetof(struct sockaddr_un, sun_path); if (u->addr->name->sun_path[0]) { len--; } else { seq_putc(seq, '@'); i++; } for ( ; i < len; i++) seq_putc(seq, u->addr->name->sun_path[i] ?: '@'); } unix_state_unlock(s); seq_putc(seq, '\n'); } return 0; } static const struct seq_operations unix_seq_ops = { .start = unix_seq_start, .next = unix_seq_next, .stop = unix_seq_stop, .show = unix_seq_show, }; #ifdef CONFIG_BPF_SYSCALL struct bpf_unix_iter_state { struct seq_net_private p; unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; struct sock **batch; bool st_bucket_done; }; struct bpf_iter__unix { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct unix_sock *, unix_sk); uid_t uid __aligned(8); }; static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, struct unix_sock *unix_sk, uid_t uid) { struct bpf_iter__unix ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.unix_sk = unix_sk; ctx.uid = uid; return bpf_iter_run_prog(prog, &ctx); } static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) { struct bpf_unix_iter_state *iter = seq->private; unsigned int expected = 1; struct sock *sk; sock_hold(start_sk); iter->batch[iter->end_sk++] = start_sk; for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++] = sk; } expected++; } spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); return expected; } static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) { while (iter->cur_sk < iter->end_sk) sock_put(iter->batch[iter->cur_sk++]); } static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, unsigned int new_batch_sz) { struct sock **new_batch; new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, GFP_USER | __GFP_NOWARN); if (!new_batch) return -ENOMEM; bpf_iter_unix_put_batch(iter); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; return 0; } static struct sock *bpf_iter_unix_batch(struct seq_file *seq, loff_t *pos) { struct bpf_unix_iter_state *iter = seq->private; unsigned int expected; bool resized = false; struct sock *sk; if (iter->st_bucket_done) *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); again: /* Get a new batch */ iter->cur_sk = 0; iter->end_sk = 0; sk = unix_get_first(seq, pos); if (!sk) return NULL; /* Done */ expected = bpf_iter_unix_hold_batch(seq, sk); if (iter->end_sk == expected) { iter->st_bucket_done = true; return sk; } if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { resized = true; goto again; } return sk; } static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos) { if (!*pos) return SEQ_START_TOKEN; /* bpf iter does not support lseek, so it always * continue from where it was stop()-ped. */ return bpf_iter_unix_batch(seq, pos); } static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_unix_iter_state *iter = seq->private; struct sock *sk; /* Whenever seq_next() is called, the iter->cur_sk is * done with seq_show(), so advance to the next sk in * the batch. */ if (iter->cur_sk < iter->end_sk) sock_put(iter->batch[iter->cur_sk++]); ++*pos; if (iter->cur_sk < iter->end_sk) sk = iter->batch[iter->cur_sk]; else sk = bpf_iter_unix_batch(seq, pos); return sk; } static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; uid_t uid; bool slow; int ret; if (v == SEQ_START_TOKEN) return 0; slow = lock_sock_fast(sk); if (unlikely(sk_unhashed(sk))) { ret = SEQ_SKIP; goto unlock; } uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = unix_prog_seq_show(prog, &meta, v, uid); unlock: unlock_sock_fast(sk, slow); return ret; } static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) { struct bpf_unix_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)unix_prog_seq_show(prog, &meta, v, 0); } if (iter->cur_sk < iter->end_sk) bpf_iter_unix_put_batch(iter); } static const struct seq_operations bpf_iter_unix_seq_ops = { .start = bpf_iter_unix_seq_start, .next = bpf_iter_unix_seq_next, .stop = bpf_iter_unix_seq_stop, .show = bpf_iter_unix_seq_show, }; #endif #endif static const struct net_proto_family unix_family_ops = { .family = PF_UNIX, .create = unix_create, .owner = THIS_MODULE, }; static int __net_init unix_net_init(struct net *net) { int i; net->unx.sysctl_max_dgram_qlen = 10; if (unix_sysctl_register(net)) goto out; #ifdef CONFIG_PROC_FS if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, sizeof(struct seq_net_private))) goto err_sysctl; #endif net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, sizeof(spinlock_t), GFP_KERNEL); if (!net->unx.table.locks) goto err_proc; net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!net->unx.table.buckets) goto free_locks; for (i = 0; i < UNIX_HASH_SIZE; i++) { spin_lock_init(&net->unx.table.locks[i]); lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); INIT_HLIST_HEAD(&net->unx.table.buckets[i]); } return 0; free_locks: kvfree(net->unx.table.locks); err_proc: #ifdef CONFIG_PROC_FS remove_proc_entry("unix", net->proc_net); err_sysctl: #endif unix_sysctl_unregister(net); out: return -ENOMEM; } static void __net_exit unix_net_exit(struct net *net) { kvfree(net->unx.table.buckets); kvfree(net->unx.table.locks); unix_sysctl_unregister(net); remove_proc_entry("unix", net->proc_net); } static struct pernet_operations unix_net_ops = { .init = unix_net_init, .exit = unix_net_exit, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, struct unix_sock *unix_sk, uid_t uid) #define INIT_BATCH_SZ 16 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_unix_iter_state *iter = priv_data; int err; err = bpf_iter_init_seq_net(priv_data, aux); if (err) return err; err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); if (err) { bpf_iter_fini_seq_net(priv_data); return err; } return 0; } static void bpf_iter_fini_unix(void *priv_data) { struct bpf_unix_iter_state *iter = priv_data; bpf_iter_fini_seq_net(priv_data); kvfree(iter->batch); } static const struct bpf_iter_seq_info unix_seq_info = { .seq_ops = &bpf_iter_unix_seq_ops, .init_seq_private = bpf_iter_init_unix, .fini_seq_private = bpf_iter_fini_unix, .seq_priv_size = sizeof(struct bpf_unix_iter_state), }; static const struct bpf_func_proto * bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sk_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_sk_getsockopt_proto; default: return NULL; } } static struct bpf_iter_reg unix_reg_info = { .target = "unix", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__unix, unix_sk), PTR_TO_BTF_ID_OR_NULL }, }, .get_func_proto = bpf_iter_unix_get_func_proto, .seq_info = &unix_seq_info, }; static void __init bpf_iter_register(void) { unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; if (bpf_iter_reg_target(&unix_reg_info)) pr_warn("Warning: could not register bpf iterator unix\n"); } #endif static int __init af_unix_init(void) { int i, rc = -1; BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { spin_lock_init(&bsd_socket_locks[i]); INIT_HLIST_HEAD(&bsd_socket_buckets[i]); } rc = proto_register(&unix_dgram_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); goto out; } rc = proto_register(&unix_stream_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); proto_unregister(&unix_dgram_proto); goto out; } sock_register(&unix_family_ops); register_pernet_subsys(&unix_net_ops); unix_bpf_build_proto(); #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_register(); #endif out: return rc; } /* Later than subsys_initcall() because we depend on stuff initialised there */ fs_initcall(af_unix_init);
29 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/in.h> #include <net/tcp.h> #include <trace/events/sock.h> #include "rds.h" #include "tcp.h" void rds_tcp_keepalive(struct socket *sock) { /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ int keepcnt = 5; /* number of unack'ed probes before declaring dead */ sock_set_keepalive(sock->sk); tcp_sock_set_keepcnt(sock->sk, keepcnt); tcp_sock_set_keepidle(sock->sk, keepidle); /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. */ tcp_sock_set_keepintvl(sock->sk, keepidle); } /* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the * client's ipaddr < server's ipaddr. Otherwise, close the accepted * socket and force a reconneect from smaller -> larger ip addr. The reason * we special case cp_index 0 is to allow the rds probe ping itself to itself * get through efficiently. * Since reconnects are only initiated from the node with the numerically * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side * by moving them to CONNECTING in this function. */ static struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) { int i; int npaths = max_t(int, 1, conn->c_npaths); /* for mprds, all paths MUST be initiated by the peer * with the smaller address. */ if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) { /* Make sure we initiate at least one path if this * has not already been done; rds_start_mprds() will * take care of additional paths, if necessary. */ if (npaths == 1) rds_conn_path_connect_if_down(&conn->c_path[0]); return NULL; } for (i = 0; i < npaths; i++) { struct rds_conn_path *cp = &conn->c_path[i]; if (rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING) || rds_conn_path_transition(cp, RDS_CONN_ERROR, RDS_CONN_CONNECTING)) { return cp->cp_transport_data; } } return NULL; } int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; struct rds_connection *conn; int ret; struct inet_sock *inet; struct rds_tcp_connection *rs_tcp = NULL; int conn_state; struct rds_conn_path *cp; struct in6_addr *my_addr, *peer_addr; struct proto_accept_arg arg = { .flags = O_NONBLOCK, .kern = true, }; #if !IS_ENABLED(CONFIG_IPV6) struct in6_addr saddr, daddr; #endif int dev_if = 0; if (!sock) /* module unload or netns delete in progress */ return -ENETUNREACH; ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, &new_sock); if (ret) goto out; ret = sock->ops->accept(sock, new_sock, &arg); if (ret < 0) goto out; /* sock_create_lite() does not get a hold on the owner module so we * need to do it here. Note that sock_release() uses sock->ops to * determine if it needs to decrement the reference count. So set * sock->ops after calling accept() in case that fails. And there's * no need to do try_module_get() as the listener should have a hold * already. */ new_sock->ops = sock->ops; __module_get(new_sock->ops->owner); rds_tcp_keepalive(new_sock); if (!rds_tcp_tune(new_sock)) { ret = -EINVAL; goto out; } inet = inet_sk(new_sock->sk); #if IS_ENABLED(CONFIG_IPV6) my_addr = &new_sock->sk->sk_v6_rcv_saddr; peer_addr = &new_sock->sk->sk_v6_daddr; #else ipv6_addr_set_v4mapped(inet->inet_saddr, &saddr); ipv6_addr_set_v4mapped(inet->inet_daddr, &daddr); my_addr = &saddr; peer_addr = &daddr; #endif rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n", sock->sk->sk_family, my_addr, ntohs(inet->inet_sport), peer_addr, ntohs(inet->inet_dport)); #if IS_ENABLED(CONFIG_IPV6) /* sk_bound_dev_if is not set if the peer address is not link local * address. In this case, it happens that mcast_oif is set. So * just use it. */ if ((ipv6_addr_type(my_addr) & IPV6_ADDR_LINKLOCAL) && !(ipv6_addr_type(peer_addr) & IPV6_ADDR_LINKLOCAL)) { struct ipv6_pinfo *inet6; inet6 = inet6_sk(new_sock->sk); dev_if = READ_ONCE(inet6->mcast_oif); } else { dev_if = new_sock->sk->sk_bound_dev_if; } #endif if (!rds_tcp_laddr_check(sock_net(sock->sk), peer_addr, dev_if)) { /* local address connection is only allowed via loopback */ ret = -EOPNOTSUPP; goto out; } conn = rds_conn_create(sock_net(sock->sk), my_addr, peer_addr, &rds_tcp_transport, 0, GFP_KERNEL, dev_if); if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; } /* An incoming SYN request came in, and TCP just accepted it. * * If the client reboots, this conn will need to be cleaned up. * rds_tcp_state_change() will do that cleanup */ rs_tcp = rds_tcp_accept_one_path(conn); if (!rs_tcp) goto rst_nsk; mutex_lock(&rs_tcp->t_conn_path_lock); cp = rs_tcp->t_cpath; conn_state = rds_conn_path_state(cp); WARN_ON(conn_state == RDS_CONN_UP); if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) goto rst_nsk; if (rs_tcp->t_sock) { /* Duelling SYN has been handled in rds_tcp_accept_one() */ rds_tcp_reset_callbacks(new_sock, cp); /* rds_connect_path_complete() marks RDS_CONN_UP */ rds_connect_path_complete(cp, RDS_CONN_RESETTING); } else { rds_tcp_set_callbacks(new_sock, cp); rds_connect_path_complete(cp, RDS_CONN_CONNECTING); } new_sock = NULL; ret = 0; if (conn->c_npaths == 0) rds_send_ping(cp->cp_conn, cp->cp_index); goto out; rst_nsk: /* reset the newly returned accept sock and bail. * It is safe to set linger on new_sock because the RDS connection * has not been brought up on new_sock, so no RDS-level data could * be pending on it. By setting linger, we achieve the side-effect * of avoiding TIME_WAIT state on new_sock. */ sock_no_linger(new_sock->sk); kernel_sock_shutdown(new_sock, SHUT_RDWR); ret = 0; out: if (rs_tcp) mutex_unlock(&rs_tcp->t_conn_path_lock); if (new_sock) sock_release(new_sock); return ret; } void rds_tcp_listen_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); trace_sk_data_ready(sk); rdsdebug("listen data ready sk %p\n", sk); read_lock_bh(&sk->sk_callback_lock); ready = sk->sk_user_data; if (!ready) { /* check for teardown race */ ready = sk->sk_data_ready; goto out; } /* * ->sk_data_ready is also called for a newly established child socket * before it has been accepted and the accepter has set up their * data_ready.. we only want to queue listen work for our listening * socket * * (*ready)() may be null if we are racing with netns delete, and * the listen socket is being torn down. */ if (sk->sk_state == TCP_LISTEN) rds_tcp_accept_work(sk); else ready = rds_tcp_listen_sock_def_readable(sock_net(sk)); out: read_unlock_bh(&sk->sk_callback_lock); if (ready) ready(sk); } struct socket *rds_tcp_listen_init(struct net *net, bool isv6) { struct socket *sock = NULL; struct sockaddr_storage ss; struct sockaddr_in6 *sin6; struct sockaddr_in *sin; int addr_len; int ret; ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) { rdsdebug("could not create %s listener socket: %d\n", isv6 ? "IPv6" : "IPv4", ret); goto out; } sock->sk->sk_reuse = SK_CAN_REUSE; tcp_sock_set_nodelay(sock->sk); write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = sock->sk->sk_data_ready; sock->sk->sk_data_ready = rds_tcp_listen_data_ready; write_unlock_bh(&sock->sk->sk_callback_lock); if (isv6) { sin6 = (struct sockaddr_in6 *)&ss; sin6->sin6_family = PF_INET6; sin6->sin6_addr = in6addr_any; sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT); sin6->sin6_scope_id = 0; sin6->sin6_flowinfo = 0; addr_len = sizeof(*sin6); } else { sin = (struct sockaddr_in *)&ss; sin->sin_family = PF_INET; sin->sin_addr.s_addr = INADDR_ANY; sin->sin_port = (__force u16)htons(RDS_TCP_PORT); addr_len = sizeof(*sin); } ret = kernel_bind(sock, (struct sockaddr *)&ss, addr_len); if (ret < 0) { rdsdebug("could not bind %s listener socket: %d\n", isv6 ? "IPv6" : "IPv4", ret); goto out; } ret = sock->ops->listen(sock, 64); if (ret < 0) goto out; return sock; out: if (sock) sock_release(sock); return NULL; } void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor) { struct sock *sk; if (!sock) return; sk = sock->sk; /* serialize with and prevent further callbacks */ lock_sock(sk); write_lock_bh(&sk->sk_callback_lock); if (sk->sk_user_data) { sk->sk_data_ready = sk->sk_user_data; sk->sk_user_data = NULL; } write_unlock_bh(&sk->sk_callback_lock); release_sock(sk); /* wait for accepts to stop and close the socket */ flush_workqueue(rds_wq); flush_work(acceptor); sock_release(sock); }
1469 3544 3545 3534 24 3540 3535 24 3547 3545 2452 634 1465 1470 7 3547 3551 3548 3543 642 642 3551 3544 637 3551 3541 3551 3546 1484 2719 2718 2719 1482 1480 1484 1481 1480 314 1481 315 1482 1482 1484 315 1481 1481 1482 1481 1469 1483 1467 875 1467 22 105 315 1036 1034 1033 1034 1035 1036 315 1278 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "SMP alternatives: " fmt #include <linux/module.h> #include <linux/sched.h> #include <linux/perf_event.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/stringify.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> #include <linux/stop_machine.h> #include <linux/slab.h> #include <linux/kdebug.h> #include <linux/kprobes.h> #include <linux/mmu_context.h> #include <linux/bsearch.h> #include <linux/sync_core.h> #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/mce.h> #include <asm/nmi.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/insn.h> #include <asm/io.h> #include <asm/fixmap.h> #include <asm/paravirt.h> #include <asm/asm-prototypes.h> #include <asm/cfi.h> int __read_mostly alternatives_patched; EXPORT_SYMBOL_GPL(alternatives_patched); #define MAX_PATCH_LEN (255-1) #define DA_ALL (~0) #define DA_ALT 0x01 #define DA_RET 0x02 #define DA_RETPOLINE 0x04 #define DA_ENDBR 0x08 #define DA_SMP 0x10 static unsigned int debug_alternative; static int __init debug_alt(char *str) { if (str && *str == '=') str++; if (!str || kstrtouint(str, 0, &debug_alternative)) debug_alternative = DA_ALL; return 1; } __setup("debug-alternative", debug_alt); static int noreplace_smp; static int __init setup_noreplace_smp(char *str) { noreplace_smp = 1; return 1; } __setup("noreplace-smp", setup_noreplace_smp); #define DPRINTK(type, fmt, args...) \ do { \ if (debug_alternative & DA_##type) \ printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ } while (0) #define DUMP_BYTES(type, buf, len, fmt, args...) \ do { \ if (unlikely(debug_alternative & DA_##type)) { \ int j; \ \ if (!(len)) \ break; \ \ printk(KERN_DEBUG pr_fmt(fmt), ##args); \ for (j = 0; j < (len) - 1; j++) \ printk(KERN_CONT "%02hhx ", buf[j]); \ printk(KERN_CONT "%02hhx\n", buf[j]); \ } \ } while (0) static const unsigned char x86nops[] = { BYTES_NOP1, BYTES_NOP2, BYTES_NOP3, BYTES_NOP4, BYTES_NOP5, BYTES_NOP6, BYTES_NOP7, BYTES_NOP8, #ifdef CONFIG_64BIT BYTES_NOP9, BYTES_NOP10, BYTES_NOP11, #endif }; const unsigned char * const x86_nops[ASM_NOP_MAX+1] = { NULL, x86nops, x86nops + 1, x86nops + 1 + 2, x86nops + 1 + 2 + 3, x86nops + 1 + 2 + 3 + 4, x86nops + 1 + 2 + 3 + 4 + 5, x86nops + 1 + 2 + 3 + 4 + 5 + 6, x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, #ifdef CONFIG_64BIT x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9, x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10, #endif }; /* * Nomenclature for variable names to simplify and clarify this code and ease * any potential staring at it: * * @instr: source address of the original instructions in the kernel text as * generated by the compiler. * * @buf: temporary buffer on which the patching operates. This buffer is * eventually text-poked into the kernel image. * * @replacement/@repl: pointer to the opcodes which are replacing @instr, located * in the .altinstr_replacement section. */ /* * Fill the buffer with a single effective instruction of size @len. * * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info) * for every single-byte NOP, try to generate the maximally available NOP of * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and * *jump* over instead of executing long and daft NOPs. */ static void add_nop(u8 *buf, unsigned int len) { u8 *target = buf + len; if (!len) return; if (len <= ASM_NOP_MAX) { memcpy(buf, x86_nops[len], len); return; } if (len < 128) { __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE); buf += JMP8_INSN_SIZE; } else { __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE); buf += JMP32_INSN_SIZE; } for (;buf < target; buf++) *buf = INT3_INSN_OPCODE; } extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern s32 __return_sites[], __return_sites_end[]; extern s32 __cfi_sites[], __cfi_sites_end[]; extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); /* * Matches NOP and NOPL, not any of the other possible NOPs. */ static bool insn_is_nop(struct insn *insn) { /* Anything NOP, but no REP NOP */ if (insn->opcode.bytes[0] == 0x90 && (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3)) return true; /* NOPL */ if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) return true; /* TODO: more nops */ return false; } /* * Find the offset of the first non-NOP instruction starting at @offset * but no further than @len. */ static int skip_nops(u8 *buf, int offset, int len) { struct insn insn; for (; offset < len; offset += insn.length) { if (insn_decode_kernel(&insn, &buf[offset])) break; if (!insn_is_nop(&insn)) break; } return offset; } /* * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len) { for (int next, i = 0; i < len; i = next) { struct insn insn; if (insn_decode_kernel(&insn, &buf[i])) return; next = i + insn.length; if (insn_is_nop(&insn)) { int nop = i; /* Has the NOP already been optimized? */ if (i + insn.length == len) return; next = skip_nops(buf, next, len); add_nop(buf + nop, next - nop); DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next); } } } /* * In this context, "source" is where the instructions are placed in the * section .altinstr_replacement, for example during kernel build by the * toolchain. * "Destination" is where the instructions are being patched in by this * machinery. * * The source offset is: * * src_imm = target - src_next_ip (1) * * and the target offset is: * * dst_imm = target - dst_next_ip (2) * * so rework (1) as an expression for target like: * * target = src_imm + src_next_ip (1a) * * and substitute in (2) to get: * * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3) * * Now, since the instruction stream is 'identical' at src and dst (it * is being copied after all) it can be stated that: * * src_next_ip = src + ip_offset * dst_next_ip = dst + ip_offset (4) * * Substitute (4) in (3) and observe ip_offset being cancelled out to * obtain: * * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset) * = src_imm + src - dst + ip_offset - ip_offset * = src_imm + src - dst (5) * * IOW, only the relative displacement of the code block matters. */ #define apply_reloc_n(n_, p_, d_) \ do { \ s32 v = *(s##n_ *)(p_); \ v += (d_); \ BUG_ON((v >> 31) != (v >> (n_-1))); \ *(s##n_ *)(p_) = (s##n_)v; \ } while (0) static __always_inline void apply_reloc(int n, void *ptr, uintptr_t diff) { switch (n) { case 1: apply_reloc_n(8, ptr, diff); break; case 2: apply_reloc_n(16, ptr, diff); break; case 4: apply_reloc_n(32, ptr, diff); break; default: BUG(); } } static __always_inline bool need_reloc(unsigned long offset, u8 *src, size_t src_len) { u8 *target = src + offset; /* * If the target is inside the patched block, it's relative to the * block itself and does not need relocation. */ return (target < src || target > src + src_len); } static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { for (int next, i = 0; i < instrlen; i = next) { struct insn insn; if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i]))) return; next = i + insn.length; switch (insn.opcode.bytes[0]) { case 0x0f: if (insn.opcode.bytes[1] < 0x80 || insn.opcode.bytes[1] > 0x8f) break; fallthrough; /* Jcc.d32 */ case 0x70 ... 0x7f: /* Jcc.d8 */ case JMP8_INSN_OPCODE: case JMP32_INSN_OPCODE: case CALL_INSN_OPCODE: if (need_reloc(next + insn.immediate.value, repl, repl_len)) { apply_reloc(insn.immediate.nbytes, buf + i + insn_offset_immediate(&insn), repl - instr); } /* * Where possible, convert JMP.d32 into JMP.d8. */ if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) { s32 imm = insn.immediate.value; imm += repl - instr; imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE; if ((imm >> 31) == (imm >> 7)) { buf[i+0] = JMP8_INSN_OPCODE; buf[i+1] = (s8)imm; memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2); } } break; } if (insn_rip_relative(&insn)) { if (need_reloc(next + insn.displacement.value, repl, repl_len)) { apply_reloc(insn.displacement.nbytes, buf + i + insn_offset_displacement(&insn), repl - instr); } } } } void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { __apply_relocation(buf, instr, instrlen, repl, repl_len); optimize_nops(instr, buf, instrlen); } /* Low-level backend functions usable from alternative code replacements. */ DEFINE_ASM_FUNC(nop_func, "", .entry.text); EXPORT_SYMBOL_GPL(nop_func); noinstr void BUG_func(void) { BUG(); } EXPORT_SYMBOL(BUG_func); #define CALL_RIP_REL_OPCODE 0xff #define CALL_RIP_REL_MODRM 0x15 /* * Rewrite the "call BUG_func" replacement to point to the target of the * indirect pv_ops call "call *disp(%ip)". */ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, struct module *mod) { u8 *wr_instr = module_writable_address(mod, instr); void *target, *bug = &BUG_func; s32 disp; if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) { pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n"); BUG(); } if (a->instrlen != 6 || wr_instr[0] != CALL_RIP_REL_OPCODE || wr_instr[1] != CALL_RIP_REL_MODRM) { pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); BUG(); } /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ disp = *(s32 *)(wr_instr + 2); #ifdef CONFIG_X86_64 /* ff 15 00 00 00 00 call *0x0(%rip) */ /* target address is stored at "next instruction + disp". */ target = *(void **)(instr + a->instrlen + disp); #else /* ff 15 00 00 00 00 call *0x0 */ /* target address is stored at disp. */ target = *(void **)disp; #endif if (!target) target = bug; /* (BUG_func - .) + (target - BUG_func) := target - . */ *(s32 *)(insn_buff + 1) += target - bug; if (target == &nop_func) return 0; return 5; } static inline u8 * instr_va(struct alt_instr *i) { return (u8 *)&i->instr_offset + i->instr_offset; } /* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. * This implies that asymmetric systems where APs have less capabilities than * the boot processor are not handled. Tough. Make sure you disable such * features by hand. * * Marked "noinline" to cause control flow change and thus insn cache * to refetch changed I$ lines. */ void __init_or_module noinline apply_alternatives(struct alt_instr *start, struct alt_instr *end, struct module *mod) { u8 insn_buff[MAX_PATCH_LEN]; u8 *instr, *replacement; struct alt_instr *a, *b; DPRINTK(ALT, "alt table %px, -> %px", start, end); /* * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. * During the process, KASAN becomes confused seeing partial LA57 * conversion and triggers a false-positive out-of-bound report. * * Disable KASAN until the patching is complete. */ kasan_disable_current(); /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. * Some kernel functions (e.g. memcpy, memset, etc) use this order to * patch code. * * So be careful if you want to change the scan order to any other * order. */ for (a = start; a < end; a++) { int insn_buff_sz = 0; u8 *wr_instr, *wr_replacement; /* * In case of nested ALTERNATIVE()s the outer alternative might * add more padding. To ensure consistent patching find the max * padding for all alt_instr entries for this site (nested * alternatives result in consecutive entries). */ for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) { u8 len = max(a->instrlen, b->instrlen); a->instrlen = b->instrlen = len; } instr = instr_va(a); wr_instr = module_writable_address(mod, instr); replacement = (u8 *)&a->repl_offset + a->repl_offset; wr_replacement = module_writable_address(mod, replacement); BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); /* * Patch if either: * - feature is present * - feature not present but ALT_FLAG_NOT is set to mean, * patch if feature is *NOT* present. */ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { memcpy(insn_buff, wr_instr, a->instrlen); optimize_nops(instr, insn_buff, a->instrlen); text_poke_early(wr_instr, insn_buff, a->instrlen); continue; } DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x", a->cpuid >> 5, a->cpuid & 0x1f, instr, instr, a->instrlen, replacement, a->replacementlen, a->flags); memcpy(insn_buff, wr_replacement, a->replacementlen); insn_buff_sz = a->replacementlen; if (a->flags & ALT_FLAG_DIRECT_CALL) { insn_buff_sz = alt_replace_call(instr, insn_buff, a, mod); if (insn_buff_sz < 0) continue; } for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); text_poke_early(wr_instr, insn_buff, insn_buff_sz); } kasan_enable_current(); } static inline bool is_jcc32(struct insn *insn) { /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; } #if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL) /* * CALL/JMP *%\reg */ static int emit_indirect(int op, int reg, u8 *bytes) { int i = 0; u8 modrm; switch (op) { case CALL_INSN_OPCODE: modrm = 0x10; /* Reg = 2; CALL r/m */ break; case JMP32_INSN_OPCODE: modrm = 0x20; /* Reg = 4; JMP r/m */ break; default: WARN_ON_ONCE(1); return -1; } if (reg >= 8) { bytes[i++] = 0x41; /* REX.B prefix */ reg -= 8; } modrm |= 0xc0; /* Mod = 3 */ modrm += reg; bytes[i++] = 0xff; /* opcode */ bytes[i++] = modrm; return i; } static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) { u8 op = insn->opcode.bytes[0]; int i = 0; /* * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional * tail-calls. Deal with them. */ if (is_jcc32(insn)) { bytes[i++] = op; op = insn->opcode.bytes[1]; goto clang_jcc; } if (insn->length == 6) bytes[i++] = 0x2e; /* CS-prefix */ switch (op) { case CALL_INSN_OPCODE: __text_gen_insn(bytes+i, op, addr+i, __x86_indirect_call_thunk_array[reg], CALL_INSN_SIZE); i += CALL_INSN_SIZE; break; case JMP32_INSN_OPCODE: clang_jcc: __text_gen_insn(bytes+i, op, addr+i, __x86_indirect_jump_thunk_array[reg], JMP32_INSN_SIZE); i += JMP32_INSN_SIZE; break; default: WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr); return -1; } WARN_ON_ONCE(i != insn->length); return i; } /* * Rewrite the compiler generated retpoline thunk calls. * * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate * indirect instructions, avoiding the extra indirection. * * For example, convert: * * CALL __x86_indirect_thunk_\reg * * into: * * CALL *%\reg * * It also tries to inline spectre_v2=retpoline,lfence when size permits. */ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) { retpoline_thunk_t *target; int reg, ret, i = 0; u8 op, cc; target = addr + insn->length + insn->immediate.value; reg = target - __x86_indirect_thunk_array; if (WARN_ON_ONCE(reg & ~0xf)) return -1; /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ BUG_ON(reg == 4); if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) return emit_call_track_retpoline(addr, insn, reg, bytes); return -1; } op = insn->opcode.bytes[0]; /* * Convert: * * Jcc.d32 __x86_indirect_thunk_\reg * * into: * * Jncc.d8 1f * [ LFENCE ] * JMP *%\reg * [ NOP ] * 1: */ if (is_jcc32(insn)) { cc = insn->opcode.bytes[1] & 0xf; cc ^= 1; /* invert condition */ bytes[i++] = 0x70 + cc; /* Jcc.d8 */ bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ op = JMP32_INSN_OPCODE; } /* * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE. */ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { bytes[i++] = 0x0f; bytes[i++] = 0xae; bytes[i++] = 0xe8; /* LFENCE */ } ret = emit_indirect(op, reg, bytes + i); if (ret < 0) return ret; i += ret; /* * The compiler is supposed to EMIT an INT3 after every unconditional * JMP instruction due to AMD BTC. However, if the compiler is too old * or MITIGATION_SLS isn't enabled, we still need an INT3 after * indirect JMPs even on Intel. */ if (op == JMP32_INSN_OPCODE && i < insn->length) bytes[i++] = INT3_INSN_OPCODE; for (; i < insn->length;) bytes[i++] = BYTES_NOP1; return i; } /* * Generated by 'objtool --retpoline'. */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op1, op2; ret = insn_decode_kernel(&insn, wr_addr); if (WARN_ON_ONCE(ret < 0)) continue; op1 = insn.opcode.bytes[0]; op2 = insn.opcode.bytes[1]; switch (op1) { case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: break; case 0x0f: /* escape */ if (op2 >= 0x80 && op2 <= 0x8f) break; fallthrough; default: WARN_ON_ONCE(1); continue; } DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(addr, bytes, len); DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(wr_addr, bytes, len); } } } #ifdef CONFIG_MITIGATION_RETHUNK /* * Rewrite the compiler generated return thunk tail-calls. * * For example, convert: * * JMP __x86_return_thunk * * into: * * RET */ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; /* Patch the custom return thunks... */ if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { /* ... or patch them out if not needed. */ bytes[i++] = RET_INSN_OPCODE; } for (; i < insn->length;) bytes[i++] = INT3_INSN_OPCODE; return i; } void __init_or_module noinline apply_returns(s32 *start, s32 *end, struct module *mod) { s32 *s; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) static_call_force_reinit(); for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op; ret = insn_decode_kernel(&insn, wr_addr); if (WARN_ON_ONCE(ret < 0)) continue; op = insn.opcode.bytes[0]; if (op == JMP32_INSN_OPCODE) dest = addr + insn.length + insn.immediate.value; if (__static_call_fixup(addr, op, dest) || WARN_ONCE(dest != &__x86_return_thunk, "missing return thunk: %pS-%pS: %*ph", addr, dest, 5, addr)) continue; DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_return(addr, &insn, bytes); if (len == insn.length) { DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr); DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(wr_addr, bytes, len); } } } #else void __init_or_module noinline apply_returns(s32 *start, s32 *end, struct module *mod) { } #endif /* CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, struct module *mod) { } void __init_or_module noinline apply_returns(s32 *start, s32 *end, struct module *mod) { } #endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT static void poison_cfi(void *addr, void *wr_addr); static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) { u32 endbr, poison = gen_endbr_poison(); if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr))) return; if (!is_endbr(endbr)) { WARN_ON_ONCE(warn); return; } DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); /* * When we have IBT, the lack of ENDBR will trigger #CP */ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); text_poke_early(wr_addr, &poison, 4); } /* * Generated by: objtool --ibt * * Seal the functions for indirect calls by clobbering the ENDBR instructions * and the kCFI hash value. */ void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; void *wr_addr = module_writable_address(mod, addr); poison_endbr(addr, wr_addr, true); if (IS_ENABLED(CONFIG_FINEIBT)) poison_cfi(addr - 16, wr_addr - 16); } } #else void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } #endif /* CONFIG_X86_KERNEL_IBT */ #ifdef CONFIG_CFI_AUTO_DEFAULT #define __CFI_DEFAULT CFI_AUTO #elif defined(CONFIG_CFI_CLANG) #define __CFI_DEFAULT CFI_KCFI #else #define __CFI_DEFAULT CFI_OFF #endif enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; #ifdef CONFIG_CFI_CLANG struct bpf_insn; /* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */ extern unsigned int __bpf_prog_runX(const void *ctx, const struct bpf_insn *insn); /* * Force a reference to the external symbol so the compiler generates * __kcfi_typid. */ __ADDRESSABLE(__bpf_prog_runX); /* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */ asm ( " .pushsection .data..ro_after_init,\"aw\",@progbits \n" " .type cfi_bpf_hash,@object \n" " .globl cfi_bpf_hash \n" " .p2align 2, 0x0 \n" "cfi_bpf_hash: \n" " .long __kcfi_typeid___bpf_prog_runX \n" " .size cfi_bpf_hash, 4 \n" " .popsection \n" ); /* Must match bpf_callback_t */ extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64); __ADDRESSABLE(__bpf_callback_fn); /* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */ asm ( " .pushsection .data..ro_after_init,\"aw\",@progbits \n" " .type cfi_bpf_subprog_hash,@object \n" " .globl cfi_bpf_subprog_hash \n" " .p2align 2, 0x0 \n" "cfi_bpf_subprog_hash: \n" " .long __kcfi_typeid___bpf_callback_fn \n" " .size cfi_bpf_subprog_hash, 4 \n" " .popsection \n" ); u32 cfi_get_func_hash(void *func) { u32 hash; func -= cfi_get_offset(); switch (cfi_mode) { case CFI_FINEIBT: func += 7; break; case CFI_KCFI: func += 1; break; default: return 0; } if (get_kernel_nofault(hash, func)) return 0; return hash; } #endif #ifdef CONFIG_FINEIBT static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; /* * Re-hash the CFI hash with a boot-time seed while making sure the result is * not a valid ENDBR instruction. */ static u32 cfi_rehash(u32 hash) { hash ^= cfi_seed; while (unlikely(is_endbr(hash) || is_endbr(-hash))) { bool lsb = hash & 1; hash >>= 1; if (lsb) hash ^= 0x80200003; } return hash; } static __init int cfi_parse_cmdline(char *str) { if (!str) return -EINVAL; while (str) { char *next = strchr(str, ','); if (next) { *next = 0; next++; } if (!strcmp(str, "auto")) { cfi_mode = CFI_AUTO; } else if (!strcmp(str, "off")) { cfi_mode = CFI_OFF; cfi_rand = false; } else if (!strcmp(str, "kcfi")) { cfi_mode = CFI_KCFI; } else if (!strcmp(str, "fineibt")) { cfi_mode = CFI_FINEIBT; } else if (!strcmp(str, "norand")) { cfi_rand = false; } else { pr_err("Ignoring unknown cfi option (%s).", str); } str = next; } return 0; } early_param("cfi", cfi_parse_cmdline); /* * kCFI FineIBT * * __cfi_\func: __cfi_\func: * movl $0x12345678,%eax // 5 endbr64 // 4 * nop subl $0x12345678,%r10d // 7 * nop jz 1f // 2 * nop ud2 // 2 * nop 1: nop // 1 * nop * nop * nop * nop * nop * nop * nop * * * caller: caller: * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4 * je 1f // 2 nop4 // 4 * ud2 // 2 * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5 * */ asm( ".pushsection .rodata \n" "fineibt_preamble_start: \n" " endbr64 \n" " subl $0x12345678, %r10d \n" " je fineibt_preamble_end \n" " ud2 \n" " nop \n" "fineibt_preamble_end: \n" ".popsection\n" ); extern u8 fineibt_preamble_start[]; extern u8 fineibt_preamble_end[]; #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) #define fineibt_preamble_hash 7 asm( ".pushsection .rodata \n" "fineibt_caller_start: \n" " movl $0x12345678, %r10d \n" " sub $16, %r11 \n" ASM_NOP4 "fineibt_caller_end: \n" ".popsection \n" ); extern u8 fineibt_caller_start[]; extern u8 fineibt_caller_end[]; #define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start) #define fineibt_caller_hash 2 #define fineibt_caller_jmp (fineibt_caller_size - 2) static u32 decode_preamble_hash(void *addr) { u8 *p = addr; /* b8 78 56 34 12 mov $0x12345678,%eax */ if (p[0] == 0xb8) return *(u32 *)(addr + 1); return 0; /* invalid hash value */ } static u32 decode_caller_hash(void *addr) { u8 *p = addr; /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */ if (p[0] == 0x41 && p[1] == 0xba) return -*(u32 *)(addr + 2); /* e8 0c 78 56 34 12 jmp.d8 +12 */ if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) return -*(u32 *)(addr + 2); return 0; /* invalid hash value */ } /* .retpoline_sites */ static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) { /* * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate * in tact for later usage. Also see decode_caller_hash() and * cfi_rewrite_callers(). */ const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp }; s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; void *wr_addr; u32 hash; addr -= fineibt_caller_size; wr_addr = module_writable_address(mod, addr); hash = decode_caller_hash(wr_addr); if (!hash) /* nocfi callers */ continue; text_poke_early(wr_addr, jmp, 2); } return 0; } static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) { /* * Re-enable kCFI, undo what cfi_disable_callers() did. */ const u8 mov[] = { 0x41, 0xba }; s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; void *wr_addr; u32 hash; addr -= fineibt_caller_size; wr_addr = module_writable_address(mod, addr); hash = decode_caller_hash(wr_addr); if (!hash) /* nocfi callers */ continue; text_poke_early(wr_addr, mov, 2); } return 0; } /* .cfi_sites */ static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; void *wr_addr = module_writable_address(mod, addr); u32 hash; hash = decode_preamble_hash(wr_addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; hash = cfi_rehash(hash); text_poke_early(wr_addr + 1, &hash, 4); } return 0; } static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; void *wr_addr = module_writable_address(mod, addr); u32 hash; hash = decode_preamble_hash(wr_addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size); WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678); text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4); } return 0; } static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; void *wr_addr = module_writable_address(mod, addr); poison_endbr(addr + 16, wr_addr + 16, false); } } /* .retpoline_sites */ static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; void *wr_addr; u32 hash; addr -= fineibt_caller_size; wr_addr = module_writable_address(mod, addr); hash = decode_caller_hash(wr_addr); if (hash) { hash = -cfi_rehash(hash); text_poke_early(wr_addr + 2, &hash, 4); } } return 0; } static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; void *wr_addr; u32 hash; addr -= fineibt_caller_size; wr_addr = module_writable_address(mod, addr); hash = decode_caller_hash(wr_addr); if (hash) { text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size); WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678); text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4); } /* rely on apply_retpolines() */ } return 0; } static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, struct module *mod) { bool builtin = mod ? false : true; int ret; if (WARN_ONCE(fineibt_preamble_size != 16, "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) return; if (cfi_mode == CFI_AUTO) { cfi_mode = CFI_KCFI; if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) cfi_mode = CFI_FINEIBT; } /* * Rewrite the callers to not use the __cfi_ stubs, such that we might * rewrite them. This disables all CFI. If this succeeds but any of the * later stages fails, we're without CFI. */ ret = cfi_disable_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; if (cfi_rand) { if (builtin) { cfi_seed = get_random_u32(); cfi_bpf_hash = cfi_rehash(cfi_bpf_hash); cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); } ret = cfi_rand_preamble(start_cfi, end_cfi, mod); if (ret) goto err; ret = cfi_rand_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; } switch (cfi_mode) { case CFI_OFF: if (builtin) pr_info("Disabling CFI\n"); return; case CFI_KCFI: ret = cfi_enable_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; if (builtin) pr_info("Using kCFI\n"); return; case CFI_FINEIBT: /* place the FineIBT preamble at func()-16 */ ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod); if (ret) goto err; /* rewrite the callers to target func()-16 */ ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; /* now that nobody targets func()+0, remove ENDBR there */ cfi_rewrite_endbr(start_cfi, end_cfi, mod); if (builtin) pr_info("Using FineIBT CFI\n"); return; default: break; } err: pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); } static inline void poison_hash(void *addr) { *(u32 *)addr = 0; } static void poison_cfi(void *addr, void *wr_addr) { switch (cfi_mode) { case CFI_FINEIBT: /* * __cfi_\func: * osp nopl (%rax) * subl $0, %r10d * jz 1f * ud2 * 1: nop */ poison_endbr(addr, wr_addr, false); poison_hash(wr_addr + fineibt_preamble_hash); break; case CFI_KCFI: /* * __cfi_\func: * movl $0, %eax * .skip 11, 0x90 */ poison_hash(wr_addr + 1); break; default: break; } } #else static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, struct module *mod) { } #ifdef CONFIG_X86_KERNEL_IBT static void poison_cfi(void *addr, void *wr_addr) { } #endif #endif void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, struct module *mod) { return __apply_fineibt(start_retpoline, end_retpoline, start_cfi, end_cfi, mod); } #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { const s32 *poff; for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; if (!*poff || ptr < text || ptr >= text_end) continue; /* turn DS segment override prefix into lock prefix */ if (*ptr == 0x3e) text_poke(ptr, ((unsigned char []){0xf0}), 1); } } static void alternatives_smp_unlock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { const s32 *poff; for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; if (!*poff || ptr < text || ptr >= text_end) continue; /* turn lock prefix into DS segment override prefix */ if (*ptr == 0xf0) text_poke(ptr, ((unsigned char []){0x3E}), 1); } } struct smp_alt_module { /* what is this ??? */ struct module *mod; char *name; /* ptrs to lock prefixes */ const s32 *locks; const s32 *locks_end; /* .text segment, needed to avoid patching init code ;) */ u8 *text; u8 *text_end; struct list_head next; }; static LIST_HEAD(smp_alt_modules); static bool uniproc_patched = false; /* protected by text_mutex */ void __init_or_module alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) { struct smp_alt_module *smp; mutex_lock(&text_mutex); if (!uniproc_patched) goto unlock; if (num_possible_cpus() == 1) /* Don't bother remembering, we'll never have to undo it. */ goto smp_unlock; smp = kzalloc(sizeof(*smp), GFP_KERNEL); if (NULL == smp) /* we'll run the (safe but slow) SMP code then ... */ goto unlock; smp->mod = mod; smp->name = name; smp->locks = locks; smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n", smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); list_add_tail(&smp->next, &smp_alt_modules); smp_unlock: alternatives_smp_unlock(locks, locks_end, text, text_end); unlock: mutex_unlock(&text_mutex); } void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; mutex_lock(&text_mutex); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; list_del(&item->next); kfree(item); break; } mutex_unlock(&text_mutex); } void alternatives_enable_smp(void) { struct smp_alt_module *mod; /* Why bother if there are no other CPUs? */ BUG_ON(num_possible_cpus() == 1); mutex_lock(&text_mutex); if (uniproc_patched) { pr_info("switching to SMP code\n"); BUG_ON(num_online_cpus() != 1); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); uniproc_patched = false; } mutex_unlock(&text_mutex); } /* * Return 1 if the address range is reserved for SMP-alternatives. * Must hold text_mutex. */ int alternatives_text_reserved(void *start, void *end) { struct smp_alt_module *mod; const s32 *poff; u8 *text_start = start; u8 *text_end = end; lockdep_assert_held(&text_mutex); list_for_each_entry(mod, &smp_alt_modules, next) { if (mod->text > text_end || mod->text_end < text_start) continue; for (poff = mod->locks; poff < mod->locks_end; poff++) { const u8 *ptr = (const u8 *)poff + *poff; if (text_start <= ptr && text_end > ptr) return 1; } } return 0; } #endif /* CONFIG_SMP */ /* * Self-test for the INT3 based CALL emulation code. * * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up * properly and that there is a stack gap between the INT3 frame and the * previous context. Without this gap doing a virtual PUSH on the interrupted * stack would corrupt the INT3 IRET frame. * * See entry_{32,64}.S for more details. */ /* * We define the int3_magic() function in assembly to control the calling * convention such that we can 'call' it from assembly. */ extern void int3_magic(unsigned int *ptr); /* defined in asm */ asm ( " .pushsection .init.text, \"ax\", @progbits\n" " .type int3_magic, @function\n" "int3_magic:\n" ANNOTATE_NOENDBR " movl $1, (%" _ASM_ARG1 ")\n" ASM_RET " .size int3_magic, .-int3_magic\n" " .popsection\n" ); extern void int3_selftest_ip(void); /* defined in asm below */ static int __init int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) { unsigned long selftest = (unsigned long)&int3_selftest_ip; struct die_args *args = data; struct pt_regs *regs = args->regs; OPTIMIZER_HIDE_VAR(selftest); if (!regs || user_mode(regs)) return NOTIFY_DONE; if (val != DIE_INT3) return NOTIFY_DONE; if (regs->ip - INT3_INSN_SIZE != selftest) return NOTIFY_DONE; int3_emulate_call(regs, (unsigned long)&int3_magic); return NOTIFY_STOP; } /* Must be noinline to ensure uniqueness of int3_selftest_ip. */ static noinline void __init int3_selftest(void) { static __initdata struct notifier_block int3_exception_nb = { .notifier_call = int3_exception_notify, .priority = INT_MAX-1, /* last */ }; unsigned int val = 0; BUG_ON(register_die_notifier(&int3_exception_nb)); /* * Basically: int3_magic(&val); but really complicated :-) * * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb * notifier above will emulate CALL for us. */ asm volatile ("int3_selftest_ip:\n\t" ANNOTATE_NOENDBR " int3; nop; nop; nop; nop\n\t" : ASM_CALL_CONSTRAINT : __ASM_SEL_RAW(a, D) (&val) : "memory"); BUG_ON(val != 1); unregister_die_notifier(&int3_exception_nb); } static __initdata int __alt_reloc_selftest_addr; extern void __init __alt_reloc_selftest(void *arg); __visible noinline void __init __alt_reloc_selftest(void *arg) { WARN_ON(arg != &__alt_reloc_selftest_addr); } static noinline void __init alt_reloc_selftest(void) { /* * Tests apply_relocation(). * * This has a relative immediate (CALL) in a place other than the first * instruction and additionally on x86_64 we get a RIP-relative LEA: * * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4 * * Getting this wrong will either crash and burn or tickle the WARN * above. */ asm_inline volatile ( ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS) : ASM_CALL_CONSTRAINT : [mem] "m" (__alt_reloc_selftest_addr) : _ASM_ARG1 ); } void __init alternative_instructions(void) { int3_selftest(); /* * The patching is not fully atomic, so try to avoid local * interruptions that might execute the to be patched code. * Other CPUs are not running. */ stop_nmi(); /* * Don't stop machine check exceptions while patching. * MCEs only happen when something got corrupted and in this * case we must do something about the corruption. * Ignoring it is worse than an unlikely patching race. * Also machine checks tend to be broadcast and if one CPU * goes into machine check the others follow quickly, so we don't * expect a machine check to cause undue problems during to code * patching. */ /* * Make sure to set (artificial) features depending on used paravirt * functions which can later influence alternative patching. */ paravirt_set_cap(); __apply_fineibt(__retpoline_sites, __retpoline_sites_end, __cfi_sites, __cfi_sites_end, NULL); /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. */ apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL); apply_returns(__return_sites, __return_sites_end, NULL); apply_alternatives(__alt_instructions, __alt_instructions_end, NULL); /* * Now all calls are established. Apply the call thunks if * required. */ callthunks_patch_builtin_calls(); /* * Seal all functions that do not have their address taken. */ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { uniproc_patched = true; alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, _text, _etext); } if (!uniproc_patched || num_possible_cpus() == 1) { free_init_pages("SMP alternatives", (unsigned long)__smp_locks, (unsigned long)__smp_locks_end); } #endif restart_nmi(); alternatives_patched = 1; alt_reloc_selftest(); } /** * text_poke_early - Update instructions on a live kernel at boot time * @addr: address to modify * @opcode: source of the copy * @len: length to copy * * When you use this code to patch more than one byte of an instruction * you need to make sure that other CPUs cannot execute this code in parallel. * Also no thread must be currently preempted in the middle of these * instructions. And on the local CPU you need to be protected against NMI or * MCE handlers seeing an inconsistent instruction while you patch. */ void __init_or_module text_poke_early(void *addr, const void *opcode, size_t len) { unsigned long flags; if (boot_cpu_has(X86_FEATURE_NX) && is_module_text_address((unsigned long)addr)) { /* * Modules text is marked initially as non-executable, so the * code cannot be running and speculative code-fetches are * prevented. Just change the code. */ memcpy(addr, opcode, len); } else { local_irq_save(flags); memcpy(addr, opcode, len); sync_core(); local_irq_restore(flags); /* * Could also do a CLFLUSH here to speed up CPU recovery; but * that causes hangs on some VIA CPUs. */ } } typedef struct { struct mm_struct *mm; } temp_mm_state_t; /* * Using a temporary mm allows to set temporary mappings that are not accessible * by other CPUs. Such mappings are needed to perform sensitive memory writes * that override the kernel memory protections (e.g., W^X), without exposing the * temporary page-table mappings that are required for these write operations to * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the * mapping is torn down. * * Context: The temporary mm needs to be used exclusively by a single core. To * harden security IRQs must be disabled while the temporary mm is * loaded, thereby preventing interrupt handler bugs from overriding * the kernel memory protection. */ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) { temp_mm_state_t temp_state; lockdep_assert_irqs_disabled(); /* * Make sure not to be in TLB lazy mode, as otherwise we'll end up * with a stale address space WITHOUT being in lazy mode after * restoring the previous mm. */ if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) leave_mm(); temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); switch_mm_irqs_off(NULL, mm, current); /* * If breakpoints are enabled, disable them while the temporary mm is * used. Userspace might set up watchpoints on addresses that are used * in the temporary mm, which would lead to wrong signals being sent or * crashes. * * Note that breakpoints are not disabled selectively, which also causes * kernel breakpoints (e.g., perf's) to be disabled. This might be * undesirable, but still seems reasonable as the code that runs in the * temporary mm should be short. */ if (hw_breakpoint_active()) hw_breakpoint_disable(); return temp_state; } static inline void unuse_temporary_mm(temp_mm_state_t prev_state) { lockdep_assert_irqs_disabled(); switch_mm_irqs_off(NULL, prev_state.mm, current); /* * Restore the breakpoints if they were disabled before the temporary mm * was loaded. */ if (hw_breakpoint_active()) hw_breakpoint_restore(); } __ro_after_init struct mm_struct *poking_mm; __ro_after_init unsigned long poking_addr; static void text_poke_memcpy(void *dst, const void *src, size_t len) { memcpy(dst, src, len); } static void text_poke_memset(void *dst, const void *src, size_t len) { int c = *(const int *)src; memset(dst, c, len); } typedef void text_poke_f(void *dst, const void *src, size_t len); static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; temp_mm_state_t prev; unsigned long flags; pte_t pte, *ptep; spinlock_t *ptl; pgprot_t pgprot; /* * While boot memory allocator is running we cannot use struct pages as * they are not yet initialized. There is no way to recover. */ BUG_ON(!after_bootmem); if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); if (cross_page_boundary) pages[1] = vmalloc_to_page(addr + PAGE_SIZE); } else { pages[0] = virt_to_page(addr); WARN_ON(!PageReserved(pages[0])); if (cross_page_boundary) pages[1] = virt_to_page(addr + PAGE_SIZE); } /* * If something went wrong, crash and burn since recovery paths are not * implemented. */ BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); /* * Map the page without the global bit, as TLB flushing is done with * flush_tlb_mm_range(), which is intended for non-global PTEs. */ pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); /* * The lock is not really needed, but this allows to avoid open-coding. */ ptep = get_locked_pte(poking_mm, poking_addr, &ptl); /* * This must not fail; preallocated in poking_init(). */ VM_BUG_ON(!ptep); local_irq_save(flags); pte = mk_pte(pages[0], pgprot); set_pte_at(poking_mm, poking_addr, ptep, pte); if (cross_page_boundary) { pte = mk_pte(pages[1], pgprot); set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); } /* * Loading the temporary mm behaves as a compiler barrier, which * guarantees that the PTE will be set at the time memcpy() is done. */ prev = use_temporary_mm(poking_mm); kasan_disable_current(); func((u8 *)poking_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* * Ensure that the PTE is only cleared after the instructions of memcpy * were issued by using a compiler barrier. */ barrier(); pte_clear(poking_mm, poking_addr, ptep); if (cross_page_boundary) pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); /* * Loading the previous page-table hierarchy requires a serializing * instruction that already allows the core to see the updated version. * Xen-PV is assumed to serialize execution in a similar manner. */ unuse_temporary_mm(prev); /* * Flushing the TLB might involve IPIs, which would require enabled * IRQs, but not if the mm is not used, as it is in this point. */ flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); if (func == text_poke_memcpy) { /* * If the text does not match what we just wrote then something is * fundamentally screwy; there's nothing we can really do about that. */ BUG_ON(memcmp(addr, src, len)); } local_irq_restore(flags); pte_unmap_unlock(ptep, ptl); return addr; } /** * text_poke - Update instructions on a live kernel * @addr: address to modify * @opcode: source of the copy * @len: length to copy * * Only atomic text poke/set should be allowed when not doing early patching. * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. * * Note that the caller must ensure that if the modified code is part of a * module, the module would not be removed during poking. This can be achieved * by registering a module notifier, and ordering module removal and patching * through a mutex. */ void *text_poke(void *addr, const void *opcode, size_t len) { lockdep_assert_held(&text_mutex); return __text_poke(text_poke_memcpy, addr, opcode, len); } /** * text_poke_kgdb - Update instructions on a live kernel by kgdb * @addr: address to modify * @opcode: source of the copy * @len: length to copy * * Only atomic text poke/set should be allowed when not doing early patching. * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. * * Context: should only be used by kgdb, which ensures no other core is running, * despite the fact it does not hold the text_mutex. */ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) { return __text_poke(text_poke_memcpy, addr, opcode, len); } void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok) { unsigned long start = (unsigned long)addr; size_t patched = 0; if (WARN_ON_ONCE(!core_ok && core_kernel_text(start))) return NULL; while (patched < len) { unsigned long ptr = start + patched; size_t s; s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); patched += s; } return addr; } /** * text_poke_copy - Copy instructions into (an unused part of) RX memory * @addr: address to modify * @opcode: source of the copy * @len: length to copy, could be more than 2x PAGE_SIZE * * Not safe against concurrent execution; useful for JITs to dump * new code blocks into unused regions of RX memory. Can be used in * conjunction with synchronize_rcu_tasks() to wait for existing * execution to quiesce after having made sure no existing functions * pointers are live. */ void *text_poke_copy(void *addr, const void *opcode, size_t len) { mutex_lock(&text_mutex); addr = text_poke_copy_locked(addr, opcode, len, false); mutex_unlock(&text_mutex); return addr; } /** * text_poke_set - memset into (an unused part of) RX memory * @addr: address to modify * @c: the byte to fill the area with * @len: length to copy, could be more than 2x PAGE_SIZE * * This is useful to overwrite unused regions of RX memory with illegal * instructions. */ void *text_poke_set(void *addr, int c, size_t len) { unsigned long start = (unsigned long)addr; size_t patched = 0; if (WARN_ON_ONCE(core_kernel_text(start))) return NULL; mutex_lock(&text_mutex); while (patched < len) { unsigned long ptr = start + patched; size_t s; s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s); patched += s; } mutex_unlock(&text_mutex); return addr; } static void do_sync_core(void *info) { sync_core(); } void text_poke_sync(void) { on_each_cpu(do_sync_core, NULL, 1); } /* * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of * this thing. When len == 6 everything is prefixed with 0x0f and we map * opcode to Jcc.d8, using len to distinguish. */ struct text_poke_loc { /* addr := _stext + rel_addr */ s32 rel_addr; s32 disp; u8 len; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; /* see text_poke_bp_batch() */ u8 old; }; struct bp_patching_desc { struct text_poke_loc *vec; int nr_entries; atomic_t refs; }; static struct bp_patching_desc bp_desc; static __always_inline struct bp_patching_desc *try_get_desc(void) { struct bp_patching_desc *desc = &bp_desc; if (!raw_atomic_inc_not_zero(&desc->refs)) return NULL; return desc; } static __always_inline void put_desc(void) { struct bp_patching_desc *desc = &bp_desc; smp_mb__before_atomic(); raw_atomic_dec(&desc->refs); } static __always_inline void *text_poke_addr(struct text_poke_loc *tp) { return _stext + tp->rel_addr; } static __always_inline int patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; if (key < text_poke_addr(tp)) return -1; if (key > text_poke_addr(tp)) return 1; return 0; } noinstr int poke_int3_handler(struct pt_regs *regs) { struct bp_patching_desc *desc; struct text_poke_loc *tp; int ret = 0; void *ip; if (user_mode(regs)) return 0; /* * Having observed our INT3 instruction, we now must observe * bp_desc with non-zero refcount: * * bp_desc.refs = 1 INT3 * WMB RMB * write INT3 if (bp_desc.refs != 0) */ smp_rmb(); desc = try_get_desc(); if (!desc) return 0; /* * Discount the INT3. See text_poke_bp_batch(). */ ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. */ if (unlikely(desc->nr_entries > 1)) { tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, sizeof(struct text_poke_loc), patch_cmp); if (!tp) goto out_put; } else { tp = desc->vec; if (text_poke_addr(tp) != ip) goto out_put; } ip += tp->len; switch (tp->opcode) { case INT3_INSN_OPCODE: /* * Someone poked an explicit INT3, they'll want to handle it, * do not consume. */ goto out_put; case RET_INSN_OPCODE: int3_emulate_ret(regs); break; case CALL_INSN_OPCODE: int3_emulate_call(regs, (long)ip + tp->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: int3_emulate_jmp(regs, (long)ip + tp->disp); break; case 0x70 ... 0x7f: /* Jcc */ int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp); break; default: BUG(); } ret = 1; out_put: put_desc(); return ret; } #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) static struct text_poke_loc tp_vec[TP_VEC_MAX]; static int tp_vec_nr; /** * text_poke_bp_batch() -- update instructions on live kernel on SMP * @tp: vector of instructions to patch * @nr_entries: number of entries in the vector * * Modify multi-byte instruction by using int3 breakpoint on SMP. * We completely avoid stop_machine() here, and achieve the * synchronization using int3 breakpoint. * * The way it is done: * - For each entry in the vector: * - add a int3 trap to the address that will be patched * - sync cores * - For each entry in the vector: * - update all but the first byte of the patched range * - sync cores * - For each entry in the vector: * - replace the first byte (int3) by the first byte of * replacing opcode * - sync cores */ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; lockdep_assert_held(&text_mutex); bp_desc.vec = tp; bp_desc.nr_entries = nr_entries; /* * Corresponds to the implicit memory barrier in try_get_desc() to * ensure reading a non-zero refcount provides up to date bp_desc data. */ atomic_set_release(&bp_desc.refs, 1); /* * Function tracing can enable thousands of places that need to be * updated. This can take quite some time, and with full kernel debugging * enabled, this could cause the softlockup watchdog to trigger. * This function gets called every 256 entries added to be patched. * Call cond_resched() here to make sure that other tasks can get scheduled * while processing all the functions being patched. */ cond_resched(); /* * Corresponding read barrier in int3 notifier for making sure the * nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); /* * First step: add a int3 trap to the address that will be patched. */ for (i = 0; i < nr_entries; i++) { tp[i].old = *(u8 *)text_poke_addr(&tp[i]); text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); } text_poke_sync(); /* * Second step: update all but the first byte of the patched range. */ for (do_sync = 0, i = 0; i < nr_entries; i++) { u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, }; u8 _new[POKE_MAX_OPCODE_SIZE+1]; const u8 *new = tp[i].text; int len = tp[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, text_poke_addr(&tp[i]) + INT3_INSN_SIZE, len - INT3_INSN_SIZE); if (len == 6) { _new[0] = 0x0f; memcpy(_new + 1, new, 5); new = _new; } text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, new + INT3_INSN_SIZE, len - INT3_INSN_SIZE); do_sync++; } /* * Emit a perf event to record the text poke, primarily to * support Intel PT decoding which must walk the executable code * to reconstruct the trace. The flow up to here is: * - write INT3 byte * - IPI-SYNC * - write instruction tail * At this point the actual control flow will be through the * INT3 and handler and not hit the old or new instruction. * Intel PT outputs FUP/TIP packets for the INT3, so the flow * can still be decoded. Subsequently: * - emit RECORD_TEXT_POKE with the new instruction * - IPI-SYNC * - write first byte * - IPI-SYNC * So before the text poke event timestamp, the decoder will see * either the old instruction flow or FUP/TIP of INT3. After the * text poke event timestamp, the decoder will see either the * new instruction flow or FUP/TIP of INT3. Thus decoders can * use the timestamp as the point at which to modify the * executable code. * The old instruction is recorded so that the event can be * processed forwards or backwards. */ perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len); } if (do_sync) { /* * According to Intel, this core syncing is very likely * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ text_poke_sync(); } /* * Third step: replace the first byte (int3) by the first byte of * replacing opcode. */ for (do_sync = 0, i = 0; i < nr_entries; i++) { u8 byte = tp[i].text[0]; if (tp[i].len == 6) byte = 0x0f; if (byte == INT3_INSN_OPCODE) continue; text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE); do_sync++; } if (do_sync) text_poke_sync(); /* * Remove and wait for refs to be zero. */ if (!atomic_dec_and_test(&bp_desc.refs)) atomic_cond_read_acquire(&bp_desc.refs, !VAL); } static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, const void *opcode, size_t len, const void *emulate) { struct insn insn; int ret, i = 0; if (len == 6) i = 1; memcpy((void *)tp->text, opcode+i, len-i); if (!emulate) emulate = opcode; ret = insn_decode_kernel(&insn, emulate); BUG_ON(ret < 0); tp->rel_addr = addr - (void *)_stext; tp->len = len; tp->opcode = insn.opcode.bytes[0]; if (is_jcc32(&insn)) { /* * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. */ tp->opcode = insn.opcode.bytes[1] - 0x10; } switch (tp->opcode) { case RET_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: /* * Control flow instructions without implied execution of the * next instruction can be padded with INT3. */ for (i = insn.length; i < len; i++) BUG_ON(tp->text[i] != INT3_INSN_OPCODE); break; default: BUG_ON(len != insn.length); } switch (tp->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: break; case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: case 0x70 ... 0x7f: /* Jcc */ tp->disp = insn.immediate.value; break; default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP8_INSN_OPCODE; tp->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP32_INSN_OPCODE; tp->disp = 0; break; default: /* unknown instruction */ BUG(); } break; } } /* * We hard rely on the tp_vec being ordered; ensure this is so by flushing * early if needed. */ static bool tp_order_fail(void *addr) { struct text_poke_loc *tp; if (!tp_vec_nr) return false; if (!addr) /* force */ return true; tp = &tp_vec[tp_vec_nr - 1]; if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) return true; return false; } static void text_poke_flush(void *addr) { if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { text_poke_bp_batch(tp_vec, tp_vec_nr); tp_vec_nr = 0; } } void text_poke_finish(void) { text_poke_flush(NULL); } void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) { struct text_poke_loc *tp; text_poke_flush(addr); tp = &tp_vec[tp_vec_nr++]; text_poke_loc_init(tp, addr, opcode, len, emulate); } /** * text_poke_bp() -- update instructions on live kernel on SMP * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy * @emulate: instruction to be emulated * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is * not possible to allocate memory. */ void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) { struct text_poke_loc tp; text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); }
65 65 65 65 65 65 65 65 65 65 65 65 65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_est.c: simple rate estimator for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * Network name space (netns) aware. * Global data moved to netns i.e struct netns_ipvs * Affected data: est_list and est_lock. * estimation_timer() runs with timer per netns. * get_stats()) do the per cpu summing. */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/types.h> #include <linux/interrupt.h> #include <linux/sysctl.h> #include <linux/list.h> #include <linux/rcupdate_wait.h> #include <net/ip_vs.h> /* This code is to estimate rate in a shorter interval (such as 8 seconds) for virtual services and real servers. For measure rate in a long interval, it is easy to implement a user level daemon which periodically reads those statistical counters and measure rate. We measure rate during the last 8 seconds every 2 seconds: avgrate = avgrate*(1-W) + rate*W where W = 2^(-2) NOTES. * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10. * Netlink users can see 64-bit values but sockopt users are restricted to 32-bit values for conns, packets, bps, cps and pps. * A lot of code is taken from net/core/gen_estimator.c KEY POINTS: - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled - kthreads read the cpustats to update the estimators (svcs, dests, total) - the states of estimators can be read (get stats) or modified (zero stats) from processes KTHREADS: - estimators are added initially to est_temp_list and later kthread 0 distributes them to one or many kthreads for estimation - kthread contexts are created and attached to array - the kthread tasks are started when first service is added, before that the total stats are not estimated - when configuration (cpulist/nice) is changed, the tasks are restarted by work (est_reload_work) - kthread tasks are stopped while the cpulist is empty - the kthread context holds lists with estimators (chains) which are processed every 2 seconds - as estimators can be added dynamically and in bursts, we try to spread them to multiple chains which are estimated at different time - on start, kthread 0 enters calculation phase to determine the chain limits and the limit of estimators per kthread - est_add_ktid: ktid where to add new ests, can point to empty slot where we should add kt data */ static struct lock_class_key __ipvs_est_key; static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs); static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs); static void ip_vs_chain_estimation(struct hlist_head *chain) { struct ip_vs_estimator *e; struct ip_vs_cpu_stats *c; struct ip_vs_stats *s; u64 rate; hlist_for_each_entry_rcu(e, chain, list) { u64 conns, inpkts, outpkts, inbytes, outbytes; u64 kconns = 0, kinpkts = 0, koutpkts = 0; u64 kinbytes = 0, koutbytes = 0; unsigned int start; int i; if (kthread_should_stop()) break; s = container_of(e, struct ip_vs_stats, est); for_each_possible_cpu(i) { c = per_cpu_ptr(s->cpustats, i); do { start = u64_stats_fetch_begin(&c->syncp); conns = u64_stats_read(&c->cnt.conns); inpkts = u64_stats_read(&c->cnt.inpkts); outpkts = u64_stats_read(&c->cnt.outpkts); inbytes = u64_stats_read(&c->cnt.inbytes); outbytes = u64_stats_read(&c->cnt.outbytes); } while (u64_stats_fetch_retry(&c->syncp, start)); kconns += conns; kinpkts += inpkts; koutpkts += outpkts; kinbytes += inbytes; koutbytes += outbytes; } spin_lock(&s->lock); s->kstats.conns = kconns; s->kstats.inpkts = kinpkts; s->kstats.outpkts = koutpkts; s->kstats.inbytes = kinbytes; s->kstats.outbytes = koutbytes; /* scaled by 2^10, but divided 2 seconds */ rate = (s->kstats.conns - e->last_conns) << 9; e->last_conns = s->kstats.conns; e->cps += ((s64)rate - (s64)e->cps) >> 2; rate = (s->kstats.inpkts - e->last_inpkts) << 9; e->last_inpkts = s->kstats.inpkts; e->inpps += ((s64)rate - (s64)e->inpps) >> 2; rate = (s->kstats.outpkts - e->last_outpkts) << 9; e->last_outpkts = s->kstats.outpkts; e->outpps += ((s64)rate - (s64)e->outpps) >> 2; /* scaled by 2^5, but divided 2 seconds */ rate = (s->kstats.inbytes - e->last_inbytes) << 4; e->last_inbytes = s->kstats.inbytes; e->inbps += ((s64)rate - (s64)e->inbps) >> 2; rate = (s->kstats.outbytes - e->last_outbytes) << 4; e->last_outbytes = s->kstats.outbytes; e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } } static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row) { struct ip_vs_est_tick_data *td; int cid; rcu_read_lock(); td = rcu_dereference(kd->ticks[row]); if (!td) goto out; for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) { if (kthread_should_stop()) break; ip_vs_chain_estimation(&td->chains[cid]); cond_resched_rcu(); td = rcu_dereference(kd->ticks[row]); if (!td) break; } out: rcu_read_unlock(); } static int ip_vs_estimation_kthread(void *data) { struct ip_vs_est_kt_data *kd = data; struct netns_ipvs *ipvs = kd->ipvs; int row = kd->est_row; unsigned long now; int id = kd->id; long gap; if (id > 0) { if (!ipvs->est_chain_max) return 0; } else { if (!ipvs->est_chain_max) { ipvs->est_calc_phase = 1; /* commit est_calc_phase before reading est_genid */ smp_mb(); } /* kthread 0 will handle the calc phase */ if (ipvs->est_calc_phase) ip_vs_est_calc_phase(ipvs); } while (1) { if (!id && !hlist_empty(&ipvs->est_temp_list)) ip_vs_est_drain_temp_list(ipvs); set_current_state(TASK_IDLE); if (kthread_should_stop()) break; /* before estimation, check if we should sleep */ now = jiffies; gap = kd->est_timer - now; if (gap > 0) { if (gap > IPVS_EST_TICK) { kd->est_timer = now - IPVS_EST_TICK; gap = IPVS_EST_TICK; } schedule_timeout(gap); } else { __set_current_state(TASK_RUNNING); if (gap < -8 * IPVS_EST_TICK) kd->est_timer = now; } if (kd->tick_len[row]) ip_vs_tick_estimation(kd, row); row++; if (row >= IPVS_EST_NTICKS) row = 0; WRITE_ONCE(kd->est_row, row); kd->est_timer += IPVS_EST_TICK; } __set_current_state(TASK_RUNNING); return 0; } /* Schedule stop/start for kthread tasks */ void ip_vs_est_reload_start(struct netns_ipvs *ipvs) { /* Ignore reloads before first service is added */ if (!ipvs->enable) return; ip_vs_est_stopped_recalc(ipvs); /* Bump the kthread configuration genid */ atomic_inc(&ipvs->est_genid); queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); } /* Start kthread task with current configuration */ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd) { unsigned long now; int ret = 0; long gap; lockdep_assert_held(&ipvs->est_mutex); if (kd->task) goto out; now = jiffies; gap = kd->est_timer - now; /* Sync est_timer if task is starting later */ if (abs(gap) > 4 * IPVS_EST_TICK) kd->est_timer = now; kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d", ipvs->gen, kd->id); if (IS_ERR(kd->task)) { ret = PTR_ERR(kd->task); kd->task = NULL; goto out; } set_user_nice(kd->task, sysctl_est_nice(ipvs)); set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs)); pr_info("starting estimator thread %d...\n", kd->id); wake_up_process(kd->task); out: return ret; } void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd) { if (kd->task) { pr_info("stopping estimator thread %d...\n", kd->id); kthread_stop(kd->task); kd->task = NULL; } } /* Apply parameters to kthread */ static void ip_vs_est_set_params(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd) { kd->chain_max = ipvs->est_chain_max; /* We are using single chain on RCU preemption */ if (IPVS_EST_TICK_CHAINS == 1) kd->chain_max *= IPVS_EST_CHAIN_FACTOR; kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max; kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max; } /* Create and start estimation kthread in a free or new array slot */ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) { struct ip_vs_est_kt_data *kd = NULL; int id = ipvs->est_kt_count; int ret = -ENOMEM; void *arr = NULL; int i; if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && ipvs->enable && ipvs->est_max_threads) return -EINVAL; mutex_lock(&ipvs->est_mutex); for (i = 0; i < id; i++) { if (!ipvs->est_kt_arr[i]) break; } if (i >= id) { arr = krealloc_array(ipvs->est_kt_arr, id + 1, sizeof(struct ip_vs_est_kt_data *), GFP_KERNEL); if (!arr) goto out; ipvs->est_kt_arr = arr; } else { id = i; } kd = kzalloc(sizeof(*kd), GFP_KERNEL); if (!kd) goto out; kd->ipvs = ipvs; bitmap_fill(kd->avail, IPVS_EST_NTICKS); kd->est_timer = jiffies; kd->id = id; ip_vs_est_set_params(ipvs, kd); /* Pre-allocate stats used in calc phase */ if (!id && !kd->calc_stats) { kd->calc_stats = ip_vs_stats_alloc(); if (!kd->calc_stats) goto out; } /* Start kthread tasks only when services are present */ if (ipvs->enable && !ip_vs_est_stopped(ipvs)) { ret = ip_vs_est_kthread_start(ipvs, kd); if (ret < 0) goto out; } if (arr) ipvs->est_kt_count++; ipvs->est_kt_arr[id] = kd; kd = NULL; /* Use most recent kthread for new ests */ ipvs->est_add_ktid = id; ret = 0; out: mutex_unlock(&ipvs->est_mutex); if (kd) { ip_vs_stats_free(kd->calc_stats); kfree(kd); } return ret; } /* Select ktid where to add new ests: available, unused or new slot */ static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs) { int ktid, best = ipvs->est_kt_count; struct ip_vs_est_kt_data *kd; for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) { kd = ipvs->est_kt_arr[ktid]; if (kd) { if (kd->est_count < kd->est_max_count) { best = ktid; break; } } else if (ktid < best) { best = ktid; } } ipvs->est_add_ktid = best; } /* Add estimator to current kthread (est_add_ktid) */ static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs, struct ip_vs_estimator *est) { struct ip_vs_est_kt_data *kd = NULL; struct ip_vs_est_tick_data *td; int ktid, row, crow, cid, ret; int delay = est->ktrow; BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127, "Too many chains for ktcid"); if (ipvs->est_add_ktid < ipvs->est_kt_count) { kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; if (kd) goto add_est; } ret = ip_vs_est_add_kthread(ipvs); if (ret < 0) goto out; kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; add_est: ktid = kd->id; /* For small number of estimators prefer to use few ticks, * otherwise try to add into the last estimated row. * est_row and add_row point after the row we should use */ if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1) crow = READ_ONCE(kd->est_row); else crow = kd->add_row; crow += delay; if (crow >= IPVS_EST_NTICKS) crow -= IPVS_EST_NTICKS; /* Assume initial delay ? */ if (delay >= IPVS_EST_NTICKS - 1) { /* Preserve initial delay or decrease it if no space in tick */ row = crow; if (crow < IPVS_EST_NTICKS - 1) { crow++; row = find_last_bit(kd->avail, crow); } if (row >= crow) row = find_last_bit(kd->avail, IPVS_EST_NTICKS); } else { /* Preserve delay or increase it if no space in tick */ row = IPVS_EST_NTICKS; if (crow > 0) row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow); if (row >= IPVS_EST_NTICKS) row = find_first_bit(kd->avail, IPVS_EST_NTICKS); } td = rcu_dereference_protected(kd->ticks[row], 1); if (!td) { td = kzalloc(sizeof(*td), GFP_KERNEL); if (!td) { ret = -ENOMEM; goto out; } rcu_assign_pointer(kd->ticks[row], td); } cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS); kd->est_count++; kd->tick_len[row]++; if (!td->chain_len[cid]) __set_bit(cid, td->present); td->chain_len[cid]++; est->ktid = ktid; est->ktrow = row; est->ktcid = cid; hlist_add_head_rcu(&est->list, &td->chains[cid]); if (td->chain_len[cid] >= kd->chain_max) { __set_bit(cid, td->full); if (kd->tick_len[row] >= kd->tick_max) __clear_bit(row, kd->avail); } /* Update est_add_ktid to point to first available/empty kt slot */ if (kd->est_count == kd->est_max_count) ip_vs_est_update_ktid(ipvs); ret = 0; out: return ret; } /* Start estimation for stats */ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; int ret; if (!ipvs->est_max_threads && ipvs->enable) ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); est->ktid = -1; est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ /* We prefer this code to be short, kthread 0 will requeue the * estimator to available chain. If tasks are disabled, we * will not allocate much memory, just for kt 0. */ ret = 0; if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0]) ret = ip_vs_est_add_kthread(ipvs); if (ret >= 0) hlist_add_head(&est->list, &ipvs->est_temp_list); else INIT_HLIST_NODE(&est->list); return ret; } static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd) { if (kd) { if (kd->task) { pr_info("stop unused estimator thread %d...\n", kd->id); kthread_stop(kd->task); } ip_vs_stats_free(kd->calc_stats); kfree(kd); } } /* Unlink estimator from chain */ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; struct ip_vs_est_tick_data *td; struct ip_vs_est_kt_data *kd; int ktid = est->ktid; int row = est->ktrow; int cid = est->ktcid; /* Failed to add to chain ? */ if (hlist_unhashed(&est->list)) return; /* On return, estimator can be freed, dequeue it now */ /* In est_temp_list ? */ if (ktid < 0) { hlist_del(&est->list); goto end_kt0; } hlist_del_rcu(&est->list); kd = ipvs->est_kt_arr[ktid]; td = rcu_dereference_protected(kd->ticks[row], 1); __clear_bit(cid, td->full); td->chain_len[cid]--; if (!td->chain_len[cid]) __clear_bit(cid, td->present); kd->tick_len[row]--; __set_bit(row, kd->avail); if (!kd->tick_len[row]) { RCU_INIT_POINTER(kd->ticks[row], NULL); kfree_rcu(td, rcu_head); } kd->est_count--; if (kd->est_count) { /* This kt slot can become available just now, prefer it */ if (ktid < ipvs->est_add_ktid) ipvs->est_add_ktid = ktid; return; } if (ktid > 0) { mutex_lock(&ipvs->est_mutex); ip_vs_est_kthread_destroy(kd); ipvs->est_kt_arr[ktid] = NULL; if (ktid == ipvs->est_kt_count - 1) { ipvs->est_kt_count--; while (ipvs->est_kt_count > 1 && !ipvs->est_kt_arr[ipvs->est_kt_count - 1]) ipvs->est_kt_count--; } mutex_unlock(&ipvs->est_mutex); /* This slot is now empty, prefer another available kt slot */ if (ktid == ipvs->est_add_ktid) ip_vs_est_update_ktid(ipvs); } end_kt0: /* kt 0 is freed after all other kthreads and chains are empty */ if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) { kd = ipvs->est_kt_arr[0]; if (!kd || !kd->est_count) { mutex_lock(&ipvs->est_mutex); if (kd) { ip_vs_est_kthread_destroy(kd); ipvs->est_kt_arr[0] = NULL; } ipvs->est_kt_count--; mutex_unlock(&ipvs->est_mutex); ipvs->est_add_ktid = 0; } } } /* Register all ests from est_temp_list to kthreads */ static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs) { struct ip_vs_estimator *est; while (1) { int max = 16; mutex_lock(&__ip_vs_mutex); while (max-- > 0) { est = hlist_entry_safe(ipvs->est_temp_list.first, struct ip_vs_estimator, list); if (est) { if (kthread_should_stop()) goto unlock; hlist_del_init(&est->list); if (ip_vs_enqueue_estimator(ipvs, est) >= 0) continue; est->ktid = -1; hlist_add_head(&est->list, &ipvs->est_temp_list); /* Abort, some entries will not be estimated * until next attempt */ } goto unlock; } mutex_unlock(&__ip_vs_mutex); cond_resched(); } unlock: mutex_unlock(&__ip_vs_mutex); } /* Calculate limits for all kthreads */ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct ip_vs_est_kt_data *kd; struct hlist_head chain; struct ip_vs_stats *s; int cache_factor = 4; int i, loops, ntest; s32 min_est = 0; ktime_t t1, t2; int max = 8; int ret = 1; s64 diff; u64 val; INIT_HLIST_HEAD(&chain); mutex_lock(&__ip_vs_mutex); kd = ipvs->est_kt_arr[0]; mutex_unlock(&__ip_vs_mutex); s = kd ? kd->calc_stats : NULL; if (!s) goto out; hlist_add_head(&s->est.list, &chain); loops = 1; /* Get best result from many tests */ for (ntest = 0; ntest < 12; ntest++) { if (!(ntest & 3)) { /* Wait for cpufreq frequency transition */ wait_event_idle_timeout(wq, kthread_should_stop(), HZ / 50); if (!ipvs->enable || kthread_should_stop()) goto stop; } local_bh_disable(); rcu_read_lock(); /* Put stats in cache */ ip_vs_chain_estimation(&chain); t1 = ktime_get(); for (i = loops * cache_factor; i > 0; i--) ip_vs_chain_estimation(&chain); t2 = ktime_get(); rcu_read_unlock(); local_bh_enable(); if (!ipvs->enable || kthread_should_stop()) goto stop; cond_resched(); diff = ktime_to_ns(ktime_sub(t2, t1)); if (diff <= 1 * NSEC_PER_USEC) { /* Do more loops on low time resolution */ loops *= 2; continue; } if (diff >= NSEC_PER_SEC) continue; val = diff; do_div(val, loops); if (!min_est || val < min_est) { min_est = val; /* goal: 95usec per chain */ val = 95 * NSEC_PER_USEC; if (val >= min_est) { do_div(val, min_est); max = (int)val; } else { max = 1; } } } out: if (s) hlist_del_init(&s->est.list); *chain_max = max; return ret; stop: ret = 0; goto out; } /* Calculate the parameters and apply them in context of kt #0 * ECP: est_calc_phase * ECM: est_chain_max * ECP ECM Insert Chain enable Description * --------------------------------------------------------------------------- * 0 0 est_temp_list 0 create kt #0 context * 0 0 est_temp_list 0->1 service added, start kthread #0 task * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase * 1 0 est_temp_list 1 kt #0: determine est_chain_max, * stop tasks, move ests to est_temp_list * and free kd for kthreads 1..last * 1->0 0->N kt chains 1 ests can go to kthreads * 0 N kt chains 1 drain est_temp_list, create new kthread * contexts, start tasks, estimate */ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) { int genid = atomic_read(&ipvs->est_genid); struct ip_vs_est_tick_data *td; struct ip_vs_est_kt_data *kd; struct ip_vs_estimator *est; struct ip_vs_stats *stats; int id, row, cid, delay; bool last, last_td; int chain_max; int step; if (!ip_vs_est_calc_limits(ipvs, &chain_max)) return; mutex_lock(&__ip_vs_mutex); /* Stop all other tasks, so that we can immediately move the * estimators to est_temp_list without RCU grace period */ mutex_lock(&ipvs->est_mutex); for (id = 1; id < ipvs->est_kt_count; id++) { /* netns clean up started, abort */ if (!ipvs->enable) goto unlock2; kd = ipvs->est_kt_arr[id]; if (!kd) continue; ip_vs_est_kthread_stop(kd); } mutex_unlock(&ipvs->est_mutex); /* Move all estimators to est_temp_list but carefully, * all estimators and kthread data can be released while * we reschedule. Even for kthread 0. */ step = 0; /* Order entries in est_temp_list in ascending delay, so now * walk delay(desc), id(desc), cid(asc) */ delay = IPVS_EST_NTICKS; next_delay: delay--; if (delay < 0) goto end_dequeue; last_kt: /* Destroy contexts backwards */ id = ipvs->est_kt_count; next_kt: if (!ipvs->enable || kthread_should_stop()) goto unlock; id--; if (id < 0) goto next_delay; kd = ipvs->est_kt_arr[id]; if (!kd) goto next_kt; /* kt 0 can exist with empty chains */ if (!id && kd->est_count <= 1) goto next_delay; row = kd->est_row + delay; if (row >= IPVS_EST_NTICKS) row -= IPVS_EST_NTICKS; td = rcu_dereference_protected(kd->ticks[row], 1); if (!td) goto next_kt; cid = 0; walk_chain: if (kthread_should_stop()) goto unlock; step++; if (!(step & 63)) { /* Give chance estimators to be added (to est_temp_list) * and deleted (releasing kthread contexts) */ mutex_unlock(&__ip_vs_mutex); cond_resched(); mutex_lock(&__ip_vs_mutex); /* Current kt released ? */ if (id >= ipvs->est_kt_count) goto last_kt; if (kd != ipvs->est_kt_arr[id]) goto next_kt; /* Current td released ? */ if (td != rcu_dereference_protected(kd->ticks[row], 1)) goto next_kt; /* No fatal changes on the current kd and td */ } est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator, list); if (!est) { cid++; if (cid >= IPVS_EST_TICK_CHAINS) goto next_kt; goto walk_chain; } /* We can cheat and increase est_count to protect kt 0 context * from release but we prefer to keep the last estimator */ last = kd->est_count <= 1; /* Do not free kt #0 data */ if (!id && last) goto next_delay; last_td = kd->tick_len[row] <= 1; stats = container_of(est, struct ip_vs_stats, est); ip_vs_stop_estimator(ipvs, stats); /* Tasks are stopped, move without RCU grace period */ est->ktid = -1; est->ktrow = row - kd->est_row; if (est->ktrow < 0) est->ktrow += IPVS_EST_NTICKS; hlist_add_head(&est->list, &ipvs->est_temp_list); /* kd freed ? */ if (last) goto next_kt; /* td freed ? */ if (last_td) goto next_kt; goto walk_chain; end_dequeue: /* All estimators removed while calculating ? */ if (!ipvs->est_kt_count) goto unlock; kd = ipvs->est_kt_arr[0]; if (!kd) goto unlock; kd->add_row = kd->est_row; ipvs->est_chain_max = chain_max; ip_vs_est_set_params(ipvs, kd); pr_info("using max %d ests per chain, %d per kthread\n", kd->chain_max, kd->est_max_count); /* Try to keep tot_stats in kt0, enqueue it early */ if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) && ipvs->tot_stats->s.est.ktid == -1) { hlist_del(&ipvs->tot_stats->s.est.list); hlist_add_head(&ipvs->tot_stats->s.est.list, &ipvs->est_temp_list); } mutex_lock(&ipvs->est_mutex); /* We completed the calc phase, new calc phase not requested */ if (genid == atomic_read(&ipvs->est_genid)) ipvs->est_calc_phase = 0; unlock2: mutex_unlock(&ipvs->est_mutex); unlock: mutex_unlock(&__ip_vs_mutex); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; struct ip_vs_kstats *k = &stats->kstats; /* reset counters, caller must hold the stats->lock lock */ est->last_inbytes = k->inbytes; est->last_outbytes = k->outbytes; est->last_conns = k->conns; est->last_inpkts = k->inpkts; est->last_outpkts = k->outpkts; est->cps = 0; est->inpps = 0; est->outpps = 0; est->inbps = 0; est->outbps = 0; } /* Get decoded rates */ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) { struct ip_vs_estimator *e = &stats->est; dst->cps = (e->cps + 0x1FF) >> 10; dst->inpps = (e->inpps + 0x1FF) >> 10; dst->outpps = (e->outpps + 0x1FF) >> 10; dst->inbps = (e->inbps + 0xF) >> 5; dst->outbps = (e->outbps + 0xF) >> 5; } int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { INIT_HLIST_HEAD(&ipvs->est_temp_list); ipvs->est_kt_arr = NULL; ipvs->est_max_threads = 0; ipvs->est_calc_phase = 0; ipvs->est_chain_max = 0; ipvs->est_kt_count = 0; ipvs->est_add_ktid = 0; atomic_set(&ipvs->est_genid, 0); atomic_set(&ipvs->est_genid_done, 0); __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key); return 0; } void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { int i; for (i = 0; i < ipvs->est_kt_count; i++) ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]); kfree(ipvs->est_kt_arr); mutex_destroy(&ipvs->est_mutex); }
422 420 163 422 422 418 497 463 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_DST_METADATA_H #define __NET_DST_METADATA_H 1 #include <linux/skbuff.h> #include <net/ip_tunnels.h> #include <net/macsec.h> #include <net/dst.h> enum metadata_type { METADATA_IP_TUNNEL, METADATA_HW_PORT_MUX, METADATA_MACSEC, METADATA_XFRM, }; struct hw_port_info { struct net_device *lower_dev; u32 port_id; }; struct macsec_info { sci_t sci; }; struct xfrm_md_info { u32 if_id; int link; struct dst_entry *dst_orig; }; struct metadata_dst { struct dst_entry dst; enum metadata_type type; union { struct ip_tunnel_info tun_info; struct hw_port_info port_info; struct macsec_info macsec_info; struct xfrm_md_info xfrm_info; } u; }; static inline struct metadata_dst *skb_metadata_dst(const struct sk_buff *skb) { struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb); if (md_dst && md_dst->dst.flags & DST_METADATA) return md_dst; return NULL; } static inline struct ip_tunnel_info * skb_tunnel_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; if (md_dst && md_dst->type == METADATA_IP_TUNNEL) return &md_dst->u.tun_info; dst = skb_dst(skb); if (dst && dst->lwtstate && (dst->lwtstate->type == LWTUNNEL_ENCAP_IP || dst->lwtstate->type == LWTUNNEL_ENCAP_IP6)) return lwt_tun_info(dst->lwtstate); return NULL; } static inline struct xfrm_md_info *lwt_xfrm_info(struct lwtunnel_state *lwt) { return (struct xfrm_md_info *)lwt->data; } static inline struct xfrm_md_info *skb_xfrm_md_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; if (md_dst && md_dst->type == METADATA_XFRM) return &md_dst->u.xfrm_info; dst = skb_dst(skb); if (dst && dst->lwtstate && dst->lwtstate->type == LWTUNNEL_ENCAP_XFRM) return lwt_xfrm_info(dst->lwtstate); return NULL; } static inline bool skb_valid_dst(const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); return dst && !(dst->flags & DST_METADATA); } static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, const struct sk_buff *skb_b) { const struct metadata_dst *a, *b; if (!(skb_a->_skb_refdst | skb_b->_skb_refdst)) return 0; a = (const struct metadata_dst *) skb_dst(skb_a); b = (const struct metadata_dst *) skb_dst(skb_b); if (!a != !b || a->type != b->type) return 1; switch (a->type) { case METADATA_HW_PORT_MUX: return memcmp(&a->u.port_info, &b->u.port_info, sizeof(a->u.port_info)); case METADATA_IP_TUNNEL: return memcmp(&a->u.tun_info, &b->u.tun_info, sizeof(a->u.tun_info) + a->u.tun_info.options_len); case METADATA_MACSEC: return memcmp(&a->u.macsec_info, &b->u.macsec_info, sizeof(a->u.macsec_info)); case METADATA_XFRM: return memcmp(&a->u.xfrm_info, &b->u.xfrm_info, sizeof(a->u.xfrm_info)); default: return 1; } } void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags); static inline struct metadata_dst *tun_rx_dst(int md_size) { struct metadata_dst *tun_dst; tun_dst = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC); if (!tun_dst) return NULL; tun_dst->u.tun_info.options_len = 0; tun_dst->u.tun_info.mode = 0; return tun_dst; } static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); int md_size; struct metadata_dst *new_md; if (!md_dst || md_dst->type != METADATA_IP_TUNNEL) return ERR_PTR(-EINVAL); md_size = md_dst->u.tun_info.options_len; new_md = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC); if (!new_md) return ERR_PTR(-ENOMEM); unsafe_memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, sizeof(struct ip_tunnel_info) + md_size, /* metadata_dst_alloc() reserves room (md_size bytes) for * options right after the ip_tunnel_info struct. */); #ifdef CONFIG_DST_CACHE /* Unclone the dst cache if there is one */ if (new_md->u.tun_info.dst_cache.cache) { int ret; ret = dst_cache_init(&new_md->u.tun_info.dst_cache, GFP_ATOMIC); if (ret) { metadata_dst_free(new_md); return ERR_PTR(ret); } } #endif skb_dst_drop(skb); skb_dst_set(skb, &new_md->dst); return new_md; } static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb) { struct metadata_dst *dst; dst = tun_dst_unclone(skb); if (IS_ERR(dst)) return NULL; return &dst->u.tun_info; } static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, __be32 daddr, __u8 tos, __u8 ttl, __be16 tp_dst, const unsigned long *flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; ip_tunnel_key_init(&tun_dst->u.tun_info.key, saddr, daddr, tos, ttl, 0, 0, tp_dst, tunnel_id, flags); return tun_dst; } static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, const unsigned long *flags, __be64 tunnel_id, int md_size) { const struct iphdr *iph = ip_hdr(skb); return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, 0, flags, tunnel_id, md_size); } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 tos, __u8 ttl, __be16 tp_dst, __be32 label, const unsigned long *flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; struct ip_tunnel_info *info; tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; info = &tun_dst->u.tun_info; info->mode = IP_TUNNEL_INFO_IPV6; ip_tunnel_flags_copy(info->key.tun_flags, flags); info->key.tun_id = tunnel_id; info->key.tp_src = 0; info->key.tp_dst = tp_dst; info->key.u.ipv6.src = *saddr; info->key.u.ipv6.dst = *daddr; info->key.tos = tos; info->key.ttl = ttl; info->key.label = label; return tun_dst; } static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, const unsigned long *flags, __be64 tunnel_id, int md_size) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); return __ipv6_tun_set_dst(&ip6h->saddr, &ip6h->daddr, ipv6_get_dsfield(ip6h), ip6h->hop_limit, 0, ip6_flowlabel(ip6h), flags, tunnel_id, md_size); } #endif /* __NET_DST_METADATA_H */
235 235 236 236 107 156 235 235 236 236 234 160 116 8 27 235 236 236 356 236 6 127 133 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> #include <linux/kstrtox.h> #include <linux/errno.h> #include <linux/wait.h> #include <linux/unistd.h> #include <linux/stddef.h> #include <linux/personality.h> #include <linux/uaccess.h> #include <linux/user-return-notifier.h> #include <linux/uprobes.h> #include <linux/context_tracking.h> #include <linux/entry-common.h> #include <linux/syscalls.h> #include <linux/rseq.h> #include <asm/processor.h> #include <asm/ucontext.h> #include <asm/fpu/signal.h> #include <asm/fpu/xstate.h> #include <asm/vdso.h> #include <asm/mce.h> #include <asm/sighandling.h> #include <asm/vm86.h> #include <asm/syscall.h> #include <asm/sigframe.h> #include <asm/signal.h> #include <asm/shstk.h> static inline int is_ia32_compat_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_IA32_EMULATION) && ksig->ka.sa.sa_flags & SA_IA32_ABI; } static inline int is_ia32_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); } static inline int is_x32_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_X86_X32_ABI) && ksig->ka.sa.sa_flags & SA_X32_ABI; } /* * Enable all pkeys temporarily, so as to ensure that both the current * execution stack as well as the alternate signal stack are writeable. * The application can use any of the available pkeys to protect the * alternate signal stack, and we don't know which one it is, so enable * all. The PKRU register will be reset to init_pkru later in the flow, * in fpu__clear_user_states(), and it is the application's responsibility * to enable the appropriate pkey as the first step in the signal handler * so that the handler does not segfault. */ static inline u32 sig_prepare_pkru(void) { u32 orig_pkru = read_pkru(); write_pkru(0); return orig_pkru; } /* * Set up a signal frame. */ /* x86 ABI requires 16-byte alignment */ #define FRAME_ALIGNMENT 16UL #define MAX_FRAME_PADDING (FRAME_ALIGNMENT - 1) /* * Determine which stack to use.. */ void __user * get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { struct k_sigaction *ka = &ksig->ka; int ia32_frame = is_ia32_frame(ksig); /* Default to using normal stack */ bool nested_altstack = on_sig_stack(regs->sp); bool entering_altstack = false; unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; u32 pkru; /* redzone */ if (!ia32_frame) sp -= 128; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { /* * This checks nested_altstack via sas_ss_flags(). Sensible * programs use SS_AUTODISARM, which disables that check, and * programs that don't use SS_AUTODISARM get compatible. */ if (sas_ss_flags(sp) == 0) { sp = current->sas_ss_sp + current->sas_ss_size; entering_altstack = true; } } else if (ia32_frame && !nested_altstack && regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { /* This is the legacy signal stack switching. */ sp = (unsigned long) ka->sa.sa_restorer; entering_altstack = true; } sp = fpu__alloc_mathframe(sp, ia32_frame, &buf_fx, &math_size); *fpstate = (void __user *)sp; sp -= frame_size; if (ia32_frame) /* * Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; else sp = round_down(sp, FRAME_ALIGNMENT) - 8; /* * If we are on the alternate signal stack and would overflow it, don't. * Return an always-bogus address instead so we will die with SIGSEGV. */ if (unlikely((nested_altstack || entering_altstack) && !__on_sig_stack(sp))) { if (show_unhandled_signals && printk_ratelimit()) pr_info("%s[%d] overflowed sigaltstack\n", current->comm, task_pid_nr(current)); return (void __user *)-1L; } /* Update PKRU to enable access to the alternate signal stack. */ pkru = sig_prepare_pkru(); /* save i387 and extended state */ if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) { /* * Restore PKRU to the original, user-defined value; disable * extra pkeys enabled for the alternate signal stack, if any. */ write_pkru(pkru); return (void __user *)-1L; } return (void __user *)sp; } /* * There are four different struct types for signal frame: sigframe_ia32, * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case * -- the largest size. It means the size for 64-bit apps is a bit more * than needed, but this keeps the code simple. */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) # define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32) #else # define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe) #endif /* * The FP state frame contains an XSAVE buffer which must be 64-byte aligned. * If a signal frame starts at an unaligned address, extra space is required. * This is the max alignment padding, conservatively. */ #define MAX_XSAVE_PADDING 63UL /* * The frame data is composed of the following areas and laid out as: * * ------------------------- * | alignment padding | * ------------------------- * | (f)xsave frame | * ------------------------- * | fsave header | * ------------------------- * | alignment padding | * ------------------------- * | siginfo + ucontext | * ------------------------- */ /* max_frame_size tells userspace the worst case signal stack size. */ static unsigned long __ro_after_init max_frame_size; static unsigned int __ro_after_init fpu_default_state_size; static int __init init_sigframe_size(void) { fpu_default_state_size = fpu__get_fpstate_size(); max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING; max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING; /* Userspace expects an aligned size. */ max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); pr_info("max sigframe size: %lu\n", max_frame_size); return 0; } early_initcall(init_sigframe_size); unsigned long get_sigframe_size(void) { return max_frame_size; } static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { /* Perform fixup for the pre-signal frame. */ rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ if (is_ia32_frame(ksig)) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) return ia32_setup_rt_frame(ksig, regs); else return ia32_setup_frame(ksig, regs); } else if (is_x32_frame(ksig)) { return x32_setup_rt_frame(ksig, regs); } else { return x64_setup_rt_frame(ksig, regs); } } static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { bool stepping, failed; struct fpu *fpu = &current->thread.fpu; if (v8086_mode(regs)) save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); /* Are we from a system call? */ if (syscall_get_nr(current, regs) != -1) { /* If so, check system call restarting.. */ switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->ax = -EINTR; break; case -ERESTARTSYS: if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { regs->ax = -EINTR; break; } fallthrough; case -ERESTARTNOINTR: regs->ax = regs->orig_ax; regs->ip -= 2; break; } } /* * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now * so that register information in the sigcontext is correct and * then notify the tracer before entering the signal handler. */ stepping = test_thread_flag(TIF_SINGLESTEP); if (stepping) user_disable_single_step(current); failed = (setup_rt_frame(ksig, regs) < 0); if (!failed) { /* * Clear the direction flag as per the ABI for function entry. * * Clear RF when entering the signal handler, because * it might disable possible debug exception from the * signal handler. * * Clear TF for the case when it wasn't set by debugger to * avoid the recursive send_sigtrap() in SIGTRAP handler. */ regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); /* * Ensure the signal handler starts with the new fpu state. */ fpu__clear_user_states(fpu); } signal_setup_done(failed, ksig, stepping); } static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { #ifdef CONFIG_IA32_EMULATION if (current->restart_block.arch_data & TS_COMPAT) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); #else return __NR_restart_syscall; #endif } /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ void arch_do_signal_or_restart(struct pt_regs *regs) { struct ksignal ksig; if (get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ handle_signal(&ksig, regs); return; } /* Did we come from a system call? */ if (syscall_get_nr(current, regs) != -1) { /* Restart the system call - no handlers present */ switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: regs->ax = regs->orig_ax; regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: regs->ax = get_nr_restart_syscall(regs); regs->ip -= 2; break; } } /* * If there's no signal to deliver, we just put the saved sigmask * back. */ restore_saved_sigmask(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; if (show_unhandled_signals && printk_ratelimit()) { printk("%s" "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } force_sig(SIGSEGV); } #ifdef CONFIG_DYNAMIC_SIGFRAME #ifdef CONFIG_STRICT_SIGALTSTACK_SIZE static bool strict_sigaltstack_size __ro_after_init = true; #else static bool strict_sigaltstack_size __ro_after_init = false; #endif static int __init strict_sas_size(char *arg) { return kstrtobool(arg, &strict_sigaltstack_size) == 0; } __setup("strict_sas_size", strict_sas_size); /* * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512 * exceeds that size already. As such programs might never use the * sigaltstack they just continued to work. While always checking against * the real size would be correct, this might be considered a regression. * * Therefore avoid the sanity check, unless enforced by kernel * configuration or command line option. * * When dynamic FPU features are supported, the check is also enforced when * the task has permissions to use dynamic features. Tasks which have no * permission are checked against the size of the non-dynamic feature set * if strict checking is enabled. This avoids forcing all tasks on the * system to allocate large sigaltstacks even if they are never going * to use a dynamic feature. As this is serialized via sighand::siglock * any permission request for a dynamic feature either happened already * or will see the newly install sigaltstack size in the permission checks. */ bool sigaltstack_size_valid(size_t ss_size) { unsigned long fsize = max_frame_size - fpu_default_state_size; u64 mask; lockdep_assert_held(&current->sighand->siglock); if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) return true; fsize += current->group_leader->thread.fpu.perm.__user_state_size; if (likely(ss_size > fsize)) return true; if (strict_sigaltstack_size) return ss_size > fsize; mask = current->group_leader->thread.fpu.perm.__state_perm; if (mask & XFEATURE_MASK_USER_DYNAMIC) return ss_size > fsize; return true; } #endif /* CONFIG_DYNAMIC_SIGFRAME */
65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 /* * Copyright (c) 2001 The Regents of the University of Michigan. * All rights reserved. * * Kendrick Smith <kmsmith@umich.edu> * Andy Adamson <kandros@umich.edu> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include <linux/file.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/addr.h> #include <linux/jhash.h> #include <linux/string_helpers.h> #include <linux/fsnotify.h> #include <linux/rhashtable.h> #include <linux/nfs_ssc.h> #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" #include "current_stateid.h" #include "netns.h" #include "pnfs.h" #include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PROC #define all_ones {{ ~0, ~0}, ~0} static const stateid_t one_stateid = { .si_generation = ~0, .si_opaque = all_ones, }; static const stateid_t zero_stateid = { /* all fields zero */ }; static const stateid_t currentstateid = { .si_generation = 1, }; static const stateid_t close_stateid = { .si_generation = 0xffffffffU, }; static u64 current_sessionid = 1; #define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) #define CURRENT_STATEID(stateid) (!memcmp((stateid), &currentstateid, sizeof(stateid_t))) #define CLOSE_STATEID(stateid) (!memcmp((stateid), &close_stateid, sizeof(stateid_t))) /* forward declarations */ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner); static void nfs4_free_ol_stateid(struct nfs4_stid *stid); void nfsd4_end_grace(struct nfsd_net *nn); static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps); static void nfsd4_file_hash_remove(struct nfs4_file *fi); static void deleg_reaper(struct nfsd_net *nn); /* Locking: */ /* * Currently used for the del_recall_lru and file hash table. In an * effort to decrease the scope of the client_mutex, this spinlock may * eventually cover more: */ static DEFINE_SPINLOCK(state_lock); enum nfsd4_st_mutex_lock_subclass { OPEN_STATEID_MUTEX = 0, LOCK_STATEID_MUTEX = 1, }; /* * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for * the refcount on the open stateid to drop. */ static DECLARE_WAIT_QUEUE_HEAD(close_wq); /* * A waitqueue where a writer to clients/#/ctl destroying a client can * wait for cl_rpc_users to drop to 0 and then for the client to be * unhashed. */ static DECLARE_WAIT_QUEUE_HEAD(expiry_wq); static struct kmem_cache *client_slab; static struct kmem_cache *openowner_slab; static struct kmem_cache *lockowner_slab; static struct kmem_cache *file_slab; static struct kmem_cache *stateid_slab; static struct kmem_cache *deleg_slab; static struct kmem_cache *odstate_slab; static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops; static struct workqueue_struct *laundry_wq; int nfsd4_create_laundry_wq(void) { int rc = 0; laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); if (laundry_wq == NULL) rc = -ENOMEM; return rc; } void nfsd4_destroy_laundry_wq(void) { destroy_workqueue(laundry_wq); } static bool is_session_dead(struct nfsd4_session *ses) { return ses->se_dead; } static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me) { if (atomic_read(&ses->se_ref) > ref_held_by_me) return nfserr_jukebox; ses->se_dead = true; return nfs_ok; } static bool is_client_expired(struct nfs4_client *clp) { return clp->cl_time == 0; } static void nfsd4_dec_courtesy_client_count(struct nfsd_net *nn, struct nfs4_client *clp) { if (clp->cl_state != NFSD4_ACTIVE) atomic_add_unless(&nn->nfsd_courtesy_clients, -1, 0); } static __be32 get_client_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); if (is_client_expired(clp)) return nfserr_expired; atomic_inc(&clp->cl_rpc_users); nfsd4_dec_courtesy_client_count(nn, clp); clp->cl_state = NFSD4_ACTIVE; return nfs_ok; } /* must be called under the client_lock */ static inline void renew_client_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (is_client_expired(clp)) { WARN_ON(1); printk("%s: client (clientid %08x/%08x) already expired\n", __func__, clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); return; } list_move_tail(&clp->cl_lru, &nn->client_lru); clp->cl_time = ktime_get_boottime_seconds(); nfsd4_dec_courtesy_client_count(nn, clp); clp->cl_state = NFSD4_ACTIVE; } static void put_client_renew_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); if (!atomic_dec_and_test(&clp->cl_rpc_users)) return; if (!is_client_expired(clp)) renew_client_locked(clp); else wake_up_all(&expiry_wq); } static void put_client_renew(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (!atomic_dec_and_lock(&clp->cl_rpc_users, &nn->client_lock)) return; if (!is_client_expired(clp)) renew_client_locked(clp); else wake_up_all(&expiry_wq); spin_unlock(&nn->client_lock); } static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses) { __be32 status; if (is_session_dead(ses)) return nfserr_badsession; status = get_client_locked(ses->se_client); if (status) return status; atomic_inc(&ses->se_ref); return nfs_ok; } static void nfsd4_put_session_locked(struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses)) free_session(ses); put_client_renew_locked(clp); } static void nfsd4_put_session(struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); spin_lock(&nn->client_lock); nfsd4_put_session_locked(ses); spin_unlock(&nn->client_lock); } static struct nfsd4_blocked_lock * find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, struct nfsd_net *nn) { struct nfsd4_blocked_lock *cur, *found = NULL; spin_lock(&nn->blocked_locks_lock); list_for_each_entry(cur, &lo->lo_blocked, nbl_list) { if (fh_match(fh, &cur->nbl_fh)) { list_del_init(&cur->nbl_list); WARN_ON(list_empty(&cur->nbl_lru)); list_del_init(&cur->nbl_lru); found = cur; break; } } spin_unlock(&nn->blocked_locks_lock); if (found) locks_delete_block(&found->nbl_lock); return found; } static struct nfsd4_blocked_lock * find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, struct nfsd_net *nn) { struct nfsd4_blocked_lock *nbl; nbl = find_blocked_lock(lo, fh, nn); if (!nbl) { nbl = kmalloc(sizeof(*nbl), GFP_KERNEL); if (nbl) { INIT_LIST_HEAD(&nbl->nbl_list); INIT_LIST_HEAD(&nbl->nbl_lru); fh_copy_shallow(&nbl->nbl_fh, fh); locks_init_lock(&nbl->nbl_lock); kref_init(&nbl->nbl_kref); nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client, &nfsd4_cb_notify_lock_ops, NFSPROC4_CLNT_CB_NOTIFY_LOCK); } } return nbl; } static void free_nbl(struct kref *kref) { struct nfsd4_blocked_lock *nbl; nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref); locks_release_private(&nbl->nbl_lock); kfree(nbl); } static void free_blocked_lock(struct nfsd4_blocked_lock *nbl) { locks_delete_block(&nbl->nbl_lock); kref_put(&nbl->nbl_kref, free_nbl); } static void remove_blocked_locks(struct nfs4_lockowner *lo) { struct nfs4_client *clp = lo->lo_owner.so_client; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct nfsd4_blocked_lock *nbl; LIST_HEAD(reaplist); /* Dequeue all blocked locks */ spin_lock(&nn->blocked_locks_lock); while (!list_empty(&lo->lo_blocked)) { nbl = list_first_entry(&lo->lo_blocked, struct nfsd4_blocked_lock, nbl_list); list_del_init(&nbl->nbl_list); WARN_ON(list_empty(&nbl->nbl_lru)); list_move(&nbl->nbl_lru, &reaplist); } spin_unlock(&nn->blocked_locks_lock); /* Now free them */ while (!list_empty(&reaplist)) { nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); free_blocked_lock(nbl); } } static void nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb) { struct nfsd4_blocked_lock *nbl = container_of(cb, struct nfsd4_blocked_lock, nbl_cb); locks_delete_block(&nbl->nbl_lock); } static int nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) { trace_nfsd_cb_notify_lock_done(&zero_stateid, task); /* * Since this is just an optimization, we don't try very hard if it * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and * just quit trying on anything else. */ switch (task->tk_status) { case -NFS4ERR_DELAY: rpc_delay(task, 1 * HZ); return 0; default: return 1; } } static void nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb) { struct nfsd4_blocked_lock *nbl = container_of(cb, struct nfsd4_blocked_lock, nbl_cb); free_blocked_lock(nbl); } static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { .prepare = nfsd4_cb_notify_lock_prepare, .done = nfsd4_cb_notify_lock_done, .release = nfsd4_cb_notify_lock_release, .opcode = OP_CB_NOTIFY_LOCK, }; /* * We store the NONE, READ, WRITE, and BOTH bits separately in the * st_{access,deny}_bmap field of the stateid, in order to track not * only what share bits are currently in force, but also what * combinations of share bits previous opens have used. This allows us * to enforce the recommendation in * https://datatracker.ietf.org/doc/html/rfc7530#section-16.19.4 that * the server return an error if the client attempt to downgrade to a * combination of share bits not explicable by closing some of its * previous opens. * * This enforcement is arguably incomplete, since we don't keep * track of access/deny bit combinations; so, e.g., we allow: * * OPEN allow read, deny write * OPEN allow both, deny none * DOWNGRADE allow read, deny none * * which we should reject. * * But you could also argue that our current code is already overkill, * since it only exists to return NFS4ERR_INVAL on incorrect client * behavior. */ static unsigned int bmap_to_share_mode(unsigned long bmap) { int i; unsigned int access = 0; for (i = 1; i < 4; i++) { if (test_bit(i, &bmap)) access |= i; } return access; } /* set share access for a given stateid */ static inline void set_access(u32 access, struct nfs4_ol_stateid *stp) { unsigned char mask = 1 << access; WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); stp->st_access_bmap |= mask; } /* clear share access for a given stateid */ static inline void clear_access(u32 access, struct nfs4_ol_stateid *stp) { unsigned char mask = 1 << access; WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); stp->st_access_bmap &= ~mask; } /* test whether a given stateid has access */ static inline bool test_access(u32 access, struct nfs4_ol_stateid *stp) { unsigned char mask = 1 << access; return (bool)(stp->st_access_bmap & mask); } /* set share deny for a given stateid */ static inline void set_deny(u32 deny, struct nfs4_ol_stateid *stp) { unsigned char mask = 1 << deny; WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); stp->st_deny_bmap |= mask; } /* clear share deny for a given stateid */ static inline void clear_deny(u32 deny, struct nfs4_ol_stateid *stp) { unsigned char mask = 1 << deny; WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); stp->st_deny_bmap &= ~mask; } /* test whether a given stateid is denying specific access */ static inline bool test_deny(u32 deny, struct nfs4_ol_stateid *stp) { unsigned char mask = 1 << deny; return (bool)(stp->st_deny_bmap & mask); } static int nfs4_access_to_omode(u32 access) { switch (access & NFS4_SHARE_ACCESS_BOTH) { case NFS4_SHARE_ACCESS_READ: return O_RDONLY; case NFS4_SHARE_ACCESS_WRITE: return O_WRONLY; case NFS4_SHARE_ACCESS_BOTH: return O_RDWR; } WARN_ON_ONCE(1); return O_RDONLY; } static inline int access_permit_read(struct nfs4_ol_stateid *stp) { return test_access(NFS4_SHARE_ACCESS_READ, stp) || test_access(NFS4_SHARE_ACCESS_BOTH, stp) || test_access(NFS4_SHARE_ACCESS_WRITE, stp); } static inline int access_permit_write(struct nfs4_ol_stateid *stp) { return test_access(NFS4_SHARE_ACCESS_WRITE, stp) || test_access(NFS4_SHARE_ACCESS_BOTH, stp); } static inline struct nfs4_stateowner * nfs4_get_stateowner(struct nfs4_stateowner *sop) { atomic_inc(&sop->so_count); return sop; } static int same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner) { return (sop->so_owner.len == owner->len) && 0 == memcmp(sop->so_owner.data, owner->data, owner->len); } static struct nfs4_openowner * find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, struct nfs4_client *clp) { struct nfs4_stateowner *so; lockdep_assert_held(&clp->cl_lock); list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[hashval], so_strhash) { if (!so->so_is_open_owner) continue; if (same_owner_str(so, &open->op_owner)) return openowner(nfs4_get_stateowner(so)); } return NULL; } static inline u32 opaque_hashval(const void *ptr, int nbytes) { unsigned char *cptr = (unsigned char *) ptr; u32 x = 0; while (nbytes--) { x *= 37; x += *cptr++; } return x; } void put_nfs4_file(struct nfs4_file *fi) { if (refcount_dec_and_test(&fi->fi_ref)) { nfsd4_file_hash_remove(fi); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); kfree_rcu(fi, fi_rcu); } } static struct nfsd_file * find_writeable_file_locked(struct nfs4_file *f) { struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); ret = nfsd_file_get(f->fi_fds[O_WRONLY]); if (!ret) ret = nfsd_file_get(f->fi_fds[O_RDWR]); return ret; } static struct nfsd_file * find_writeable_file(struct nfs4_file *f) { struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_writeable_file_locked(f); spin_unlock(&f->fi_lock); return ret; } static struct nfsd_file * find_readable_file_locked(struct nfs4_file *f) { struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); ret = nfsd_file_get(f->fi_fds[O_RDONLY]); if (!ret) ret = nfsd_file_get(f->fi_fds[O_RDWR]); return ret; } static struct nfsd_file * find_readable_file(struct nfs4_file *f) { struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_readable_file_locked(f); spin_unlock(&f->fi_lock); return ret; } static struct nfsd_file * find_rw_file(struct nfs4_file *f) { struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = nfsd_file_get(f->fi_fds[O_RDWR]); spin_unlock(&f->fi_lock); return ret; } struct nfsd_file * find_any_file(struct nfs4_file *f) { struct nfsd_file *ret; if (!f) return NULL; spin_lock(&f->fi_lock); ret = nfsd_file_get(f->fi_fds[O_RDWR]); if (!ret) { ret = nfsd_file_get(f->fi_fds[O_WRONLY]); if (!ret) ret = nfsd_file_get(f->fi_fds[O_RDONLY]); } spin_unlock(&f->fi_lock); return ret; } static struct nfsd_file *find_any_file_locked(struct nfs4_file *f) { lockdep_assert_held(&f->fi_lock); if (f->fi_fds[O_RDWR]) return f->fi_fds[O_RDWR]; if (f->fi_fds[O_WRONLY]) return f->fi_fds[O_WRONLY]; if (f->fi_fds[O_RDONLY]) return f->fi_fds[O_RDONLY]; return NULL; } static atomic_long_t num_delegations; unsigned long max_delegations; /* * Open owner state (share locks) */ /* hash tables for lock and open owners */ #define OWNER_HASH_BITS 8 #define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) #define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) { unsigned int ret; ret = opaque_hashval(ownername->data, ownername->len); return ret & OWNER_HASH_MASK; } static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp; static const struct rhashtable_params nfs4_file_rhash_params = { .key_len = sizeof_field(struct nfs4_file, fi_inode), .key_offset = offsetof(struct nfs4_file, fi_inode), .head_offset = offsetof(struct nfs4_file, fi_rlist), /* * Start with a single page hash table to reduce resizing churn * on light workloads. */ .min_size = 256, .automatic_shrinking = true, }; /* * Check if courtesy clients have conflicting access and resolve it if possible * * access: is op_share_access if share_access is true. * Check if access mode, op_share_access, would conflict with * the current deny mode of the file 'fp'. * access: is op_share_deny if share_access is false. * Check if the deny mode, op_share_deny, would conflict with * current access of the file 'fp'. * stp: skip checking this entry. * new_stp: normal open, not open upgrade. * * Function returns: * false - access/deny mode conflict with normal client. * true - no conflict or conflict with courtesy client(s) is resolved. */ static bool nfs4_resolve_deny_conflicts_locked(struct nfs4_file *fp, bool new_stp, struct nfs4_ol_stateid *stp, u32 access, bool share_access) { struct nfs4_ol_stateid *st; bool resolvable = true; unsigned char bmap; struct nfsd_net *nn; struct nfs4_client *clp; lockdep_assert_held(&fp->fi_lock); list_for_each_entry(st, &fp->fi_stateids, st_perfile) { /* ignore lock stateid */ if (st->st_openstp) continue; if (st == stp && new_stp) continue; /* check file access against deny mode or vice versa */ bmap = share_access ? st->st_deny_bmap : st->st_access_bmap; if (!(access & bmap_to_share_mode(bmap))) continue; clp = st->st_stid.sc_client; if (try_to_expire_client(clp)) continue; resolvable = false; break; } if (resolvable) { clp = stp->st_stid.sc_client; nn = net_generic(clp->net, nfsd_net_id); mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); } return resolvable; } static void __nfs4_file_get_access(struct nfs4_file *fp, u32 access) { lockdep_assert_held(&fp->fi_lock); if (access & NFS4_SHARE_ACCESS_WRITE) atomic_inc(&fp->fi_access[O_WRONLY]); if (access & NFS4_SHARE_ACCESS_READ) atomic_inc(&fp->fi_access[O_RDONLY]); } static __be32 nfs4_file_get_access(struct nfs4_file *fp, u32 access) { lockdep_assert_held(&fp->fi_lock); /* Does this access mode make sense? */ if (access & ~NFS4_SHARE_ACCESS_BOTH) return nfserr_inval; /* Does it conflict with a deny mode already set? */ if ((access & fp->fi_share_deny) != 0) return nfserr_share_denied; __nfs4_file_get_access(fp, access); return nfs_ok; } static __be32 nfs4_file_check_deny(struct nfs4_file *fp, u32 deny) { /* Common case is that there is no deny mode. */ if (deny) { /* Does this deny mode make sense? */ if (deny & ~NFS4_SHARE_DENY_BOTH) return nfserr_inval; if ((deny & NFS4_SHARE_DENY_READ) && atomic_read(&fp->fi_access[O_RDONLY])) return nfserr_share_denied; if ((deny & NFS4_SHARE_DENY_WRITE) && atomic_read(&fp->fi_access[O_WRONLY])) return nfserr_share_denied; } return nfs_ok; } static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) { might_lock(&fp->fi_lock); if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) { struct nfsd_file *f1 = NULL; struct nfsd_file *f2 = NULL; swap(f1, fp->fi_fds[oflag]); if (atomic_read(&fp->fi_access[1 - oflag]) == 0) swap(f2, fp->fi_fds[O_RDWR]); spin_unlock(&fp->fi_lock); if (f1) nfsd_file_put(f1); if (f2) nfsd_file_put(f2); } } static void nfs4_file_put_access(struct nfs4_file *fp, u32 access) { WARN_ON_ONCE(access & ~NFS4_SHARE_ACCESS_BOTH); if (access & NFS4_SHARE_ACCESS_WRITE) __nfs4_file_put_access(fp, O_WRONLY); if (access & NFS4_SHARE_ACCESS_READ) __nfs4_file_put_access(fp, O_RDONLY); } /* * Allocate a new open/delegation state counter. This is needed for * pNFS for proper return on close semantics. * * Note that we only allocate it for pNFS-enabled exports, otherwise * all pointers to struct nfs4_clnt_odstate are always NULL. */ static struct nfs4_clnt_odstate * alloc_clnt_odstate(struct nfs4_client *clp) { struct nfs4_clnt_odstate *co; co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL); if (co) { co->co_client = clp; refcount_set(&co->co_odcount, 1); } return co; } static void hash_clnt_odstate_locked(struct nfs4_clnt_odstate *co) { struct nfs4_file *fp = co->co_file; lockdep_assert_held(&fp->fi_lock); list_add(&co->co_perfile, &fp->fi_clnt_odstate); } static inline void get_clnt_odstate(struct nfs4_clnt_odstate *co) { if (co) refcount_inc(&co->co_odcount); } static void put_clnt_odstate(struct nfs4_clnt_odstate *co) { struct nfs4_file *fp; if (!co) return; fp = co->co_file; if (refcount_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { list_del(&co->co_perfile); spin_unlock(&fp->fi_lock); nfsd4_return_all_file_layouts(co->co_client, fp); kmem_cache_free(odstate_slab, co); } } static struct nfs4_clnt_odstate * find_or_hash_clnt_odstate(struct nfs4_file *fp, struct nfs4_clnt_odstate *new) { struct nfs4_clnt_odstate *co; struct nfs4_client *cl; if (!new) return NULL; cl = new->co_client; spin_lock(&fp->fi_lock); list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) { if (co->co_client == cl) { get_clnt_odstate(co); goto out; } } co = new; co->co_file = fp; hash_clnt_odstate_locked(new); out: spin_unlock(&fp->fi_lock); return co; } struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, void (*sc_free)(struct nfs4_stid *)) { struct nfs4_stid *stid; int new_id; stid = kmem_cache_zalloc(slab, GFP_KERNEL); if (!stid) return NULL; idr_preload(GFP_KERNEL); spin_lock(&cl->cl_lock); /* Reserving 0 for start of file in nfsdfs "states" file: */ new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 1, 0, GFP_NOWAIT); spin_unlock(&cl->cl_lock); idr_preload_end(); if (new_id < 0) goto out_free; stid->sc_free = sc_free; stid->sc_client = cl; stid->sc_stateid.si_opaque.so_id = new_id; stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; /* Will be incremented before return to client: */ refcount_set(&stid->sc_count, 1); spin_lock_init(&stid->sc_lock); INIT_LIST_HEAD(&stid->sc_cp_list); /* * It shouldn't be a problem to reuse an opaque stateid value. * I don't think it is for 4.1. But with 4.0 I worry that, for * example, a stray write retransmission could be accepted by * the server when it should have been rejected. Therefore, * adopt a trick from the sctp code to attempt to maximize the * amount of time until an id is reused, by ensuring they always * "increase" (mod INT_MAX): */ return stid; out_free: kmem_cache_free(slab, stid); return NULL; } /* * Create a unique stateid_t to represent each COPY. */ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid, unsigned char cs_type) { int new_id; stid->cs_stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time; stid->cs_stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id; idr_preload(GFP_KERNEL); spin_lock(&nn->s2s_cp_lock); new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT); stid->cs_stid.si_opaque.so_id = new_id; stid->cs_stid.si_generation = 1; spin_unlock(&nn->s2s_cp_lock); idr_preload_end(); if (new_id < 0) return 0; stid->cs_type = cs_type; return 1; } int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy) { return nfs4_init_cp_state(nn, &copy->cp_stateid, NFS4_COPY_STID); } struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn, struct nfs4_stid *p_stid) { struct nfs4_cpntf_state *cps; cps = kzalloc(sizeof(struct nfs4_cpntf_state), GFP_KERNEL); if (!cps) return NULL; cps->cpntf_time = ktime_get_boottime_seconds(); refcount_set(&cps->cp_stateid.cs_count, 1); if (!nfs4_init_cp_state(nn, &cps->cp_stateid, NFS4_COPYNOTIFY_STID)) goto out_free; spin_lock(&nn->s2s_cp_lock); list_add(&cps->cp_list, &p_stid->sc_cp_list); spin_unlock(&nn->s2s_cp_lock); return cps; out_free: kfree(cps); return NULL; } void nfs4_free_copy_state(struct nfsd4_copy *copy) { struct nfsd_net *nn; if (copy->cp_stateid.cs_type != NFS4_COPY_STID) return; nn = net_generic(copy->cp_clp->net, nfsd_net_id); spin_lock(&nn->s2s_cp_lock); idr_remove(&nn->s2s_cp_stateids, copy->cp_stateid.cs_stid.si_opaque.so_id); spin_unlock(&nn->s2s_cp_lock); } static void nfs4_free_cpntf_statelist(struct net *net, struct nfs4_stid *stid) { struct nfs4_cpntf_state *cps; struct nfsd_net *nn; nn = net_generic(net, nfsd_net_id); spin_lock(&nn->s2s_cp_lock); while (!list_empty(&stid->sc_cp_list)) { cps = list_first_entry(&stid->sc_cp_list, struct nfs4_cpntf_state, cp_list); _free_cpntf_state_locked(nn, cps); } spin_unlock(&nn->s2s_cp_lock); } static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) { struct nfs4_stid *stid; stid = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_ol_stateid); if (!stid) return NULL; return openlockstateid(stid); } static void nfs4_free_deleg(struct nfs4_stid *stid) { struct nfs4_delegation *dp = delegstateid(stid); WARN_ON_ONCE(!list_empty(&stid->sc_cp_list)); WARN_ON_ONCE(!list_empty(&dp->dl_perfile)); WARN_ON_ONCE(!list_empty(&dp->dl_perclnt)); WARN_ON_ONCE(!list_empty(&dp->dl_recall_lru)); kmem_cache_free(deleg_slab, stid); atomic_long_dec(&num_delegations); } /* * When we recall a delegation, we should be careful not to hand it * out again straight away. * To ensure this we keep a pair of bloom filters ('new' and 'old') * in which the filehandles of recalled delegations are "stored". * If a filehandle appear in either filter, a delegation is blocked. * When a delegation is recalled, the filehandle is stored in the "new" * filter. * Every 30 seconds we swap the filters and clear the "new" one, * unless both are empty of course. This results in delegations for a * given filehandle being blocked for between 30 and 60 seconds. * * Each filter is 256 bits. We hash the filehandle to 32bit and use the * low 3 bytes as hash-table indices. * * 'blocked_delegations_lock', which is always taken in block_delegations(), * is used to manage concurrent access. Testing does not need the lock * except when swapping the two filters. */ static DEFINE_SPINLOCK(blocked_delegations_lock); static struct bloom_pair { int entries, old_entries; time64_t swap_time; int new; /* index into 'set' */ DECLARE_BITMAP(set[2], 256); } blocked_delegations; static int delegation_blocked(struct knfsd_fh *fh) { u32 hash; struct bloom_pair *bd = &blocked_delegations; if (bd->entries == 0) return 0; if (ktime_get_seconds() - bd->swap_time > 30) { spin_lock(&blocked_delegations_lock); if (ktime_get_seconds() - bd->swap_time > 30) { bd->entries -= bd->old_entries; bd->old_entries = bd->entries; bd->new = 1-bd->new; memset(bd->set[bd->new], 0, sizeof(bd->set[0])); bd->swap_time = ktime_get_seconds(); } spin_unlock(&blocked_delegations_lock); } hash = jhash(&fh->fh_raw, fh->fh_size, 0); if (test_bit(hash&255, bd->set[0]) && test_bit((hash>>8)&255, bd->set[0]) && test_bit((hash>>16)&255, bd->set[0])) return 1; if (test_bit(hash&255, bd->set[1]) && test_bit((hash>>8)&255, bd->set[1]) && test_bit((hash>>16)&255, bd->set[1])) return 1; return 0; } static void block_delegations(struct knfsd_fh *fh) { u32 hash; struct bloom_pair *bd = &blocked_delegations; hash = jhash(&fh->fh_raw, fh->fh_size, 0); spin_lock(&blocked_delegations_lock); __set_bit(hash&255, bd->set[bd->new]); __set_bit((hash>>8)&255, bd->set[bd->new]); __set_bit((hash>>16)&255, bd->set[bd->new]); if (bd->entries == 0) bd->swap_time = ktime_get_seconds(); bd->entries += 1; spin_unlock(&blocked_delegations_lock); } static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate, u32 dl_type) { struct nfs4_delegation *dp; struct nfs4_stid *stid; long n; dprintk("NFSD alloc_init_deleg\n"); n = atomic_long_inc_return(&num_delegations); if (n < 0 || n > max_delegations) goto out_dec; if (delegation_blocked(&fp->fi_fhandle)) goto out_dec; stid = nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg); if (stid == NULL) goto out_dec; dp = delegstateid(stid); /* * delegation seqid's are never incremented. The 4.1 special * meaning of seqid 0 isn't meaningful, really, but let's avoid * 0 anyway just for consistency and use 1: */ dp->dl_stid.sc_stateid.si_generation = 1; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); dp->dl_clnt_odstate = odstate; get_clnt_odstate(odstate); dp->dl_type = dl_type; dp->dl_retries = 1; dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client, &nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR); dp->dl_cb_fattr.ncf_file_modified = false; get_nfs4_file(fp); dp->dl_stid.sc_file = fp; return dp; out_dec: atomic_long_dec(&num_delegations); return NULL; } void nfs4_put_stid(struct nfs4_stid *s) { struct nfs4_file *fp = s->sc_file; struct nfs4_client *clp = s->sc_client; might_lock(&clp->cl_lock); if (!refcount_dec_and_lock(&s->sc_count, &clp->cl_lock)) { wake_up_all(&close_wq); return; } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); if (s->sc_status & SC_STATUS_ADMIN_REVOKED) atomic_dec(&s->sc_client->cl_admin_revoked); nfs4_free_cpntf_statelist(clp->net, s); spin_unlock(&clp->cl_lock); s->sc_free(s); if (fp) put_nfs4_file(fp); } void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid) { stateid_t *src = &stid->sc_stateid; spin_lock(&stid->sc_lock); if (unlikely(++src->si_generation == 0)) src->si_generation = 1; memcpy(dst, src, sizeof(*dst)); spin_unlock(&stid->sc_lock); } static void put_deleg_file(struct nfs4_file *fp) { struct nfsd_file *nf = NULL; spin_lock(&fp->fi_lock); if (--fp->fi_delegees == 0) swap(nf, fp->fi_deleg_file); spin_unlock(&fp->fi_lock); if (nf) nfsd_file_put(nf); } static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; struct nfsd_file *nf = fp->fi_deleg_file; WARN_ON_ONCE(!fp->fi_delegees); kernel_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); put_deleg_file(fp); } static void destroy_unhashed_deleg(struct nfs4_delegation *dp) { put_clnt_odstate(dp->dl_clnt_odstate); nfs4_unlock_deleg_lease(dp); nfs4_put_stid(&dp->dl_stid); } /** * nfs4_delegation_exists - Discover if this delegation already exists * @clp: a pointer to the nfs4_client we're granting a delegation to * @fp: a pointer to the nfs4_file we're granting a delegation on * * Return: * On success: true iff an existing delegation is found */ static bool nfs4_delegation_exists(struct nfs4_client *clp, struct nfs4_file *fp) { struct nfs4_delegation *searchdp = NULL; struct nfs4_client *searchclp = NULL; lockdep_assert_held(&state_lock); lockdep_assert_held(&fp->fi_lock); list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) { searchclp = searchdp->dl_stid.sc_client; if (clp == searchclp) { return true; } } return false; } /** * hash_delegation_locked - Add a delegation to the appropriate lists * @dp: a pointer to the nfs4_delegation we are adding. * @fp: a pointer to the nfs4_file we're granting a delegation on * * Return: * On success: NULL if the delegation was successfully hashed. * * On error: -EAGAIN if one was previously granted to this * nfs4_client for this nfs4_file. Delegation is not hashed. * */ static int hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) { struct nfs4_client *clp = dp->dl_stid.sc_client; lockdep_assert_held(&state_lock); lockdep_assert_held(&fp->fi_lock); lockdep_assert_held(&clp->cl_lock); if (nfs4_delegation_exists(clp, fp)) return -EAGAIN; refcount_inc(&dp->dl_stid.sc_count); dp->dl_stid.sc_type = SC_TYPE_DELEG; list_add(&dp->dl_perfile, &fp->fi_delegations); list_add(&dp->dl_perclnt, &clp->cl_delegations); return 0; } static bool delegation_hashed(struct nfs4_delegation *dp) { return !(list_empty(&dp->dl_perfile)); } static bool unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask) { struct nfs4_file *fp = dp->dl_stid.sc_file; lockdep_assert_held(&state_lock); if (!delegation_hashed(dp)) return false; if (statusmask == SC_STATUS_REVOKED && dp->dl_stid.sc_client->cl_minorversion == 0) statusmask = SC_STATUS_CLOSED; dp->dl_stid.sc_status |= statusmask; if (statusmask & SC_STATUS_ADMIN_REVOKED) atomic_inc(&dp->dl_stid.sc_client->cl_admin_revoked); /* Ensure that deleg break won't try to requeue it */ ++dp->dl_time; spin_lock(&fp->fi_lock); list_del_init(&dp->dl_perclnt); list_del_init(&dp->dl_recall_lru); list_del_init(&dp->dl_perfile); spin_unlock(&fp->fi_lock); return true; } static void destroy_delegation(struct nfs4_delegation *dp) { bool unhashed; spin_lock(&state_lock); unhashed = unhash_delegation_locked(dp, SC_STATUS_CLOSED); spin_unlock(&state_lock); if (unhashed) destroy_unhashed_deleg(dp); } /** * revoke_delegation - perform nfs4 delegation structure cleanup * @dp: pointer to the delegation * * This function assumes that it's called either from the administrative * interface (nfsd4_revoke_states()) that's revoking a specific delegation * stateid or it's called from a laundromat thread (nfsd4_landromat()) that * determined that this specific state has expired and needs to be revoked * (both mark state with the appropriate stid sc_status mode). It is also * assumed that a reference was taken on the @dp state. * * If this function finds that the @dp state is SC_STATUS_FREED it means * that a FREE_STATEID operation for this stateid has been processed and * we can proceed to removing it from recalled list. However, if @dp state * isn't marked SC_STATUS_FREED, it means we need place it on the cl_revoked * list and wait for the FREE_STATEID to arrive from the client. At the same * time, we need to mark it as SC_STATUS_FREEABLE to indicate to the * nfsd4_free_stateid() function that this stateid has already been added * to the cl_revoked list and that nfsd4_free_stateid() is now responsible * for removing it from the list. Inspection of where the delegation state * in the revocation process is protected by the clp->cl_lock. */ static void revoke_delegation(struct nfs4_delegation *dp) { struct nfs4_client *clp = dp->dl_stid.sc_client; WARN_ON(!list_empty(&dp->dl_recall_lru)); WARN_ON_ONCE(!(dp->dl_stid.sc_status & (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED))); trace_nfsd_stid_revoke(&dp->dl_stid); spin_lock(&clp->cl_lock); if (dp->dl_stid.sc_status & SC_STATUS_FREED) { list_del_init(&dp->dl_recall_lru); goto out; } list_add(&dp->dl_recall_lru, &clp->cl_revoked); dp->dl_stid.sc_status |= SC_STATUS_FREEABLE; out: spin_unlock(&clp->cl_lock); destroy_unhashed_deleg(dp); } /* * SETCLIENTID state */ static unsigned int clientid_hashval(u32 id) { return id & CLIENT_HASH_MASK; } static unsigned int clientstr_hashval(struct xdr_netobj name) { return opaque_hashval(name.data, 8) & CLIENT_HASH_MASK; } /* * A stateid that had a deny mode associated with it is being released * or downgraded. Recalculate the deny mode on the file. */ static void recalculate_deny_mode(struct nfs4_file *fp) { struct nfs4_ol_stateid *stp; u32 old_deny; spin_lock(&fp->fi_lock); old_deny = fp->fi_share_deny; fp->fi_share_deny = 0; list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap); if (fp->fi_share_deny == old_deny) break; } spin_unlock(&fp->fi_lock); } static void reset_union_bmap_deny(u32 deny, struct nfs4_ol_stateid *stp) { int i; bool change = false; for (i = 1; i < 4; i++) { if ((i & deny) != i) { change = true; clear_deny(i, stp); } } /* Recalculate per-file deny mode if there was a change */ if (change) recalculate_deny_mode(stp->st_stid.sc_file); } /* release all access and file references for a given stateid */ static void release_all_access(struct nfs4_ol_stateid *stp) { int i; struct nfs4_file *fp = stp->st_stid.sc_file; if (fp && stp->st_deny_bmap != 0) recalculate_deny_mode(fp); for (i = 1; i < 4; i++) { if (test_access(i, stp)) nfs4_file_put_access(stp->st_stid.sc_file, i); clear_access(i, stp); } } static inline void nfs4_free_stateowner(struct nfs4_stateowner *sop) { kfree(sop->so_owner.data); sop->so_ops->so_free(sop); } static void nfs4_put_stateowner(struct nfs4_stateowner *sop) { struct nfs4_client *clp = sop->so_client; might_lock(&clp->cl_lock); if (!atomic_dec_and_lock(&sop->so_count, &clp->cl_lock)) return; sop->so_ops->so_unhash(sop); spin_unlock(&clp->cl_lock); nfs4_free_stateowner(sop); } static bool nfs4_ol_stateid_unhashed(const struct nfs4_ol_stateid *stp) { return list_empty(&stp->st_perfile); } static bool unhash_ol_stateid(struct nfs4_ol_stateid *stp) { struct nfs4_file *fp = stp->st_stid.sc_file; lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock); if (list_empty(&stp->st_perfile)) return false; spin_lock(&fp->fi_lock); list_del_init(&stp->st_perfile); spin_unlock(&fp->fi_lock); list_del(&stp->st_perstateowner); return true; } static void nfs4_free_ol_stateid(struct nfs4_stid *stid) { struct nfs4_ol_stateid *stp = openlockstateid(stid); put_clnt_odstate(stp->st_clnt_odstate); release_all_access(stp); if (stp->st_stateowner) nfs4_put_stateowner(stp->st_stateowner); WARN_ON(!list_empty(&stid->sc_cp_list)); kmem_cache_free(stateid_slab, stid); } static void nfs4_free_lock_stateid(struct nfs4_stid *stid) { struct nfs4_ol_stateid *stp = openlockstateid(stid); struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); struct nfsd_file *nf; nf = find_any_file(stp->st_stid.sc_file); if (nf) { get_file(nf->nf_file); filp_close(nf->nf_file, (fl_owner_t)lo); nfsd_file_put(nf); } nfs4_free_ol_stateid(stid); } /* * Put the persistent reference to an already unhashed generic stateid, while * holding the cl_lock. If it's the last reference, then put it onto the * reaplist for later destruction. */ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, struct list_head *reaplist) { struct nfs4_stid *s = &stp->st_stid; struct nfs4_client *clp = s->sc_client; lockdep_assert_held(&clp->cl_lock); WARN_ON_ONCE(!list_empty(&stp->st_locks)); if (!refcount_dec_and_test(&s->sc_count)) { wake_up_all(&close_wq); return; } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); if (s->sc_status & SC_STATUS_ADMIN_REVOKED) atomic_dec(&s->sc_client->cl_admin_revoked); list_add(&stp->st_locks, reaplist); } static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) { lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); if (!unhash_ol_stateid(stp)) return false; list_del_init(&stp->st_locks); stp->st_stid.sc_status |= SC_STATUS_CLOSED; return true; } static void release_lock_stateid(struct nfs4_ol_stateid *stp) { struct nfs4_client *clp = stp->st_stid.sc_client; bool unhashed; spin_lock(&clp->cl_lock); unhashed = unhash_lock_stateid(stp); spin_unlock(&clp->cl_lock); if (unhashed) nfs4_put_stid(&stp->st_stid); } static void unhash_lockowner_locked(struct nfs4_lockowner *lo) { struct nfs4_client *clp = lo->lo_owner.so_client; lockdep_assert_held(&clp->cl_lock); list_del_init(&lo->lo_owner.so_strhash); } /* * Free a list of generic stateids that were collected earlier after being * fully unhashed. */ static void free_ol_stateid_reaplist(struct list_head *reaplist) { struct nfs4_ol_stateid *stp; struct nfs4_file *fp; might_sleep(); while (!list_empty(reaplist)) { stp = list_first_entry(reaplist, struct nfs4_ol_stateid, st_locks); list_del(&stp->st_locks); fp = stp->st_stid.sc_file; stp->st_stid.sc_free(&stp->st_stid); if (fp) put_nfs4_file(fp); } } static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, struct list_head *reaplist) { struct nfs4_ol_stateid *stp; lockdep_assert_held(&open_stp->st_stid.sc_client->cl_lock); while (!list_empty(&open_stp->st_locks)) { stp = list_entry(open_stp->st_locks.next, struct nfs4_ol_stateid, st_locks); unhash_lock_stateid(stp); put_ol_stateid_locked(stp, reaplist); } } static bool unhash_open_stateid(struct nfs4_ol_stateid *stp, struct list_head *reaplist) { lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); if (!unhash_ol_stateid(stp)) return false; release_open_stateid_locks(stp, reaplist); return true; } static void release_open_stateid(struct nfs4_ol_stateid *stp) { LIST_HEAD(reaplist); spin_lock(&stp->st_stid.sc_client->cl_lock); stp->st_stid.sc_status |= SC_STATUS_CLOSED; if (unhash_open_stateid(stp, &reaplist)) put_ol_stateid_locked(stp, &reaplist); spin_unlock(&stp->st_stid.sc_client->cl_lock); free_ol_stateid_reaplist(&reaplist); } static bool nfs4_openowner_unhashed(struct nfs4_openowner *oo) { lockdep_assert_held(&oo->oo_owner.so_client->cl_lock); return list_empty(&oo->oo_owner.so_strhash) && list_empty(&oo->oo_perclient); } static void unhash_openowner_locked(struct nfs4_openowner *oo) { struct nfs4_client *clp = oo->oo_owner.so_client; lockdep_assert_held(&clp->cl_lock); list_del_init(&oo->oo_owner.so_strhash); list_del_init(&oo->oo_perclient); } static void release_last_closed_stateid(struct nfs4_openowner *oo) { struct nfsd_net *nn = net_generic(oo->oo_owner.so_client->net, nfsd_net_id); struct nfs4_ol_stateid *s; spin_lock(&nn->client_lock); s = oo->oo_last_closed_stid; if (s) { list_del_init(&oo->oo_close_lru); oo->oo_last_closed_stid = NULL; } spin_unlock(&nn->client_lock); if (s) nfs4_put_stid(&s->st_stid); } static void release_openowner(struct nfs4_openowner *oo) { struct nfs4_ol_stateid *stp; struct nfs4_client *clp = oo->oo_owner.so_client; LIST_HEAD(reaplist); spin_lock(&clp->cl_lock); unhash_openowner_locked(oo); while (!list_empty(&oo->oo_owner.so_stateids)) { stp = list_first_entry(&oo->oo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); if (unhash_open_stateid(stp, &reaplist)) put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); release_last_closed_stateid(oo); nfs4_put_stateowner(&oo->oo_owner); } static struct nfs4_stid *find_one_sb_stid(struct nfs4_client *clp, struct super_block *sb, unsigned int sc_types) { unsigned long id, tmp; struct nfs4_stid *stid; spin_lock(&clp->cl_lock); idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) if ((stid->sc_type & sc_types) && stid->sc_status == 0 && stid->sc_file->fi_inode->i_sb == sb) { refcount_inc(&stid->sc_count); break; } spin_unlock(&clp->cl_lock); return stid; } /** * nfsd4_revoke_states - revoke all nfsv4 states associated with given filesystem * @net: used to identify instance of nfsd (there is one per net namespace) * @sb: super_block used to identify target filesystem * * All nfs4 states (open, lock, delegation, layout) held by the server instance * and associated with a file on the given filesystem will be revoked resulting * in any files being closed and so all references from nfsd to the filesystem * being released. Thus nfsd will no longer prevent the filesystem from being * unmounted. * * The clients which own the states will subsequently being notified that the * states have been "admin-revoked". */ void nfsd4_revoke_states(struct net *net, struct super_block *sb) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); unsigned int idhashval; unsigned int sc_types; sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG | SC_TYPE_LAYOUT; spin_lock(&nn->client_lock); for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { struct list_head *head = &nn->conf_id_hashtbl[idhashval]; struct nfs4_client *clp; retry: list_for_each_entry(clp, head, cl_idhash) { struct nfs4_stid *stid = find_one_sb_stid(clp, sb, sc_types); if (stid) { struct nfs4_ol_stateid *stp; struct nfs4_delegation *dp; struct nfs4_layout_stateid *ls; spin_unlock(&nn->client_lock); switch (stid->sc_type) { case SC_TYPE_OPEN: stp = openlockstateid(stid); mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); spin_lock(&clp->cl_lock); if (stid->sc_status == 0) { stid->sc_status |= SC_STATUS_ADMIN_REVOKED; atomic_inc(&clp->cl_admin_revoked); spin_unlock(&clp->cl_lock); release_all_access(stp); } else spin_unlock(&clp->cl_lock); mutex_unlock(&stp->st_mutex); break; case SC_TYPE_LOCK: stp = openlockstateid(stid); mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); spin_lock(&clp->cl_lock); if (stid->sc_status == 0) { struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); struct nfsd_file *nf; stid->sc_status |= SC_STATUS_ADMIN_REVOKED; atomic_inc(&clp->cl_admin_revoked); spin_unlock(&clp->cl_lock); nf = find_any_file(stp->st_stid.sc_file); if (nf) { get_file(nf->nf_file); filp_close(nf->nf_file, (fl_owner_t)lo); nfsd_file_put(nf); } release_all_access(stp); } else spin_unlock(&clp->cl_lock); mutex_unlock(&stp->st_mutex); break; case SC_TYPE_DELEG: refcount_inc(&stid->sc_count); dp = delegstateid(stid); spin_lock(&state_lock); if (!unhash_delegation_locked( dp, SC_STATUS_ADMIN_REVOKED)) dp = NULL; spin_unlock(&state_lock); if (dp) revoke_delegation(dp); break; case SC_TYPE_LAYOUT: ls = layoutstateid(stid); nfsd4_close_layout(ls); break; } nfs4_put_stid(stid); spin_lock(&nn->client_lock); if (clp->cl_minorversion == 0) /* Allow cleanup after a lease period. * store_release ensures cleanup will * see any newly revoked states if it * sees the time updated. */ nn->nfs40_last_revoke = ktime_get_boottime_seconds(); goto retry; } } } spin_unlock(&nn->client_lock); } static inline int hash_sessionid(struct nfs4_sessionid *sessionid) { struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid; return sid->sequence % SESSION_HASH_SIZE; } #ifdef CONFIG_SUNRPC_DEBUG static inline void dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) { u32 *ptr = (u32 *)(&sessionid->data[0]); dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]); } #else static inline void dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) { } #endif /* * Bump the seqid on cstate->replay_owner, and clear replay_owner if it * won't be used for replay. */ void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr) { struct nfs4_stateowner *so = cstate->replay_owner; if (nfserr == nfserr_replay_me) return; if (!seqid_mutating_err(ntohl(nfserr))) { nfsd4_cstate_clear_replay(cstate); return; } if (!so) return; if (so->so_is_open_owner) release_last_closed_stateid(openowner(so)); so->so_seqid++; return; } static void gen_sessionid(struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; struct nfsd4_sessionid *sid; sid = (struct nfsd4_sessionid *)ses->se_sessionid.data; sid->clientid = clp->cl_clientid; sid->sequence = current_sessionid++; sid->reserved = 0; } /* * The protocol defines ca_maxresponssize_cached to include the size of * the rpc header, but all we need to cache is the data starting after * the end of the initial SEQUENCE operation--the rest we regenerate * each time. Therefore we can advertise a ca_maxresponssize_cached * value that is the number of bytes in our cache plus a few additional * bytes. In order to stay on the safe side, and not promise more than * we can cache, those additional bytes must be the minimum possible: 24 * bytes of rpc header (xid through accept state, with AUTH_NULL * verifier), 12 for the compound header (with zero-length tag), and 44 * for the SEQUENCE op response: */ #define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) static struct shrinker *nfsd_slot_shrinker; static DEFINE_SPINLOCK(nfsd_session_list_lock); static LIST_HEAD(nfsd_session_list); /* The sum of "target_slots-1" on every session. The shrinker can push this * down, though it can take a little while for the memory to actually * be freed. The "-1" is because we can never free slot 0 while the * session is active. */ static atomic_t nfsd_total_target_slots = ATOMIC_INIT(0); static void free_session_slots(struct nfsd4_session *ses, int from) { int i; if (from >= ses->se_fchannel.maxreqs) return; for (i = from; i < ses->se_fchannel.maxreqs; i++) { struct nfsd4_slot *slot = xa_load(&ses->se_slots, i); /* * Save the seqid in case we reactivate this slot. * This will never require a memory allocation so GFP * flag is irrelevant */ xa_store(&ses->se_slots, i, xa_mk_value(slot->sl_seqid), 0); free_svc_cred(&slot->sl_cred); kfree(slot); } ses->se_fchannel.maxreqs = from; if (ses->se_target_maxslots > from) { int new_target = from ?: 1; atomic_sub(ses->se_target_maxslots - new_target, &nfsd_total_target_slots); ses->se_target_maxslots = new_target; } } /** * reduce_session_slots - reduce the target max-slots of a session if possible * @ses: The session to affect * @dec: how much to decrease the target by * * This interface can be used by a shrinker to reduce the target max-slots * for a session so that some slots can eventually be freed. * It uses spin_trylock() as it may be called in a context where another * spinlock is held that has a dependency on client_lock. As shrinkers are * best-effort, skiping a session is client_lock is already held has no * great coast * * Return value: * The number of slots that the target was reduced by. */ static int reduce_session_slots(struct nfsd4_session *ses, int dec) { struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id); int ret = 0; if (ses->se_target_maxslots <= 1) return ret; if (!spin_trylock(&nn->client_lock)) return ret; ret = min(dec, ses->se_target_maxslots-1); ses->se_target_maxslots -= ret; atomic_sub(ret, &nfsd_total_target_slots); ses->se_slot_gen += 1; if (ses->se_slot_gen == 0) { int i; ses->se_slot_gen = 1; for (i = 0; i < ses->se_fchannel.maxreqs; i++) { struct nfsd4_slot *slot = xa_load(&ses->se_slots, i); slot->sl_generation = 0; } } spin_unlock(&nn->client_lock); return ret; } /* * We don't actually need to cache the rpc and session headers, so we * can allocate a little less for each slot: */ static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) { u32 size; if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ) size = 0; else size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; return size + sizeof(struct nfsd4_slot); } static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, struct nfsd4_channel_attrs *battrs) { int numslots = fattrs->maxreqs; int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; struct nfsd4_slot *slot; int i; new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; xa_init(&new->se_slots); /* allocate each struct nfsd4_slot and data cache in one piece */ slot = kzalloc(slotsize, GFP_KERNEL); if (!slot || xa_is_err(xa_store(&new->se_slots, 0, slot, GFP_KERNEL))) goto out_free; for (i = 1; i < numslots; i++) { const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; slot = kzalloc(slotsize, gfp); if (!slot) break; if (xa_is_err(xa_store(&new->se_slots, i, slot, gfp))) { kfree(slot); break; } } fattrs->maxreqs = i; memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); new->se_target_maxslots = i; atomic_add(i - 1, &nfsd_total_target_slots); new->se_cb_slot_avail = ~0U; new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_SIZE - 1); spin_lock_init(&new->se_lock); return new; out_free: kfree(slot); xa_destroy(&new->se_slots); kfree(new); return NULL; } static void free_conn(struct nfsd4_conn *c) { svc_xprt_put(c->cn_xprt); kfree(c); } static void nfsd4_conn_lost(struct svc_xpt_user *u) { struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user); struct nfs4_client *clp = c->cn_session->se_client; trace_nfsd_cb_lost(clp); spin_lock(&clp->cl_lock); if (!list_empty(&c->cn_persession)) { list_del(&c->cn_persession); free_conn(c); } nfsd4_probe_callback(clp); spin_unlock(&clp->cl_lock); } static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) { struct nfsd4_conn *conn; conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL); if (!conn) return NULL; svc_xprt_get(rqstp->rq_xprt); conn->cn_xprt = rqstp->rq_xprt; conn->cn_flags = flags; INIT_LIST_HEAD(&conn->cn_xpt_user.list); return conn; } static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) { conn->cn_session = ses; list_add(&conn->cn_persession, &ses->se_conns); } static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; spin_lock(&clp->cl_lock); __nfsd4_hash_conn(conn, ses); spin_unlock(&clp->cl_lock); } static int nfsd4_register_conn(struct nfsd4_conn *conn) { conn->cn_xpt_user.callback = nfsd4_conn_lost; return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); } static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn, struct nfsd4_session *ses) { int ret; nfsd4_hash_conn(conn, ses); ret = nfsd4_register_conn(conn); if (ret) /* oops; xprt is already down: */ nfsd4_conn_lost(&conn->cn_xpt_user); /* We may have gained or lost a callback channel: */ nfsd4_probe_callback_sync(ses->se_client); } static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses) { u32 dir = NFS4_CDFC4_FORE; if (cses->flags & SESSION4_BACK_CHAN) dir |= NFS4_CDFC4_BACK; return alloc_conn(rqstp, dir); } /* must be called under client_lock */ static void nfsd4_del_conns(struct nfsd4_session *s) { struct nfs4_client *clp = s->se_client; struct nfsd4_conn *c; spin_lock(&clp->cl_lock); while (!list_empty(&s->se_conns)) { c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession); list_del_init(&c->cn_persession); spin_unlock(&clp->cl_lock); unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user); free_conn(c); spin_lock(&clp->cl_lock); } spin_unlock(&clp->cl_lock); } static void __free_session(struct nfsd4_session *ses) { free_session_slots(ses, 0); xa_destroy(&ses->se_slots); kfree(ses); } static void free_session(struct nfsd4_session *ses) { nfsd4_del_conns(ses); __free_session(ses); } static unsigned long nfsd_slot_count(struct shrinker *s, struct shrink_control *sc) { unsigned long cnt = atomic_read(&nfsd_total_target_slots); return cnt ? cnt : SHRINK_EMPTY; } static unsigned long nfsd_slot_scan(struct shrinker *s, struct shrink_control *sc) { struct nfsd4_session *ses; unsigned long scanned = 0; unsigned long freed = 0; spin_lock(&nfsd_session_list_lock); list_for_each_entry(ses, &nfsd_session_list, se_all_sessions) { freed += reduce_session_slots(ses, 1); scanned += 1; if (scanned >= sc->nr_to_scan) { /* Move starting point for next scan */ list_move(&nfsd_session_list, &ses->se_all_sessions); break; } } spin_unlock(&nfsd_session_list_lock); sc->nr_scanned = scanned; return freed; } static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) { int idx; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); new->se_client = clp; gen_sessionid(new); INIT_LIST_HEAD(&new->se_conns); atomic_set(&new->se_ref, 0); new->se_dead = false; new->se_cb_prog = cses->callback_prog; new->se_cb_sec = cses->cb_sec; for (idx = 0; idx < NFSD_BC_SLOT_TABLE_SIZE; ++idx) new->se_cb_seq_nr[idx] = 1; idx = hash_sessionid(&new->se_sessionid); list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]); spin_lock(&clp->cl_lock); list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); spin_lock(&nfsd_session_list_lock); list_add_tail(&new->se_all_sessions, &nfsd_session_list); spin_unlock(&nfsd_session_list_lock); { struct sockaddr *sa = svc_addr(rqstp); /* * This is a little silly; with sessions there's no real * use for the callback address. Use the peer address * as a reasonable default for now, but consider fixing * the rpc client not to require an address in the * future: */ rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); } } /* caller must hold client_lock */ static struct nfsd4_session * __find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net) { struct nfsd4_session *elem; int idx; struct nfsd_net *nn = net_generic(net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); dump_sessionid(__func__, sessionid); idx = hash_sessionid(sessionid); /* Search in the appropriate list */ list_for_each_entry(elem, &nn->sessionid_hashtbl[idx], se_hash) { if (!memcmp(elem->se_sessionid.data, sessionid->data, NFS4_MAX_SESSIONID_LEN)) { return elem; } } dprintk("%s: session not found\n", __func__); return NULL; } static struct nfsd4_session * find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net, __be32 *ret) { struct nfsd4_session *session; __be32 status = nfserr_badsession; session = __find_in_sessionid_hashtbl(sessionid, net); if (!session) goto out; status = nfsd4_get_session_locked(session); if (status) session = NULL; out: *ret = status; return session; } /* caller must hold client_lock */ static void unhash_session(struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); list_del(&ses->se_hash); spin_lock(&ses->se_client->cl_lock); list_del(&ses->se_perclnt); spin_unlock(&ses->se_client->cl_lock); spin_lock(&nfsd_session_list_lock); list_del(&ses->se_all_sessions); spin_unlock(&nfsd_session_list_lock); } /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ static int STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) { /* * We're assuming the clid was not given out from a boot * precisely 2^32 (about 136 years) before this one. That seems * a safe assumption: */ if (clid->cl_boot == (u32)nn->boot_time) return 0; trace_nfsd_clid_stale(clid); return 1; } static struct nfs4_client *alloc_client(struct xdr_netobj name, struct nfsd_net *nn) { struct nfs4_client *clp; int i; if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients && atomic_read(&nn->nfsd_courtesy_clients) > 0) mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); clp = kmem_cache_zalloc(client_slab, GFP_KERNEL); if (clp == NULL) return NULL; xdr_netobj_dup(&clp->cl_name, &name, GFP_KERNEL); if (clp->cl_name.data == NULL) goto err_no_name; clp->cl_ownerstr_hashtbl = kmalloc_array(OWNER_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!clp->cl_ownerstr_hashtbl) goto err_no_hashtbl; clp->cl_callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0); if (!clp->cl_callback_wq) goto err_no_callback_wq; for (i = 0; i < OWNER_HASH_SIZE; i++) INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]); INIT_LIST_HEAD(&clp->cl_sessions); idr_init(&clp->cl_stateids); atomic_set(&clp->cl_rpc_users, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; clp->cl_state = NFSD4_ACTIVE; atomic_inc(&nn->nfs4_client_count); atomic_set(&clp->cl_delegs_in_recall, 0); INIT_LIST_HEAD(&clp->cl_idhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); INIT_LIST_HEAD(&clp->cl_revoked); #ifdef CONFIG_NFSD_PNFS INIT_LIST_HEAD(&clp->cl_lo_states); #endif INIT_LIST_HEAD(&clp->async_copies); spin_lock_init(&clp->async_lock); spin_lock_init(&clp->cl_lock); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; err_no_callback_wq: kfree(clp->cl_ownerstr_hashtbl); err_no_hashtbl: kfree(clp->cl_name.data); err_no_name: kmem_cache_free(client_slab, clp); return NULL; } static void __free_client(struct kref *k) { struct nfsdfs_client *c = container_of(k, struct nfsdfs_client, cl_ref); struct nfs4_client *clp = container_of(c, struct nfs4_client, cl_nfsdfs); free_svc_cred(&clp->cl_cred); destroy_workqueue(clp->cl_callback_wq); kfree(clp->cl_ownerstr_hashtbl); kfree(clp->cl_name.data); kfree(clp->cl_nii_domain.data); kfree(clp->cl_nii_name.data); idr_destroy(&clp->cl_stateids); kfree(clp->cl_ra); kmem_cache_free(client_slab, clp); } static void drop_client(struct nfs4_client *clp) { kref_put(&clp->cl_nfsdfs.cl_ref, __free_client); } static void free_client(struct nfs4_client *clp) { while (!list_empty(&clp->cl_sessions)) { struct nfsd4_session *ses; ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, se_perclnt); list_del(&ses->se_perclnt); WARN_ON_ONCE(atomic_read(&ses->se_ref)); free_session(ses); } rpc_destroy_wait_queue(&clp->cl_cb_waitq); if (clp->cl_nfsd_dentry) { nfsd_client_rmdir(clp->cl_nfsd_dentry); clp->cl_nfsd_dentry = NULL; wake_up_all(&expiry_wq); } drop_client(clp); } /* must be called under the client_lock */ static void unhash_client_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct nfsd4_session *ses; lockdep_assert_held(&nn->client_lock); /* Mark the client as expired! */ clp->cl_time = 0; /* Make it invisible */ if (!list_empty(&clp->cl_idhash)) { list_del_init(&clp->cl_idhash); if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) rb_erase(&clp->cl_namenode, &nn->conf_name_tree); else rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); } list_del_init(&clp->cl_lru); spin_lock(&clp->cl_lock); spin_lock(&nfsd_session_list_lock); list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) { list_del_init(&ses->se_hash); list_del_init(&ses->se_all_sessions); } spin_unlock(&nfsd_session_list_lock); spin_unlock(&clp->cl_lock); } static void unhash_client(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); spin_lock(&nn->client_lock); unhash_client_locked(clp); spin_unlock(&nn->client_lock); } static __be32 mark_client_expired_locked(struct nfs4_client *clp) { int users = atomic_read(&clp->cl_rpc_users); trace_nfsd_mark_client_expired(clp, users); if (users) return nfserr_jukebox; unhash_client_locked(clp); return nfs_ok; } static void __destroy_client(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); int i; struct nfs4_openowner *oo; struct nfs4_delegation *dp; LIST_HEAD(reaplist); spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); destroy_unhashed_deleg(dp); } while (!list_empty(&clp->cl_revoked)) { dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); nfs4_put_stid(&dp->dl_stid); } while (!list_empty(&clp->cl_openowners)) { oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); nfs4_get_stateowner(&oo->oo_owner); release_openowner(oo); } for (i = 0; i < OWNER_HASH_SIZE; i++) { struct nfs4_stateowner *so, *tmp; list_for_each_entry_safe(so, tmp, &clp->cl_ownerstr_hashtbl[i], so_strhash) { /* Should be no openowners at this point */ WARN_ON_ONCE(so->so_is_open_owner); remove_blocked_locks(lockowner(so)); } } nfsd4_return_all_client_layouts(clp); nfsd4_shutdown_copy(clp); nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); atomic_add_unless(&nn->nfs4_client_count, -1, 0); nfsd4_dec_courtesy_client_count(nn, clp); free_client(clp); wake_up_all(&expiry_wq); } static void destroy_client(struct nfs4_client *clp) { unhash_client(clp); __destroy_client(clp); } static void inc_reclaim_complete(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (!nn->track_reclaim_completes) return; if (!nfsd4_find_reclaim_client(clp->cl_name, nn)) return; if (atomic_inc_return(&nn->nr_reclaim_complete) == nn->reclaim_str_hashtbl_size) { printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n", clp->net->ns.inum); nfsd4_end_grace(nn); } } static void expire_client(struct nfs4_client *clp) { unhash_client(clp); nfsd4_client_record_remove(clp); __destroy_client(clp); } static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) { memcpy(target->cl_verifier.data, source->data, sizeof(target->cl_verifier.data)); } static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) { target->cl_clientid.cl_boot = source->cl_clientid.cl_boot; target->cl_clientid.cl_id = source->cl_clientid.cl_id; } static int copy_cred(struct svc_cred *target, struct svc_cred *source) { target->cr_principal = kstrdup(source->cr_principal, GFP_KERNEL); target->cr_raw_principal = kstrdup(source->cr_raw_principal, GFP_KERNEL); target->cr_targ_princ = kstrdup(source->cr_targ_princ, GFP_KERNEL); if ((source->cr_principal && !target->cr_principal) || (source->cr_raw_principal && !target->cr_raw_principal) || (source->cr_targ_princ && !target->cr_targ_princ)) return -ENOMEM; target->cr_flavor = source->cr_flavor; target->cr_uid = source->cr_uid; target->cr_gid = source->cr_gid; target->cr_group_info = source->cr_group_info; get_group_info(target->cr_group_info); target->cr_gss_mech = source->cr_gss_mech; if (source->cr_gss_mech) gss_mech_get(source->cr_gss_mech); return 0; } static int compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2) { if (o1->len < o2->len) return -1; if (o1->len > o2->len) return 1; return memcmp(o1->data, o2->data, o1->len); } static int same_verf(nfs4_verifier *v1, nfs4_verifier *v2) { return 0 == memcmp(v1->data, v2->data, sizeof(v1->data)); } static int same_clid(clientid_t *cl1, clientid_t *cl2) { return (cl1->cl_boot == cl2->cl_boot) && (cl1->cl_id == cl2->cl_id); } static bool groups_equal(struct group_info *g1, struct group_info *g2) { int i; if (g1->ngroups != g2->ngroups) return false; for (i=0; i<g1->ngroups; i++) if (!gid_eq(g1->gid[i], g2->gid[i])) return false; return true; } /* * RFC 3530 language requires clid_inuse be returned when the * "principal" associated with a requests differs from that previously * used. We use uid, gid's, and gss principal string as our best * approximation. We also don't want to allow non-gss use of a client * established using gss: in theory cr_principal should catch that * change, but in practice cr_principal can be null even in the gss case * since gssd doesn't always pass down a principal string. */ static bool is_gss_cred(struct svc_cred *cr) { /* Is cr_flavor one of the gss "pseudoflavors"?: */ return (cr->cr_flavor > RPC_AUTH_MAXFLAVOR); } static bool same_creds(struct svc_cred *cr1, struct svc_cred *cr2) { if ((is_gss_cred(cr1) != is_gss_cred(cr2)) || (!uid_eq(cr1->cr_uid, cr2->cr_uid)) || (!gid_eq(cr1->cr_gid, cr2->cr_gid)) || !groups_equal(cr1->cr_group_info, cr2->cr_group_info)) return false; /* XXX: check that cr_targ_princ fields match ? */ if (cr1->cr_principal == cr2->cr_principal) return true; if (!cr1->cr_principal || !cr2->cr_principal) return false; return 0 == strcmp(cr1->cr_principal, cr2->cr_principal); } static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp) { struct svc_cred *cr = &rqstp->rq_cred; u32 service; if (!cr->cr_gss_mech) return false; service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor); return service == RPC_GSS_SVC_INTEGRITY || service == RPC_GSS_SVC_PRIVACY; } bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp) { struct svc_cred *cr = &rqstp->rq_cred; if (!cl->cl_mach_cred) return true; if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech) return false; if (!svc_rqst_integrity_protected(rqstp)) return false; if (cl->cl_cred.cr_raw_principal) return 0 == strcmp(cl->cl_cred.cr_raw_principal, cr->cr_raw_principal); if (!cr->cr_principal) return false; return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal); } static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn) { __be32 verf[2]; /* * This is opaque to client, so no need to byte-swap. Use * __force to keep sparse happy */ verf[0] = (__force __be32)(u32)ktime_get_real_seconds(); verf[1] = (__force __be32)nn->clverifier_counter++; memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data)); } static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) { clp->cl_clientid.cl_boot = (u32)nn->boot_time; clp->cl_clientid.cl_id = nn->clientid_counter++; gen_confirm(clp, nn); } static struct nfs4_stid * find_stateid_locked(struct nfs4_client *cl, stateid_t *t) { struct nfs4_stid *ret; ret = idr_find(&cl->cl_stateids, t->si_opaque.so_id); if (!ret || !ret->sc_type) return NULL; return ret; } static struct nfs4_stid * find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, unsigned short typemask, unsigned short ok_states) { struct nfs4_stid *s; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, t); if (s != NULL) { if ((s->sc_status & ~ok_states) == 0 && (typemask & s->sc_type)) refcount_inc(&s->sc_count); else s = NULL; } spin_unlock(&cl->cl_lock); return s; } static struct nfs4_client *get_nfsdfs_clp(struct inode *inode) { struct nfsdfs_client *nc; nc = get_nfsdfs_client(inode); if (!nc) return NULL; return container_of(nc, struct nfs4_client, cl_nfsdfs); } static void seq_quote_mem(struct seq_file *m, char *data, int len) { seq_puts(m, "\""); seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\"); seq_puts(m, "\""); } static const char *cb_state2str(int state) { switch (state) { case NFSD4_CB_UP: return "UP"; case NFSD4_CB_UNKNOWN: return "UNKNOWN"; case NFSD4_CB_DOWN: return "DOWN"; case NFSD4_CB_FAULT: return "FAULT"; } return "UNDEFINED"; } static int client_info_show(struct seq_file *m, void *v) { struct inode *inode = file_inode(m->file); struct nfsd4_session *ses; struct nfs4_client *clp; u64 clid; clp = get_nfsdfs_clp(inode); if (!clp) return -ENXIO; memcpy(&clid, &clp->cl_clientid, sizeof(clid)); seq_printf(m, "clientid: 0x%llx\n", clid); seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr); if (clp->cl_state == NFSD4_COURTESY) seq_puts(m, "status: courtesy\n"); else if (clp->cl_state == NFSD4_EXPIRABLE) seq_puts(m, "status: expirable\n"); else if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) seq_puts(m, "status: confirmed\n"); else seq_puts(m, "status: unconfirmed\n"); seq_printf(m, "seconds from last renew: %lld\n", ktime_get_boottime_seconds() - clp->cl_time); seq_puts(m, "name: "); seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); if (clp->cl_nii_domain.data) { seq_puts(m, "Implementation domain: "); seq_quote_mem(m, clp->cl_nii_domain.data, clp->cl_nii_domain.len); seq_puts(m, "\nImplementation name: "); seq_quote_mem(m, clp->cl_nii_name.data, clp->cl_nii_name.len); seq_printf(m, "\nImplementation time: [%lld, %ld]\n", clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec); } seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state)); seq_printf(m, "callback address: \"%pISpc\"\n", &clp->cl_cb_conn.cb_addr); seq_printf(m, "admin-revoked states: %d\n", atomic_read(&clp->cl_admin_revoked)); spin_lock(&clp->cl_lock); seq_printf(m, "session slots:"); list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) seq_printf(m, " %u", ses->se_fchannel.maxreqs); seq_printf(m, "\nsession target slots:"); list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) seq_printf(m, " %u", ses->se_target_maxslots); spin_unlock(&clp->cl_lock); seq_puts(m, "\n"); drop_client(clp); return 0; } DEFINE_SHOW_ATTRIBUTE(client_info); static void *states_start(struct seq_file *s, loff_t *pos) __acquires(&clp->cl_lock) { struct nfs4_client *clp = s->private; unsigned long id = *pos; void *ret; spin_lock(&clp->cl_lock); ret = idr_get_next_ul(&clp->cl_stateids, &id); *pos = id; return ret; } static void *states_next(struct seq_file *s, void *v, loff_t *pos) { struct nfs4_client *clp = s->private; unsigned long id = *pos; void *ret; id = *pos; id++; ret = idr_get_next_ul(&clp->cl_stateids, &id); *pos = id; return ret; } static void states_stop(struct seq_file *s, void *v) __releases(&clp->cl_lock) { struct nfs4_client *clp = s->private; spin_unlock(&clp->cl_lock); } static void nfs4_show_fname(struct seq_file *s, struct nfsd_file *f) { seq_printf(s, "filename: \"%pD2\"", f->nf_file); } static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) { struct inode *inode = file_inode(f->nf_file); seq_printf(s, "superblock: \"%02x:%02x:%ld\"", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); } static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo) { seq_puts(s, "owner: "); seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len); } static void nfs4_show_stateid(struct seq_file *s, stateid_t *stid) { seq_printf(s, "0x%.8x", stid->si_generation); seq_printf(s, "%12phN", &stid->si_opaque); } static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_ol_stateid *ols; struct nfs4_file *nf; struct nfsd_file *file; struct nfs4_stateowner *oo; unsigned int access, deny; ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); seq_puts(s, ": { type: open, "); access = bmap_to_share_mode(ols->st_access_bmap); deny = bmap_to_share_mode(ols->st_deny_bmap); seq_printf(s, "access: %s%s, ", access & NFS4_SHARE_ACCESS_READ ? "r" : "-", access & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); seq_printf(s, "deny: %s%s, ", deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); if (nf) { spin_lock(&nf->fi_lock); file = find_any_file_locked(nf); if (file) { nfs4_show_superblock(s, file); seq_puts(s, ", "); nfs4_show_fname(s, file); seq_puts(s, ", "); } spin_unlock(&nf->fi_lock); } else seq_puts(s, "closed, "); nfs4_show_owner(s, oo); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); return 0; } static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_ol_stateid *ols; struct nfs4_file *nf; struct nfsd_file *file; struct nfs4_stateowner *oo; ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); seq_puts(s, ": { type: lock, "); spin_lock(&nf->fi_lock); file = find_any_file_locked(nf); if (file) { /* * Note: a lock stateid isn't really the same thing as a lock, * it's the locking state held by one owner on a file, and there * may be multiple (or no) lock ranges associated with it. * (Same for the matter is true of open stateids.) */ nfs4_show_superblock(s, file); /* XXX: open stateid? */ seq_puts(s, ", "); nfs4_show_fname(s, file); seq_puts(s, ", "); } nfs4_show_owner(s, oo); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); spin_unlock(&nf->fi_lock); return 0; } static char *nfs4_show_deleg_type(u32 dl_type) { switch (dl_type) { case OPEN_DELEGATE_READ: return "r"; case OPEN_DELEGATE_WRITE: return "w"; case OPEN_DELEGATE_READ_ATTRS_DELEG: return "ra"; case OPEN_DELEGATE_WRITE_ATTRS_DELEG: return "wa"; } return "?"; } static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_delegation *ds; struct nfs4_file *nf; struct nfsd_file *file; ds = delegstateid(st); nf = st->sc_file; seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); seq_puts(s, ": { type: deleg, "); seq_printf(s, "access: %s", nfs4_show_deleg_type(ds->dl_type)); /* XXX: lease time, whether it's being recalled. */ spin_lock(&nf->fi_lock); file = nf->fi_deleg_file; if (file) { seq_puts(s, ", "); nfs4_show_superblock(s, file); seq_puts(s, ", "); nfs4_show_fname(s, file); } spin_unlock(&nf->fi_lock); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); return 0; } static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_layout_stateid *ls; struct nfsd_file *file; ls = container_of(st, struct nfs4_layout_stateid, ls_stid); seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); seq_puts(s, ": { type: layout"); /* XXX: What else would be useful? */ spin_lock(&ls->ls_stid.sc_file->fi_lock); file = ls->ls_file; if (file) { seq_puts(s, ", "); nfs4_show_superblock(s, file); seq_puts(s, ", "); nfs4_show_fname(s, file); } spin_unlock(&ls->ls_stid.sc_file->fi_lock); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); return 0; } static int states_show(struct seq_file *s, void *v) { struct nfs4_stid *st = v; switch (st->sc_type) { case SC_TYPE_OPEN: return nfs4_show_open(s, st); case SC_TYPE_LOCK: return nfs4_show_lock(s, st); case SC_TYPE_DELEG: return nfs4_show_deleg(s, st); case SC_TYPE_LAYOUT: return nfs4_show_layout(s, st); default: return 0; /* XXX: or SEQ_SKIP? */ } /* XXX: copy stateids? */ } static struct seq_operations states_seq_ops = { .start = states_start, .next = states_next, .stop = states_stop, .show = states_show }; static int client_states_open(struct inode *inode, struct file *file) { struct seq_file *s; struct nfs4_client *clp; int ret; clp = get_nfsdfs_clp(inode); if (!clp) return -ENXIO; ret = seq_open(file, &states_seq_ops); if (ret) return ret; s = file->private_data; s->private = clp; return 0; } static int client_opens_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; struct nfs4_client *clp = m->private; /* XXX: alternatively, we could get/drop in seq start/stop */ drop_client(clp); return seq_release(inode, file); } static const struct file_operations client_states_fops = { .open = client_states_open, .read = seq_read, .llseek = seq_lseek, .release = client_opens_release, }; /* * Normally we refuse to destroy clients that are in use, but here the * administrator is telling us to just do it. We also want to wait * so the caller has a guarantee that the client's locks are gone by * the time the write returns: */ static void force_expire_client(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); bool already_expired; trace_nfsd_clid_admin_expired(&clp->cl_clientid); spin_lock(&nn->client_lock); clp->cl_time = 0; spin_unlock(&nn->client_lock); wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0); spin_lock(&nn->client_lock); already_expired = list_empty(&clp->cl_lru); if (!already_expired) unhash_client_locked(clp); spin_unlock(&nn->client_lock); if (!already_expired) expire_client(clp); else wait_event(expiry_wq, clp->cl_nfsd_dentry == NULL); } static ssize_t client_ctl_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) { char *data; struct nfs4_client *clp; data = simple_transaction_get(file, buf, size); if (IS_ERR(data)) return PTR_ERR(data); if (size != 7 || 0 != memcmp(data, "expire\n", 7)) return -EINVAL; clp = get_nfsdfs_clp(file_inode(file)); if (!clp) return -ENXIO; force_expire_client(clp); drop_client(clp); return 7; } static const struct file_operations client_ctl_fops = { .write = client_ctl_write, .release = simple_transaction_release, }; static const struct tree_descr client_files[] = { [0] = {"info", &client_info_fops, S_IRUSR}, [1] = {"states", &client_states_fops, S_IRUSR}, [2] = {"ctl", &client_ctl_fops, S_IWUSR}, [3] = {""}, }; static int nfsd4_cb_recall_any_done(struct nfsd4_callback *cb, struct rpc_task *task) { trace_nfsd_cb_recall_any_done(cb, task); switch (task->tk_status) { case -NFS4ERR_DELAY: rpc_delay(task, 2 * HZ); return 0; default: return 1; } } static void nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); drop_client(clp); } static int nfsd4_cb_getattr_done(struct nfsd4_callback *cb, struct rpc_task *task) { struct nfs4_cb_fattr *ncf = container_of(cb, struct nfs4_cb_fattr, ncf_getattr); struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); trace_nfsd_cb_getattr_done(&dp->dl_stid.sc_stateid, task); ncf->ncf_cb_status = task->tk_status; switch (task->tk_status) { case -NFS4ERR_DELAY: rpc_delay(task, 2 * HZ); return 0; default: return 1; } } static void nfsd4_cb_getattr_release(struct nfsd4_callback *cb) { struct nfs4_cb_fattr *ncf = container_of(cb, struct nfs4_cb_fattr, ncf_getattr); struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); clear_and_wake_up_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags); nfs4_put_stid(&dp->dl_stid); } static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { .done = nfsd4_cb_recall_any_done, .release = nfsd4_cb_recall_any_release, .opcode = OP_CB_RECALL_ANY, }; static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = { .done = nfsd4_cb_getattr_done, .release = nfsd4_cb_getattr_release, .opcode = OP_CB_GETATTR, }; static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf) { struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags)) return; /* set to proper status when nfsd4_cb_getattr_done runs */ ncf->ncf_cb_status = NFS4ERR_IO; refcount_inc(&dp->dl_stid.sc_count); nfsd4_run_cb(&ncf->ncf_getattr); } static struct nfs4_client *create_client(struct xdr_netobj name, struct svc_rqst *rqstp, nfs4_verifier *verf) { struct nfs4_client *clp; struct sockaddr *sa = svc_addr(rqstp); int ret; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct dentry *dentries[ARRAY_SIZE(client_files)]; clp = alloc_client(name, nn); if (clp == NULL) return NULL; ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); if (ret) { free_client(clp); return NULL; } gen_clid(clp, nn); kref_init(&clp->cl_nfsdfs.cl_ref); nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL); clp->cl_time = ktime_get_boottime_seconds(); copy_verf(clp, verf); memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage)); clp->cl_cb_session = NULL; clp->net = net; clp->cl_nfsd_dentry = nfsd_client_mkdir( nn, &clp->cl_nfsdfs, clp->cl_clientid.cl_id - nn->clientid_base, client_files, dentries); clp->cl_nfsd_info_dentry = dentries[0]; if (!clp->cl_nfsd_dentry) { free_client(clp); return NULL; } clp->cl_ra = kzalloc(sizeof(*clp->cl_ra), GFP_KERNEL); if (!clp->cl_ra) { free_client(clp); return NULL; } clp->cl_ra_time = 0; nfsd4_init_cb(&clp->cl_ra->ra_cb, clp, &nfsd4_cb_recall_any_ops, NFSPROC4_CLNT_CB_RECALL_ANY); return clp; } static void add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root) { struct rb_node **new = &(root->rb_node), *parent = NULL; struct nfs4_client *clp; while (*new) { clp = rb_entry(*new, struct nfs4_client, cl_namenode); parent = *new; if (compare_blob(&clp->cl_name, &new_clp->cl_name) > 0) new = &((*new)->rb_left); else new = &((*new)->rb_right); } rb_link_node(&new_clp->cl_namenode, parent, new); rb_insert_color(&new_clp->cl_namenode, root); } static struct nfs4_client * find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root) { int cmp; struct rb_node *node = root->rb_node; struct nfs4_client *clp; while (node) { clp = rb_entry(node, struct nfs4_client, cl_namenode); cmp = compare_blob(&clp->cl_name, name); if (cmp > 0) node = node->rb_left; else if (cmp < 0) node = node->rb_right; else return clp; } return NULL; } static void add_to_unconfirmed(struct nfs4_client *clp) { unsigned int idhashval; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); add_clp_to_name_tree(clp, &nn->unconf_name_tree); idhashval = clientid_hashval(clp->cl_clientid.cl_id); list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]); renew_client_locked(clp); } static void move_to_confirmed(struct nfs4_client *clp) { unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); add_clp_to_name_tree(clp, &nn->conf_name_tree); set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); trace_nfsd_clid_confirmed(&clp->cl_clientid); renew_client_locked(clp); } static struct nfs4_client * find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions) { struct nfs4_client *clp; unsigned int idhashval = clientid_hashval(clid->cl_id); list_for_each_entry(clp, &tbl[idhashval], cl_idhash) { if (same_clid(&clp->cl_clientid, clid)) { if ((bool)clp->cl_minorversion != sessions) return NULL; renew_client_locked(clp); return clp; } } return NULL; } static struct nfs4_client * find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) { struct list_head *tbl = nn->conf_id_hashtbl; lockdep_assert_held(&nn->client_lock); return find_client_in_id_table(tbl, clid, sessions); } static struct nfs4_client * find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) { struct list_head *tbl = nn->unconf_id_hashtbl; lockdep_assert_held(&nn->client_lock); return find_client_in_id_table(tbl, clid, sessions); } static bool clp_used_exchangeid(struct nfs4_client *clp) { return clp->cl_exchange_flags != 0; } static struct nfs4_client * find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) { lockdep_assert_held(&nn->client_lock); return find_clp_in_name_tree(name, &nn->conf_name_tree); } static struct nfs4_client * find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) { lockdep_assert_held(&nn->client_lock); return find_clp_in_name_tree(name, &nn->unconf_name_tree); } static void gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) { struct nfs4_cb_conn *conn = &clp->cl_cb_conn; struct sockaddr *sa = svc_addr(rqstp); u32 scopeid = rpc_get_scope_id(sa); unsigned short expected_family; /* Currently, we only support tcp and tcp6 for the callback channel */ if (se->se_callback_netid_len == 3 && !memcmp(se->se_callback_netid_val, "tcp", 3)) expected_family = AF_INET; else if (se->se_callback_netid_len == 4 && !memcmp(se->se_callback_netid_val, "tcp6", 4)) expected_family = AF_INET6; else goto out_err; conn->cb_addrlen = rpc_uaddr2sockaddr(clp->net, se->se_callback_addr_val, se->se_callback_addr_len, (struct sockaddr *)&conn->cb_addr, sizeof(conn->cb_addr)); if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family) goto out_err; if (conn->cb_addr.ss_family == AF_INET6) ((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid; conn->cb_prog = se->se_callback_prog; conn->cb_ident = se->se_callback_ident; memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen); trace_nfsd_cb_args(clp, conn); return; out_err: conn->cb_addr.ss_family = AF_UNSPEC; conn->cb_addrlen = 0; trace_nfsd_cb_nodelegs(clp); return; } /* * Cache a reply. nfsd4_check_resp_size() has bounded the cache size. */ static void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) { struct xdr_buf *buf = resp->xdr->buf; struct nfsd4_slot *slot = resp->cstate.slot; unsigned int base; dprintk("--> %s slot %p\n", __func__, slot); slot->sl_flags |= NFSD4_SLOT_INITIALIZED; slot->sl_opcnt = resp->opcnt; slot->sl_status = resp->cstate.status; free_svc_cred(&slot->sl_cred); copy_cred(&slot->sl_cred, &resp->rqstp->rq_cred); if (!nfsd4_cache_this(resp)) { slot->sl_flags &= ~NFSD4_SLOT_CACHED; return; } slot->sl_flags |= NFSD4_SLOT_CACHED; base = resp->cstate.data_offset; slot->sl_datalen = buf->len - base; if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen)) WARN(1, "%s: sessions DRC could not cache compound\n", __func__); return; } /* * Encode the replay sequence operation from the slot values. * If cachethis is FALSE encode the uncached rep error on the next * operation which sets resp->p and increments resp->opcnt for * nfs4svc_encode_compoundres. * */ static __be32 nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, struct nfsd4_compoundres *resp) { struct nfsd4_op *op; struct nfsd4_slot *slot = resp->cstate.slot; /* Encode the replayed sequence operation */ op = &args->ops[resp->opcnt - 1]; nfsd4_encode_operation(resp, op); if (slot->sl_flags & NFSD4_SLOT_CACHED) return op->status; if (args->opcnt == 1) { /* * The original operation wasn't a solo sequence--we * always cache those--so this retry must not match the * original: */ op->status = nfserr_seq_false_retry; } else { op = &args->ops[resp->opcnt++]; op->status = nfserr_retry_uncached_rep; nfsd4_encode_operation(resp, op); } return op->status; } /* * The sequence operation is not cached because we can use the slot and * session values. */ static __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq) { struct nfsd4_slot *slot = resp->cstate.slot; struct xdr_stream *xdr = resp->xdr; __be32 *p; __be32 status; dprintk("--> %s slot %p\n", __func__, slot); status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); if (status) return status; p = xdr_reserve_space(xdr, slot->sl_datalen); if (!p) { WARN_ON_ONCE(1); return nfserr_serverfault; } xdr_encode_opaque_fixed(p, slot->sl_data, slot->sl_datalen); xdr_commit_encode(xdr); resp->opcnt = slot->sl_opcnt; return slot->sl_status; } /* * Set the exchange_id flags returned by the server. */ static void nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) { #ifdef CONFIG_NFSD_PNFS new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS; #else new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; #endif /* Referrals are supported, Migration is not. */ new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; /* set the wire flags to return to client. */ clid->flags = new->cl_exchange_flags; } static bool client_has_openowners(struct nfs4_client *clp) { struct nfs4_openowner *oo; list_for_each_entry(oo, &clp->cl_openowners, oo_perclient) { if (!list_empty(&oo->oo_owner.so_stateids)) return true; } return false; } static bool client_has_state(struct nfs4_client *clp) { return client_has_openowners(clp) #ifdef CONFIG_NFSD_PNFS || !list_empty(&clp->cl_lo_states) #endif || !list_empty(&clp->cl_delegations) || !list_empty(&clp->cl_sessions) || nfsd4_has_active_async_copies(clp); } static __be32 copy_impl_id(struct nfs4_client *clp, struct nfsd4_exchange_id *exid) { if (!exid->nii_domain.data) return 0; xdr_netobj_dup(&clp->cl_nii_domain, &exid->nii_domain, GFP_KERNEL); if (!clp->cl_nii_domain.data) return nfserr_jukebox; xdr_netobj_dup(&clp->cl_nii_name, &exid->nii_name, GFP_KERNEL); if (!clp->cl_nii_name.data) return nfserr_jukebox; clp->cl_nii_time = exid->nii_time; return 0; } __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_exchange_id *exid = &u->exchange_id; struct nfs4_client *conf, *new; struct nfs4_client *unconf = NULL; __be32 status; char addr_str[INET6_ADDRSTRLEN]; nfs4_verifier verf = exid->verifier; struct sockaddr *sa = svc_addr(rqstp); bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); rpc_ntop(sa, addr_str, sizeof(addr_str)); dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " "ip_addr=%s flags %x, spa_how %u\n", __func__, rqstp, exid, exid->clname.len, exid->clname.data, addr_str, exid->flags, exid->spa_how); exid->server_impl_name = kasprintf(GFP_KERNEL, "%s %s %s %s", utsname()->sysname, utsname()->release, utsname()->version, utsname()->machine); if (!exid->server_impl_name) return nfserr_jukebox; if (exid->flags & ~EXCHGID4_FLAG_MASK_A) return nfserr_inval; new = create_client(exid->clname, rqstp, &verf); if (new == NULL) return nfserr_jukebox; status = copy_impl_id(new, exid); if (status) goto out_nolock; switch (exid->spa_how) { case SP4_MACH_CRED: exid->spo_must_enforce[0] = 0; exid->spo_must_enforce[1] = ( 1 << (OP_BIND_CONN_TO_SESSION - 32) | 1 << (OP_EXCHANGE_ID - 32) | 1 << (OP_CREATE_SESSION - 32) | 1 << (OP_DESTROY_SESSION - 32) | 1 << (OP_DESTROY_CLIENTID - 32)); exid->spo_must_allow[0] &= (1 << (OP_CLOSE) | 1 << (OP_OPEN_DOWNGRADE) | 1 << (OP_LOCKU) | 1 << (OP_DELEGRETURN)); exid->spo_must_allow[1] &= ( 1 << (OP_TEST_STATEID - 32) | 1 << (OP_FREE_STATEID - 32)); if (!svc_rqst_integrity_protected(rqstp)) { status = nfserr_inval; goto out_nolock; } /* * Sometimes userspace doesn't give us a principal. * Which is a bug, really. Anyway, we can't enforce * MACH_CRED in that case, better to give up now: */ if (!new->cl_cred.cr_principal && !new->cl_cred.cr_raw_principal) { status = nfserr_serverfault; goto out_nolock; } new->cl_mach_cred = true; break; case SP4_NONE: break; default: /* checked by xdr code */ WARN_ON_ONCE(1); fallthrough; case SP4_SSV: status = nfserr_encr_alg_unsupp; goto out_nolock; } /* Cases below refer to rfc 5661 section 18.35.4: */ spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&exid->clname, nn); if (conf) { bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred); bool verfs_match = same_verf(&verf, &conf->cl_verifier); if (update) { if (!clp_used_exchangeid(conf)) { /* buggy client */ status = nfserr_inval; goto out; } if (!nfsd4_mach_creds_match(conf, rqstp)) { status = nfserr_wrong_cred; goto out; } if (!creds_match) { /* case 9 */ status = nfserr_perm; goto out; } if (!verfs_match) { /* case 8 */ status = nfserr_not_same; goto out; } /* case 6 */ exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; trace_nfsd_clid_confirmed_r(conf); goto out_copy; } if (!creds_match) { /* case 3 */ if (client_has_state(conf)) { status = nfserr_clid_inuse; trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; } goto out_new; } if (verfs_match) { /* case 2 */ conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; trace_nfsd_clid_confirmed_r(conf); goto out_copy; } /* case 5, client reboot */ trace_nfsd_clid_verf_mismatch(conf, rqstp, &verf); conf = NULL; goto out_new; } if (update) { /* case 7 */ status = nfserr_noent; goto out; } unconf = find_unconfirmed_client_by_name(&exid->clname, nn); if (unconf) /* case 4, possible retry or client restart */ unhash_client_locked(unconf); /* case 1, new owner ID */ trace_nfsd_clid_fresh(new); out_new: if (conf) { status = mark_client_expired_locked(conf); if (status) goto out; trace_nfsd_clid_replaced(&conf->cl_clientid); } new->cl_minorversion = cstate->minorversion; new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0]; new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1]; /* Contrived initial CREATE_SESSION response */ new->cl_cs_slot.sl_status = nfserr_seq_misordered; add_to_unconfirmed(new); swap(new, conf); out_copy: exid->clientid.cl_boot = conf->cl_clientid.cl_boot; exid->clientid.cl_id = conf->cl_clientid.cl_id; exid->seqid = conf->cl_cs_slot.sl_seqid + 1; nfsd4_set_ex_flags(conf, exid); exid->nii_domain.len = sizeof("kernel.org") - 1; exid->nii_domain.data = "kernel.org"; /* * Note that RFC 8881 places no length limit on * nii_name, but this implementation permits no * more than NFS4_OPAQUE_LIMIT bytes. */ exid->nii_name.len = strlen(exid->server_impl_name); if (exid->nii_name.len > NFS4_OPAQUE_LIMIT) exid->nii_name.len = NFS4_OPAQUE_LIMIT; exid->nii_name.data = exid->server_impl_name; /* just send zeros - the date is in nii_name */ exid->nii_time.tv_sec = 0; exid->nii_time.tv_nsec = 0; dprintk("nfsd4_exchange_id seqid %d flags %x\n", conf->cl_cs_slot.sl_seqid, conf->cl_exchange_flags); status = nfs_ok; out: spin_unlock(&nn->client_lock); out_nolock: if (new) expire_client(new); if (unconf) { trace_nfsd_clid_expire_unconf(&unconf->cl_clientid); expire_client(unconf); } return status; } void nfsd4_exchange_id_release(union nfsd4_op_u *u) { struct nfsd4_exchange_id *exid = &u->exchange_id; kfree(exid->server_impl_name); } static __be32 check_slot_seqid(u32 seqid, u32 slot_seqid, u8 flags) { /* The slot is in use, and no response has been sent. */ if (flags & NFSD4_SLOT_INUSE) { if (seqid == slot_seqid) return nfserr_jukebox; else return nfserr_seq_misordered; } /* Note unsigned 32-bit arithmetic handles wraparound: */ if (likely(seqid == slot_seqid + 1)) return nfs_ok; if ((flags & NFSD4_SLOT_REUSED) && seqid == 1) return nfs_ok; if (seqid == slot_seqid) return nfserr_replay_cache; return nfserr_seq_misordered; } /* * Cache the create session result into the create session single DRC * slot cache by saving the xdr structure. sl_seqid has been set. * Do this for solo or embedded create session operations. */ static void nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses, struct nfsd4_clid_slot *slot, __be32 nfserr) { slot->sl_status = nfserr; memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses)); } static __be32 nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, struct nfsd4_clid_slot *slot) { memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses)); return slot->sl_status; } #define NFSD_MIN_REQ_HDR_SEQ_SZ ((\ 2 * 2 + /* credential,verifier: AUTH_NULL, length 0 */ \ 1 + /* MIN tag is length with zero, only length */ \ 3 + /* version, opcount, opcode */ \ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ /* seqid, slotID, slotID, cache */ \ 4 ) * sizeof(__be32)) #define NFSD_MIN_RESP_HDR_SEQ_SZ ((\ 2 + /* verifier: AUTH_NULL, length 0 */\ 1 + /* status */ \ 1 + /* MIN tag is length with zero, only length */ \ 3 + /* opcount, opcode, opstatus*/ \ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ /* seqid, slotID, slotID, slotID, status */ \ 5 ) * sizeof(__be32)) static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn) { u32 maxrpc = nn->nfsd_serv->sv_max_mesg; if (ca->maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ) return nfserr_toosmall; if (ca->maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ) return nfserr_toosmall; ca->headerpadsz = 0; ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc); ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc); ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND); ca->maxresp_cached = min_t(u32, ca->maxresp_cached, NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ); ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION); return nfs_ok; } /* * Server's NFSv4.1 backchannel support is AUTH_SYS-only for now. * These are based on similar macros in linux/sunrpc/msg_prot.h . */ #define RPC_MAX_HEADER_WITH_AUTH_SYS \ (RPC_CALLHDRSIZE + 2 * (2 + UNX_CALLSLACK)) #define RPC_MAX_REPHEADER_WITH_AUTH_SYS \ (RPC_REPHDRSIZE + (2 + NUL_REPLYSLACK)) #define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \ RPC_MAX_HEADER_WITH_AUTH_SYS) * sizeof(__be32)) #define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \ RPC_MAX_REPHEADER_WITH_AUTH_SYS) * \ sizeof(__be32)) static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) { ca->headerpadsz = 0; if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ) return nfserr_toosmall; if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ) return nfserr_toosmall; ca->maxresp_cached = 0; if (ca->maxops < 2) return nfserr_toosmall; return nfs_ok; } static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs) { switch (cbs->flavor) { case RPC_AUTH_NULL: case RPC_AUTH_UNIX: return nfs_ok; default: /* * GSS case: the spec doesn't allow us to return this * error. But it also doesn't allow us not to support * GSS. * I'd rather this fail hard than return some error the * client might think it can already handle: */ return nfserr_encr_alg_unsupp; } } __be32 nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_create_session *cr_ses = &u->create_session; struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; struct nfsd4_clid_slot *cs_slot; struct nfs4_client *old = NULL; struct nfsd4_session *new; struct nfsd4_conn *conn; __be32 status = 0; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) return nfserr_inval; status = nfsd4_check_cb_sec(&cr_ses->cb_sec); if (status) return status; status = check_forechannel_attrs(&cr_ses->fore_channel, nn); if (status) return status; status = check_backchannel_attrs(&cr_ses->back_channel); if (status) goto out_err; status = nfserr_jukebox; new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel); if (!new) goto out_err; conn = alloc_conn_from_crses(rqstp, cr_ses); if (!conn) goto out_free_session; spin_lock(&nn->client_lock); /* RFC 8881 Section 18.36.4 Phase 1: Client record look-up. */ unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); conf = find_confirmed_client(&cr_ses->clientid, true, nn); if (!conf && !unconf) { status = nfserr_stale_clientid; goto out_free_conn; } /* RFC 8881 Section 18.36.4 Phase 2: Sequence ID processing. */ if (conf) { cs_slot = &conf->cl_cs_slot; trace_nfsd_slot_seqid_conf(conf, cr_ses); } else { cs_slot = &unconf->cl_cs_slot; trace_nfsd_slot_seqid_unconf(unconf, cr_ses); } status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); switch (status) { case nfs_ok: cs_slot->sl_seqid++; cr_ses->seqid = cs_slot->sl_seqid; break; case nfserr_replay_cache: status = nfsd4_replay_create_session(cr_ses, cs_slot); fallthrough; case nfserr_jukebox: /* The server MUST NOT cache NFS4ERR_DELAY */ goto out_free_conn; default: goto out_cache_error; } /* RFC 8881 Section 18.36.4 Phase 3: Client ID confirmation. */ if (conf) { status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(conf, rqstp)) goto out_cache_error; } else { status = nfserr_clid_inuse; if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { trace_nfsd_clid_cred_mismatch(unconf, rqstp); goto out_cache_error; } status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(unconf, rqstp)) goto out_cache_error; old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = mark_client_expired_locked(old); if (status) goto out_expired_error; trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); conf = unconf; } /* RFC 8881 Section 18.36.4 Phase 4: Session creation. */ status = nfs_ok; /* Persistent sessions are not supported */ cr_ses->flags &= ~SESSION4_PERSIST; /* Upshifting from TCP to RDMA is not supported */ cr_ses->flags &= ~SESSION4_RDMA; /* Report the correct number of backchannel slots */ cr_ses->back_channel.maxreqs = new->se_cb_highest_slot + 1; init_session(rqstp, new, conf, cr_ses); nfsd4_get_session_locked(new); memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); /* cache solo and embedded create sessions under the client_lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); spin_unlock(&nn->client_lock); if (conf == unconf) fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY); /* init connection and backchannel */ nfsd4_init_conn(rqstp, conn, new); nfsd4_put_session(new); if (old) expire_client(old); return status; out_expired_error: /* * Revert the slot seq_nr change so the server will process * the client's resend instead of returning a cached response. */ if (status == nfserr_jukebox) { cs_slot->sl_seqid--; cr_ses->seqid = cs_slot->sl_seqid; goto out_free_conn; } out_cache_error: nfsd4_cache_create_session(cr_ses, cs_slot, status); out_free_conn: spin_unlock(&nn->client_lock); free_conn(conn); out_free_session: __free_session(new); out_err: return status; } static __be32 nfsd4_map_bcts_dir(u32 *dir) { switch (*dir) { case NFS4_CDFC4_FORE: case NFS4_CDFC4_BACK: return nfs_ok; case NFS4_CDFC4_FORE_OR_BOTH: case NFS4_CDFC4_BACK_OR_BOTH: *dir = NFS4_CDFC4_BOTH; return nfs_ok; } return nfserr_inval; } __be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_backchannel_ctl *bc = &u->backchannel_ctl; struct nfsd4_session *session = cstate->session; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); __be32 status; status = nfsd4_check_cb_sec(&bc->bc_cb_sec); if (status) return status; spin_lock(&nn->client_lock); session->se_cb_prog = bc->bc_cb_program; session->se_cb_sec = bc->bc_cb_sec; spin_unlock(&nn->client_lock); nfsd4_probe_callback(session->se_client); return nfs_ok; } static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s) { struct nfsd4_conn *c; list_for_each_entry(c, &s->se_conns, cn_persession) { if (c->cn_xprt == xpt) { return c; } } return NULL; } static __be32 nfsd4_match_existing_connection(struct svc_rqst *rqst, struct nfsd4_session *session, u32 req, struct nfsd4_conn **conn) { struct nfs4_client *clp = session->se_client; struct svc_xprt *xpt = rqst->rq_xprt; struct nfsd4_conn *c; __be32 status; /* Following the last paragraph of RFC 5661 Section 18.34.3: */ spin_lock(&clp->cl_lock); c = __nfsd4_find_conn(xpt, session); if (!c) status = nfserr_noent; else if (req == c->cn_flags) status = nfs_ok; else if (req == NFS4_CDFC4_FORE_OR_BOTH && c->cn_flags != NFS4_CDFC4_BACK) status = nfs_ok; else if (req == NFS4_CDFC4_BACK_OR_BOTH && c->cn_flags != NFS4_CDFC4_FORE) status = nfs_ok; else status = nfserr_inval; spin_unlock(&clp->cl_lock); if (status == nfs_ok && conn) *conn = c; return status; } __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; __be32 status; struct nfsd4_conn *conn; struct nfsd4_session *session; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!nfsd4_last_compound_op(rqstp)) return nfserr_not_only_op; spin_lock(&nn->client_lock); session = find_in_sessionid_hashtbl(&bcts->sessionid, net, &status); spin_unlock(&nn->client_lock); if (!session) goto out_no_session; status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(session->se_client, rqstp)) goto out; status = nfsd4_match_existing_connection(rqstp, session, bcts->dir, &conn); if (status == nfs_ok) { if (bcts->dir == NFS4_CDFC4_FORE_OR_BOTH || bcts->dir == NFS4_CDFC4_BACK) conn->cn_flags |= NFS4_CDFC4_BACK; nfsd4_probe_callback(session->se_client); goto out; } if (status == nfserr_inval) goto out; status = nfsd4_map_bcts_dir(&bcts->dir); if (status) goto out; conn = alloc_conn(rqstp, bcts->dir); status = nfserr_jukebox; if (!conn) goto out; nfsd4_init_conn(rqstp, conn, session); status = nfs_ok; out: nfsd4_put_session(session); out_no_session: return status; } static bool nfsd4_compound_in_session(struct nfsd4_compound_state *cstate, struct nfs4_sessionid *sid) { if (!cstate->session) return false; return !memcmp(sid, &cstate->session->se_sessionid, sizeof(*sid)); } __be32 nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfs4_sessionid *sessionid = &u->destroy_session.sessionid; struct nfsd4_session *ses; __be32 status; int ref_held_by_me = 0; struct net *net = SVC_NET(r); struct nfsd_net *nn = net_generic(net, nfsd_net_id); status = nfserr_not_only_op; if (nfsd4_compound_in_session(cstate, sessionid)) { if (!nfsd4_last_compound_op(r)) goto out; ref_held_by_me++; } dump_sessionid(__func__, sessionid); spin_lock(&nn->client_lock); ses = find_in_sessionid_hashtbl(sessionid, net, &status); if (!ses) goto out_client_lock; status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(ses->se_client, r)) goto out_put_session; status = mark_session_dead_locked(ses, 1 + ref_held_by_me); if (status) goto out_put_session; unhash_session(ses); spin_unlock(&nn->client_lock); nfsd4_probe_callback_sync(ses->se_client); spin_lock(&nn->client_lock); status = nfs_ok; out_put_session: nfsd4_put_session_locked(ses); out_client_lock: spin_unlock(&nn->client_lock); out: return status; } static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; struct nfsd4_conn *c; __be32 status = nfs_ok; int ret; spin_lock(&clp->cl_lock); c = __nfsd4_find_conn(new->cn_xprt, ses); if (c) goto out_free; status = nfserr_conn_not_bound_to_session; if (clp->cl_mach_cred) goto out_free; __nfsd4_hash_conn(new, ses); spin_unlock(&clp->cl_lock); ret = nfsd4_register_conn(new); if (ret) /* oops; xprt is already down: */ nfsd4_conn_lost(&new->cn_xpt_user); return nfs_ok; out_free: spin_unlock(&clp->cl_lock); free_conn(new); return status; } static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session) { struct nfsd4_compoundargs *args = rqstp->rq_argp; return args->opcnt > session->se_fchannel.maxops; } static bool nfsd4_request_too_big(struct svc_rqst *rqstp, struct nfsd4_session *session) { struct xdr_buf *xb = &rqstp->rq_arg; return xb->len > session->se_fchannel.maxreq_sz; } static bool replay_matches_cache(struct svc_rqst *rqstp, struct nfsd4_sequence *seq, struct nfsd4_slot *slot) { struct nfsd4_compoundargs *argp = rqstp->rq_argp; if ((bool)(slot->sl_flags & NFSD4_SLOT_CACHETHIS) != (bool)seq->cachethis) return false; /* * If there's an error then the reply can have fewer ops than * the call. */ if (slot->sl_opcnt < argp->opcnt && !slot->sl_status) return false; /* * But if we cached a reply with *more* ops than the call you're * sending us now, then this new call is clearly not really a * replay of the old one: */ if (slot->sl_opcnt > argp->opcnt) return false; /* This is the only check explicitly called by spec: */ if (!same_creds(&rqstp->rq_cred, &slot->sl_cred)) return false; /* * There may be more comparisons we could actually do, but the * spec doesn't require us to catch every case where the calls * don't match (that would require caching the call as well as * the reply), so we don't bother. */ return true; } __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_sequence *seq = &u->sequence; struct nfsd4_compoundres *resp = rqstp->rq_resp; struct xdr_stream *xdr = resp->xdr; struct nfsd4_session *session; struct nfs4_client *clp; struct nfsd4_slot *slot; struct nfsd4_conn *conn; __be32 status; int buflen; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (resp->opcnt != 1) return nfserr_sequence_pos; /* * Will be either used or freed by nfsd4_sequence_check_conn * below. */ conn = alloc_conn(rqstp, NFS4_CDFC4_FORE); if (!conn) return nfserr_jukebox; spin_lock(&nn->client_lock); session = find_in_sessionid_hashtbl(&seq->sessionid, net, &status); if (!session) goto out_no_session; clp = session->se_client; status = nfserr_too_many_ops; if (nfsd4_session_too_many_ops(rqstp, session)) goto out_put_session; status = nfserr_req_too_big; if (nfsd4_request_too_big(rqstp, session)) goto out_put_session; status = nfserr_badslot; if (seq->slotid >= session->se_fchannel.maxreqs) goto out_put_session; slot = xa_load(&session->se_slots, seq->slotid); dprintk("%s: slotid %d\n", __func__, seq->slotid); trace_nfsd_slot_seqid_sequence(clp, seq, slot); status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_flags); if (status == nfserr_replay_cache) { status = nfserr_seq_misordered; if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED)) goto out_put_session; status = nfserr_seq_false_retry; if (!replay_matches_cache(rqstp, seq, slot)) goto out_put_session; cstate->slot = slot; cstate->session = session; cstate->clp = clp; /* Return the cached reply status and set cstate->status * for nfsd4_proc_compound processing */ status = nfsd4_replay_cache_entry(resp, seq); cstate->status = nfserr_replay_cache; goto out; } if (status) goto out_put_session; status = nfsd4_sequence_check_conn(conn, session); conn = NULL; if (status) goto out_put_session; if (session->se_target_maxslots < session->se_fchannel.maxreqs && slot->sl_generation == session->se_slot_gen && seq->maxslots <= session->se_target_maxslots) /* Client acknowledged our reduce maxreqs */ free_session_slots(session, session->se_target_maxslots); buflen = (seq->cachethis) ? session->se_fchannel.maxresp_cached : session->se_fchannel.maxresp_sz; status = (seq->cachethis) ? nfserr_rep_too_big_to_cache : nfserr_rep_too_big; if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack)) goto out_put_session; svc_reserve(rqstp, buflen); status = nfs_ok; /* Success! accept new slot seqid */ slot->sl_seqid = seq->seqid; slot->sl_flags &= ~NFSD4_SLOT_REUSED; slot->sl_flags |= NFSD4_SLOT_INUSE; slot->sl_generation = session->se_slot_gen; if (seq->cachethis) slot->sl_flags |= NFSD4_SLOT_CACHETHIS; else slot->sl_flags &= ~NFSD4_SLOT_CACHETHIS; cstate->slot = slot; cstate->session = session; cstate->clp = clp; /* * If the client ever uses the highest available slot, * gently try to allocate another 20%. This allows * fairly quick growth without grossly over-shooting what * the client might use. */ if (seq->slotid == session->se_fchannel.maxreqs - 1 && session->se_target_maxslots >= session->se_fchannel.maxreqs && session->se_fchannel.maxreqs < NFSD_MAX_SLOTS_PER_SESSION) { int s = session->se_fchannel.maxreqs; int cnt = DIV_ROUND_UP(s, 5); void *prev_slot; do { /* * GFP_NOWAIT both allows allocation under a * spinlock, and only succeeds if there is * plenty of memory. */ slot = kzalloc(slot_bytes(&session->se_fchannel), GFP_NOWAIT); prev_slot = xa_load(&session->se_slots, s); if (xa_is_value(prev_slot) && slot) { slot->sl_seqid = xa_to_value(prev_slot); slot->sl_flags |= NFSD4_SLOT_REUSED; } if (slot && !xa_is_err(xa_store(&session->se_slots, s, slot, GFP_NOWAIT))) { s += 1; session->se_fchannel.maxreqs = s; atomic_add(s - session->se_target_maxslots, &nfsd_total_target_slots); session->se_target_maxslots = s; } else { kfree(slot); slot = NULL; } } while (slot && --cnt > 0); } seq->maxslots = max(session->se_target_maxslots, seq->maxslots); seq->target_maxslots = session->se_target_maxslots; out: switch (clp->cl_cb_state) { case NFSD4_CB_DOWN: seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; break; case NFSD4_CB_FAULT: seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; break; default: seq->status_flags = 0; } if (!list_empty(&clp->cl_revoked)) seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; if (atomic_read(&clp->cl_admin_revoked)) seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED; trace_nfsd_seq4_status(rqstp, seq); out_no_session: if (conn) free_conn(conn); spin_unlock(&nn->client_lock); return status; out_put_session: nfsd4_put_session_locked(session); goto out_no_session; } void nfsd4_sequence_done(struct nfsd4_compoundres *resp) { struct nfsd4_compound_state *cs = &resp->cstate; if (nfsd4_has_session(cs)) { if (cs->status != nfserr_replay_cache) { nfsd4_store_cache_entry(resp); cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE; } /* Drop session reference that was taken in nfsd4_sequence() */ nfsd4_put_session(cs->session); } else if (cs->clp) put_client_renew(cs->clp); } __be32 nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_destroy_clientid *dc = &u->destroy_clientid; struct nfs4_client *conf, *unconf; struct nfs4_client *clp = NULL; __be32 status = 0; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); spin_lock(&nn->client_lock); unconf = find_unconfirmed_client(&dc->clientid, true, nn); conf = find_confirmed_client(&dc->clientid, true, nn); WARN_ON_ONCE(conf && unconf); if (conf) { if (client_has_state(conf)) { status = nfserr_clientid_busy; goto out; } status = mark_client_expired_locked(conf); if (status) goto out; clp = conf; } else if (unconf) clp = unconf; else { status = nfserr_stale_clientid; goto out; } if (!nfsd4_mach_creds_match(clp, rqstp)) { clp = NULL; status = nfserr_wrong_cred; goto out; } trace_nfsd_clid_destroyed(&clp->cl_clientid); unhash_client_locked(clp); out: spin_unlock(&nn->client_lock); if (clp) expire_client(clp); return status; } __be32 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_reclaim_complete *rc = &u->reclaim_complete; struct nfs4_client *clp = cstate->clp; __be32 status = 0; if (rc->rca_one_fs) { if (!cstate->current_fh.fh_dentry) return nfserr_nofilehandle; /* * We don't take advantage of the rca_one_fs case. * That's OK, it's optional, we can safely ignore it. */ return nfs_ok; } status = nfserr_complete_already; if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags)) goto out; status = nfserr_stale_clientid; if (is_client_expired(clp)) /* * The following error isn't really legal. * But we only get here if the client just explicitly * destroyed the client. Surely it no longer cares what * error it gets back on an operation for the dead * client. */ goto out; status = nfs_ok; trace_nfsd_clid_reclaim_complete(&clp->cl_clientid); nfsd4_client_record_create(clp); inc_reclaim_complete(clp); out: return status; } __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_setclientid *setclid = &u->setclientid; struct xdr_netobj clname = setclid->se_name; nfs4_verifier clverifier = setclid->se_verf; struct nfs4_client *conf, *new; struct nfs4_client *unconf = NULL; __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); new = create_client(clname, rqstp, &clverifier); if (new == NULL) return nfserr_jukebox; spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&clname, nn); if (conf && client_has_state(conf)) { status = nfserr_clid_inuse; if (clp_used_exchangeid(conf)) goto out; if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; } } unconf = find_unconfirmed_client_by_name(&clname, nn); if (unconf) unhash_client_locked(unconf); if (conf) { if (same_verf(&conf->cl_verifier, &clverifier)) { copy_clid(new, conf); gen_confirm(new, nn); } else trace_nfsd_clid_verf_mismatch(conf, rqstp, &clverifier); } else trace_nfsd_clid_fresh(new); new->cl_minorversion = 0; gen_callback(new, setclid, rqstp); add_to_unconfirmed(new); setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; setclid->se_clientid.cl_id = new->cl_clientid.cl_id; memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); new = NULL; status = nfs_ok; out: spin_unlock(&nn->client_lock); if (new) free_client(new); if (unconf) { trace_nfsd_clid_expire_unconf(&unconf->cl_clientid); expire_client(unconf); } return status; } __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_setclientid_confirm *setclientid_confirm = &u->setclientid_confirm; struct nfs4_client *conf, *unconf; struct nfs4_client *old = NULL; nfs4_verifier confirm = setclientid_confirm->sc_confirm; clientid_t * clid = &setclientid_confirm->sc_clientid; __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (STALE_CLIENTID(clid, nn)) return nfserr_stale_clientid; spin_lock(&nn->client_lock); conf = find_confirmed_client(clid, false, nn); unconf = find_unconfirmed_client(clid, false, nn); /* * We try hard to give out unique clientid's, so if we get an * attempt to confirm the same clientid with a different cred, * the client may be buggy; this should never happen. * * Nevertheless, RFC 7530 recommends INUSE for this case: */ status = nfserr_clid_inuse; if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { trace_nfsd_clid_cred_mismatch(unconf, rqstp); goto out; } if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) { trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; } if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) { if (conf && same_verf(&confirm, &conf->cl_confirm)) { status = nfs_ok; } else status = nfserr_stale_clientid; goto out; } status = nfs_ok; if (conf) { old = unconf; unhash_client_locked(old); nfsd4_change_callback(conf, &unconf->cl_cb_conn); } else { old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = nfserr_clid_inuse; if (client_has_state(old) && !same_creds(&unconf->cl_cred, &old->cl_cred)) { old = NULL; goto out; } status = mark_client_expired_locked(old); if (status) { old = NULL; goto out; } trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); conf = unconf; } get_client_locked(conf); spin_unlock(&nn->client_lock); if (conf == unconf) fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY); nfsd4_probe_callback(conf); spin_lock(&nn->client_lock); put_client_renew_locked(conf); out: spin_unlock(&nn->client_lock); if (old) expire_client(old); return status; } static struct nfs4_file *nfsd4_alloc_file(void) { return kmem_cache_alloc(file_slab, GFP_KERNEL); } /* OPEN Share state helper functions */ static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp) { refcount_set(&fp->fi_ref, 1); spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); INIT_LIST_HEAD(&fp->fi_clnt_odstate); fh_copy_shallow(&fp->fi_fhandle, &fh->fh_handle); fp->fi_deleg_file = NULL; fp->fi_had_conflict = false; fp->fi_share_deny = 0; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); fp->fi_aliased = false; fp->fi_inode = d_inode(fh->fh_dentry); #ifdef CONFIG_NFSD_PNFS INIT_LIST_HEAD(&fp->fi_lo_states); atomic_set(&fp->fi_lo_recalls, 0); #endif } void nfsd4_free_slabs(void) { kmem_cache_destroy(client_slab); kmem_cache_destroy(openowner_slab); kmem_cache_destroy(lockowner_slab); kmem_cache_destroy(file_slab); kmem_cache_destroy(stateid_slab); kmem_cache_destroy(deleg_slab); kmem_cache_destroy(odstate_slab); } int nfsd4_init_slabs(void) { client_slab = KMEM_CACHE(nfs4_client, 0); if (client_slab == NULL) goto out; openowner_slab = KMEM_CACHE(nfs4_openowner, 0); if (openowner_slab == NULL) goto out_free_client_slab; lockowner_slab = KMEM_CACHE(nfs4_lockowner, 0); if (lockowner_slab == NULL) goto out_free_openowner_slab; file_slab = KMEM_CACHE(nfs4_file, 0); if (file_slab == NULL) goto out_free_lockowner_slab; stateid_slab = KMEM_CACHE(nfs4_ol_stateid, 0); if (stateid_slab == NULL) goto out_free_file_slab; deleg_slab = KMEM_CACHE(nfs4_delegation, 0); if (deleg_slab == NULL) goto out_free_stateid_slab; odstate_slab = KMEM_CACHE(nfs4_clnt_odstate, 0); if (odstate_slab == NULL) goto out_free_deleg_slab; return 0; out_free_deleg_slab: kmem_cache_destroy(deleg_slab); out_free_stateid_slab: kmem_cache_destroy(stateid_slab); out_free_file_slab: kmem_cache_destroy(file_slab); out_free_lockowner_slab: kmem_cache_destroy(lockowner_slab); out_free_openowner_slab: kmem_cache_destroy(openowner_slab); out_free_client_slab: kmem_cache_destroy(client_slab); out: return -ENOMEM; } static unsigned long nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { int count; struct nfsd_net *nn = shrink->private_data; count = atomic_read(&nn->nfsd_courtesy_clients); if (!count) count = atomic_long_read(&num_delegations); if (count) queue_work(laundry_wq, &nn->nfsd_shrinker_work); return (unsigned long)count; } static unsigned long nfsd4_state_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { return SHRINK_STOP; } void nfsd4_init_leases_net(struct nfsd_net *nn) { struct sysinfo si; u64 max_clients; nn->nfsd4_lease = 90; /* default lease time */ nn->nfsd4_grace = 90; nn->somebody_reclaimed = false; nn->track_reclaim_completes = false; nn->clverifier_counter = get_random_u32(); nn->clientid_base = get_random_u32(); nn->clientid_counter = nn->clientid_base + 1; nn->s2s_cp_cl_id = nn->clientid_counter++; atomic_set(&nn->nfs4_client_count, 0); si_meminfo(&si); max_clients = (u64)si.totalram * si.mem_unit / (1024 * 1024 * 1024); max_clients *= NFS4_CLIENTS_PER_GB; nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB); atomic_set(&nn->nfsd_courtesy_clients, 0); } enum rp_lock { RP_UNLOCKED, RP_LOCKED, RP_UNHASHED, }; static void init_nfs4_replay(struct nfs4_replay *rp) { rp->rp_status = nfserr_serverfault; rp->rp_buflen = 0; rp->rp_buf = rp->rp_ibuf; rp->rp_locked = RP_UNLOCKED; } static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so) { if (!nfsd4_has_session(cstate)) { wait_var_event(&so->so_replay.rp_locked, cmpxchg(&so->so_replay.rp_locked, RP_UNLOCKED, RP_LOCKED) != RP_LOCKED); if (so->so_replay.rp_locked == RP_UNHASHED) return -EAGAIN; cstate->replay_owner = nfs4_get_stateowner(so); } return 0; } void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate) { struct nfs4_stateowner *so = cstate->replay_owner; if (so != NULL) { cstate->replay_owner = NULL; store_release_wake_up(&so->so_replay.rp_locked, RP_UNLOCKED); nfs4_put_stateowner(so); } } static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp) { struct nfs4_stateowner *sop; sop = kmem_cache_alloc(slab, GFP_KERNEL); if (!sop) return NULL; xdr_netobj_dup(&sop->so_owner, owner, GFP_KERNEL); if (!sop->so_owner.data) { kmem_cache_free(slab, sop); return NULL; } INIT_LIST_HEAD(&sop->so_stateids); sop->so_client = clp; init_nfs4_replay(&sop->so_replay); atomic_set(&sop->so_count, 1); return sop; } static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) { lockdep_assert_held(&clp->cl_lock); list_add(&oo->oo_owner.so_strhash, &clp->cl_ownerstr_hashtbl[strhashval]); list_add(&oo->oo_perclient, &clp->cl_openowners); } static void nfs4_unhash_openowner(struct nfs4_stateowner *so) { unhash_openowner_locked(openowner(so)); } static void nfs4_free_openowner(struct nfs4_stateowner *so) { struct nfs4_openowner *oo = openowner(so); kmem_cache_free(openowner_slab, oo); } static const struct nfs4_stateowner_operations openowner_ops = { .so_unhash = nfs4_unhash_openowner, .so_free = nfs4_free_openowner, }; static struct nfs4_ol_stateid * nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_ol_stateid *local, *ret = NULL; struct nfs4_openowner *oo = open->op_openowner; lockdep_assert_held(&fp->fi_lock); list_for_each_entry(local, &fp->fi_stateids, st_perfile) { /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; if (local->st_stateowner != &oo->oo_owner) continue; if (local->st_stid.sc_type == SC_TYPE_OPEN && !local->st_stid.sc_status) { ret = local; refcount_inc(&ret->st_stid.sc_count); break; } } return ret; } static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) __releases(&s->sc_client->cl_lock) { struct nfs4_client *cl = s->sc_client; LIST_HEAD(reaplist); struct nfs4_ol_stateid *stp; struct nfs4_delegation *dp; bool unhashed; switch (s->sc_type) { case SC_TYPE_OPEN: stp = openlockstateid(s); if (unhash_open_stateid(stp, &reaplist)) put_ol_stateid_locked(stp, &reaplist); spin_unlock(&cl->cl_lock); free_ol_stateid_reaplist(&reaplist); break; case SC_TYPE_LOCK: stp = openlockstateid(s); unhashed = unhash_lock_stateid(stp); spin_unlock(&cl->cl_lock); if (unhashed) nfs4_put_stid(s); break; case SC_TYPE_DELEG: dp = delegstateid(s); list_del_init(&dp->dl_recall_lru); spin_unlock(&cl->cl_lock); nfs4_put_stid(s); break; default: spin_unlock(&cl->cl_lock); } } static void nfsd40_drop_revoked_stid(struct nfs4_client *cl, stateid_t *stid) { /* NFSv4.0 has no way for the client to tell the server * that it can forget an admin-revoked stateid. * So we keep it around until the first time that the * client uses it, and drop it the first time * nfserr_admin_revoked is returned. * For v4.1 and later we wait until explicitly told * to free the stateid. */ if (cl->cl_minorversion == 0) { struct nfs4_stid *st; spin_lock(&cl->cl_lock); st = find_stateid_locked(cl, stid); if (st) nfsd4_drop_revoked_stid(st); else spin_unlock(&cl->cl_lock); } } static __be32 nfsd4_verify_open_stid(struct nfs4_stid *s) { __be32 ret = nfs_ok; if (s->sc_status & SC_STATUS_ADMIN_REVOKED) ret = nfserr_admin_revoked; else if (s->sc_status & SC_STATUS_REVOKED) ret = nfserr_deleg_revoked; else if (s->sc_status & SC_STATUS_CLOSED) ret = nfserr_bad_stateid; return ret; } /* Lock the stateid st_mutex, and deal with races with CLOSE */ static __be32 nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp) { __be32 ret; mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); ret = nfsd4_verify_open_stid(&stp->st_stid); if (ret == nfserr_admin_revoked) nfsd40_drop_revoked_stid(stp->st_stid.sc_client, &stp->st_stid.sc_stateid); if (ret != nfs_ok) mutex_unlock(&stp->st_mutex); return ret; } static struct nfs4_ol_stateid * nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_ol_stateid *stp; for (;;) { spin_lock(&fp->fi_lock); stp = nfsd4_find_existing_open(fp, open); spin_unlock(&fp->fi_lock); if (!stp || nfsd4_lock_ol_stateid(stp) == nfs_ok) break; nfs4_put_stid(&stp->st_stid); } return stp; } static struct nfs4_openowner * find_or_alloc_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, struct nfsd4_compound_state *cstate) { struct nfs4_client *clp = cstate->clp; struct nfs4_openowner *oo, *new = NULL; retry: spin_lock(&clp->cl_lock); oo = find_openstateowner_str(strhashval, open, clp); if (!oo && new) { hash_openowner(new, clp, strhashval); spin_unlock(&clp->cl_lock); return new; } spin_unlock(&clp->cl_lock); if (oo && !(oo->oo_flags & NFS4_OO_CONFIRMED)) { /* Replace unconfirmed owners without checking for replay. */ release_openowner(oo); oo = NULL; } if (oo) { if (new) nfs4_free_stateowner(&new->oo_owner); return oo; } new = alloc_stateowner(openowner_slab, &open->op_owner, clp); if (!new) return NULL; new->oo_owner.so_ops = &openowner_ops; new->oo_owner.so_is_open_owner = 1; new->oo_owner.so_seqid = open->op_seqid; new->oo_flags = 0; if (nfsd4_has_session(cstate)) new->oo_flags |= NFS4_OO_CONFIRMED; new->oo_time = 0; new->oo_last_closed_stid = NULL; INIT_LIST_HEAD(&new->oo_close_lru); goto retry; } static struct nfs4_ol_stateid * init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; struct nfs4_ol_stateid *retstp = NULL; struct nfs4_ol_stateid *stp; stp = open->op_stp; /* We are moving these outside of the spinlocks to avoid the warnings */ mutex_init(&stp->st_mutex); mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); retry: spin_lock(&oo->oo_owner.so_client->cl_lock); spin_lock(&fp->fi_lock); if (nfs4_openowner_unhashed(oo)) { mutex_unlock(&stp->st_mutex); stp = NULL; goto out_unlock; } retstp = nfsd4_find_existing_open(fp, open); if (retstp) goto out_unlock; open->op_stp = NULL; refcount_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = SC_TYPE_OPEN; INIT_LIST_HEAD(&stp->st_locks); stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; stp->st_openstp = NULL; list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&oo->oo_owner.so_client->cl_lock); if (retstp) { /* Handle races with CLOSE */ if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { nfs4_put_stid(&retstp->st_stid); goto retry; } /* To keep mutex tracking happy */ mutex_unlock(&stp->st_mutex); stp = retstp; } return stp; } /* * In the 4.0 case we need to keep the owners around a little while to handle * CLOSE replay. We still do need to release any file access that is held by * them before returning however. */ static void move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) { struct nfs4_ol_stateid *last; struct nfs4_openowner *oo = openowner(s->st_stateowner); struct nfsd_net *nn = net_generic(s->st_stid.sc_client->net, nfsd_net_id); dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); /* * We know that we hold one reference via nfsd4_close, and another * "persistent" reference for the client. If the refcount is higher * than 2, then there are still calls in progress that are using this * stateid. We can't put the sc_file reference until they are finished. * Wait for the refcount to drop to 2. Since it has been unhashed, * there should be no danger of the refcount going back up again at * this point. * Some threads with a reference might be waiting for rp_locked, * so tell them to stop waiting. */ store_release_wake_up(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED); wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2); release_all_access(s); if (s->st_stid.sc_file) { put_nfs4_file(s->st_stid.sc_file); s->st_stid.sc_file = NULL; } spin_lock(&nn->client_lock); last = oo->oo_last_closed_stid; oo->oo_last_closed_stid = s; list_move_tail(&oo->oo_close_lru, &nn->close_lru); oo->oo_time = ktime_get_boottime_seconds(); spin_unlock(&nn->client_lock); if (last) nfs4_put_stid(&last->st_stid); } static noinline_for_stack struct nfs4_file * nfsd4_file_hash_lookup(const struct svc_fh *fhp) { struct inode *inode = d_inode(fhp->fh_dentry); struct rhlist_head *tmp, *list; struct nfs4_file *fi; rcu_read_lock(); list = rhltable_lookup(&nfs4_file_rhltable, &inode, nfs4_file_rhash_params); rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { if (refcount_inc_not_zero(&fi->fi_ref)) { rcu_read_unlock(); return fi; } } } rcu_read_unlock(); return NULL; } /* * On hash insertion, identify entries with the same inode but * distinct filehandles. They will all be on the list returned * by rhltable_lookup(). * * inode->i_lock prevents racing insertions from adding an entry * for the same inode/fhp pair twice. */ static noinline_for_stack struct nfs4_file * nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) { struct inode *inode = d_inode(fhp->fh_dentry); struct rhlist_head *tmp, *list; struct nfs4_file *ret = NULL; bool alias_found = false; struct nfs4_file *fi; int err; rcu_read_lock(); spin_lock(&inode->i_lock); list = rhltable_lookup(&nfs4_file_rhltable, &inode, nfs4_file_rhash_params); rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { if (refcount_inc_not_zero(&fi->fi_ref)) ret = fi; } else fi->fi_aliased = alias_found = true; } if (ret) goto out_unlock; nfsd4_file_init(fhp, new); err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist, nfs4_file_rhash_params); if (err) goto out_unlock; new->fi_aliased = alias_found; ret = new; out_unlock: spin_unlock(&inode->i_lock); rcu_read_unlock(); return ret; } static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) { rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist, nfs4_file_rhash_params); } /* * Called to check deny when READ with all zero stateid or * WRITE with all zero or all one stateid */ static __be32 nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) { struct nfs4_file *fp; __be32 ret = nfs_ok; fp = nfsd4_file_hash_lookup(current_fh); if (!fp) return ret; /* Check for conflicting share reservations */ spin_lock(&fp->fi_lock); if (fp->fi_share_deny & deny_type) ret = nfserr_locked; spin_unlock(&fp->fi_lock); put_nfs4_file(fp); return ret; } static bool nfsd4_deleg_present(const struct inode *inode) { struct file_lock_context *ctx = locks_inode_context(inode); return ctx && !list_empty_careful(&ctx->flc_lease); } /** * nfsd_wait_for_delegreturn - wait for delegations to be returned * @rqstp: the RPC transaction being executed * @inode: in-core inode of the file being waited for * * The timeout prevents deadlock if all nfsd threads happen to be * tied up waiting for returning delegations. * * Return values: * %true: delegation was returned * %false: timed out waiting for delegreturn */ bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode) { long __maybe_unused timeo; timeo = wait_var_event_timeout(inode, !nfsd4_deleg_present(inode), NFSD_DELEGRETURN_TIMEOUT); trace_nfsd_delegret_wakeup(rqstp, inode, timeo); return timeo > 0; } static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) { struct nfs4_delegation *dp = cb_to_delegation(cb); struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net, nfsd_net_id); block_delegations(&dp->dl_stid.sc_file->fi_fhandle); /* * We can't do this in nfsd_break_deleg_cb because it is * already holding inode->i_lock. * * If the dl_time != 0, then we know that it has already been * queued for a lease break. Don't queue it again. */ spin_lock(&state_lock); if (delegation_hashed(dp) && dp->dl_time == 0) { dp->dl_time = ktime_get_boottime_seconds(); list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); } spin_unlock(&state_lock); } static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, struct rpc_task *task) { struct nfs4_delegation *dp = cb_to_delegation(cb); trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task); if (dp->dl_stid.sc_status) /* CLOSED or REVOKED */ return 1; switch (task->tk_status) { case 0: return 1; case -NFS4ERR_DELAY: rpc_delay(task, 2 * HZ); return 0; case -EBADHANDLE: case -NFS4ERR_BAD_STATEID: /* * Race: client probably got cb_recall before open reply * granting delegation. */ if (dp->dl_retries--) { rpc_delay(task, 2 * HZ); return 0; } fallthrough; default: return 1; } } static void nfsd4_cb_recall_release(struct nfsd4_callback *cb) { struct nfs4_delegation *dp = cb_to_delegation(cb); nfs4_put_stid(&dp->dl_stid); } static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = { .prepare = nfsd4_cb_recall_prepare, .done = nfsd4_cb_recall_done, .release = nfsd4_cb_recall_release, .opcode = OP_CB_RECALL, }; static void nfsd_break_one_deleg(struct nfs4_delegation *dp) { /* * We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease * callback (and since the lease code is serialized by the * flc_lock) we know the server hasn't removed the lease yet, and * we know it's safe to take a reference. */ refcount_inc(&dp->dl_stid.sc_count); WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall)); } /* Called from break_lease() with flc_lock held. */ static bool nfsd_break_deleg_cb(struct file_lease *fl) { struct nfs4_delegation *dp = (struct nfs4_delegation *) fl->c.flc_owner; struct nfs4_file *fp = dp->dl_stid.sc_file; struct nfs4_client *clp = dp->dl_stid.sc_client; struct nfsd_net *nn; trace_nfsd_cb_recall(&dp->dl_stid); dp->dl_recalled = true; atomic_inc(&clp->cl_delegs_in_recall); if (try_to_expire_client(clp)) { nn = net_generic(clp->net, nfsd_net_id); mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); } /* * We don't want the locks code to timeout the lease for us; * we'll remove it ourself if a delegation isn't returned * in time: */ fl->fl_break_time = 0; fp->fi_had_conflict = true; nfsd_break_one_deleg(dp); return false; } /** * nfsd_breaker_owns_lease - Check if lease conflict was resolved * @fl: Lock state to check * * Return values: * %true: Lease conflict was resolved * %false: Lease conflict was not resolved. */ static bool nfsd_breaker_owns_lease(struct file_lease *fl) { struct nfs4_delegation *dl = fl->c.flc_owner; struct svc_rqst *rqst; struct nfs4_client *clp; rqst = nfsd_current_rqst(); if (!nfsd_v4client(rqst)) return false; clp = *(rqst->rq_lease_breaker); return dl->dl_stid.sc_client == clp; } static int nfsd_change_deleg_cb(struct file_lease *onlist, int arg, struct list_head *dispose) { struct nfs4_delegation *dp = (struct nfs4_delegation *) onlist->c.flc_owner; struct nfs4_client *clp = dp->dl_stid.sc_client; if (arg & F_UNLCK) { if (dp->dl_recalled) atomic_dec(&clp->cl_delegs_in_recall); return lease_modify(onlist, arg, dispose); } else return -EAGAIN; } static const struct lease_manager_operations nfsd_lease_mng_ops = { .lm_breaker_owns_lease = nfsd_breaker_owns_lease, .lm_break = nfsd_break_deleg_cb, .lm_change = nfsd_change_deleg_cb, }; static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid) { if (nfsd4_has_session(cstate)) return nfs_ok; if (seqid == so->so_seqid - 1) return nfserr_replay_me; if (seqid == so->so_seqid) return nfs_ok; return nfserr_bad_seqid; } static struct nfs4_client *lookup_clientid(clientid_t *clid, bool sessions, struct nfsd_net *nn) { struct nfs4_client *found; spin_lock(&nn->client_lock); found = find_confirmed_client(clid, sessions, nn); if (found) atomic_inc(&found->cl_rpc_users); spin_unlock(&nn->client_lock); return found; } static __be32 set_client(clientid_t *clid, struct nfsd4_compound_state *cstate, struct nfsd_net *nn) { if (cstate->clp) { if (!same_clid(&cstate->clp->cl_clientid, clid)) return nfserr_stale_clientid; return nfs_ok; } if (STALE_CLIENTID(clid, nn)) return nfserr_stale_clientid; /* * We're in the 4.0 case (otherwise the SEQUENCE op would have * set cstate->clp), so session = false: */ cstate->clp = lookup_clientid(clid, false, nn); if (!cstate->clp) return nfserr_expired; return nfs_ok; } __be32 nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct nfsd_net *nn) { clientid_t *clientid = &open->op_clientid; struct nfs4_client *clp = NULL; unsigned int strhashval; struct nfs4_openowner *oo = NULL; __be32 status; /* * In case we need it later, after we've already created the * file and don't want to risk a further failure: */ open->op_file = nfsd4_alloc_file(); if (open->op_file == NULL) return nfserr_jukebox; status = set_client(clientid, cstate, nn); if (status) return status; clp = cstate->clp; strhashval = ownerstr_hashval(&open->op_owner); retry: oo = find_or_alloc_open_stateowner(strhashval, open, cstate); open->op_openowner = oo; if (!oo) return nfserr_jukebox; if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) { nfs4_put_stateowner(&oo->oo_owner); goto retry; } status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); if (status) return status; open->op_stp = nfs4_alloc_open_stateid(clp); if (!open->op_stp) return nfserr_jukebox; if (nfsd4_has_session(cstate) && (cstate->current_fh.fh_export->ex_flags & NFSEXP_PNFS)) { open->op_odstate = alloc_clnt_odstate(clp); if (!open->op_odstate) return nfserr_jukebox; } return nfs_ok; } static inline __be32 nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) { if (!(flags & RD_STATE) && deleg_is_read(dp->dl_type)) return nfserr_openmode; else return nfs_ok; } static int share_access_to_flags(u32 share_access) { return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) { struct nfs4_stid *ret; ret = find_stateid_by_type(cl, s, SC_TYPE_DELEG, SC_STATUS_REVOKED); if (!ret) return NULL; return delegstateid(ret); } static bool nfsd4_is_deleg_cur(struct nfsd4_open *open) { return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR || open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH; } static __be32 nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, struct nfs4_delegation **dp) { int flags; __be32 status = nfserr_bad_stateid; struct nfs4_delegation *deleg; deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; if (deleg->dl_stid.sc_status & SC_STATUS_ADMIN_REVOKED) { nfs4_put_stid(&deleg->dl_stid); status = nfserr_admin_revoked; goto out; } if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) { nfs4_put_stid(&deleg->dl_stid); nfsd40_drop_revoked_stid(cl, &open->op_delegate_stateid); status = nfserr_deleg_revoked; goto out; } flags = share_access_to_flags(open->op_share_access); status = nfs4_check_delegmode(deleg, flags); if (status) { nfs4_put_stid(&deleg->dl_stid); goto out; } *dp = deleg; out: if (!nfsd4_is_deleg_cur(open)) return nfs_ok; if (status) return status; open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; return nfs_ok; } static inline int nfs4_access_to_access(u32 nfs4_access) { int flags = 0; if (nfs4_access & NFS4_SHARE_ACCESS_READ) flags |= NFSD_MAY_READ; if (nfs4_access & NFS4_SHARE_ACCESS_WRITE) flags |= NFSD_MAY_WRITE; return flags; } static inline __be32 nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, struct nfsd4_open *open) { struct iattr iattr = { .ia_valid = ATTR_SIZE, .ia_size = 0, }; struct nfsd_attrs attrs = { .na_iattr = &iattr, }; if (!open->op_truncate) return 0; if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) return nfserr_inval; return nfsd_setattr(rqstp, fh, &attrs, NULL); } static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open, bool new_stp) { struct nfsd_file *nf = NULL; __be32 status; int oflag = nfs4_access_to_omode(open->op_share_access); int access = nfs4_access_to_access(open->op_share_access); unsigned char old_access_bmap, old_deny_bmap; spin_lock(&fp->fi_lock); /* * Are we trying to set a deny mode that would conflict with * current access? */ status = nfs4_file_check_deny(fp, open->op_share_deny); if (status != nfs_ok) { if (status != nfserr_share_denied) { spin_unlock(&fp->fi_lock); goto out; } if (nfs4_resolve_deny_conflicts_locked(fp, new_stp, stp, open->op_share_deny, false)) status = nfserr_jukebox; spin_unlock(&fp->fi_lock); goto out; } /* set access to the file */ status = nfs4_file_get_access(fp, open->op_share_access); if (status != nfs_ok) { if (status != nfserr_share_denied) { spin_unlock(&fp->fi_lock); goto out; } if (nfs4_resolve_deny_conflicts_locked(fp, new_stp, stp, open->op_share_access, true)) status = nfserr_jukebox; spin_unlock(&fp->fi_lock); goto out; } /* Set access bits in stateid */ old_access_bmap = stp->st_access_bmap; set_access(open->op_share_access, stp); /* Set new deny mask */ old_deny_bmap = stp->st_deny_bmap; set_deny(open->op_share_deny, stp); fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH); if (!fp->fi_fds[oflag]) { spin_unlock(&fp->fi_lock); status = nfsd_file_acquire_opened(rqstp, cur_fh, access, open->op_filp, &nf); if (status != nfs_ok) goto out_put_access; spin_lock(&fp->fi_lock); if (!fp->fi_fds[oflag]) { fp->fi_fds[oflag] = nf; nf = NULL; } } spin_unlock(&fp->fi_lock); if (nf) nfsd_file_put(nf); status = nfserrno(nfsd_open_break_lease(cur_fh->fh_dentry->d_inode, access)); if (status) goto out_put_access; status = nfsd4_truncate(rqstp, cur_fh, open); if (status) goto out_put_access; out: return status; out_put_access: stp->st_access_bmap = old_access_bmap; nfs4_file_put_access(fp, open->op_share_access); reset_union_bmap_deny(bmap_to_share_mode(old_deny_bmap), stp); goto out; } static __be32 nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { __be32 status; unsigned char old_deny_bmap = stp->st_deny_bmap; if (!test_access(open->op_share_access, stp)) return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open, false); /* test and set deny mode */ spin_lock(&fp->fi_lock); status = nfs4_file_check_deny(fp, open->op_share_deny); switch (status) { case nfs_ok: set_deny(open->op_share_deny, stp); fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH); break; case nfserr_share_denied: if (nfs4_resolve_deny_conflicts_locked(fp, false, stp, open->op_share_deny, false)) status = nfserr_jukebox; break; } spin_unlock(&fp->fi_lock); if (status != nfs_ok) return status; status = nfsd4_truncate(rqstp, cur_fh, open); if (status != nfs_ok) reset_union_bmap_deny(old_deny_bmap, stp); return status; } /* Should we give out recallable state?: */ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) { if (clp->cl_cb_state == NFSD4_CB_UP) return true; /* * In the sessions case, since we don't have to establish a * separate connection for callbacks, we assume it's OK * until we hear otherwise: */ return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; } static struct file_lease *nfs4_alloc_init_lease(struct nfs4_delegation *dp) { struct file_lease *fl; fl = locks_alloc_lease(); if (!fl) return NULL; fl->fl_lmops = &nfsd_lease_mng_ops; fl->c.flc_flags = FL_DELEG; fl->c.flc_type = deleg_is_read(dp->dl_type) ? F_RDLCK : F_WRLCK; fl->c.flc_owner = (fl_owner_t)dp; fl->c.flc_pid = current->tgid; fl->c.flc_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; return fl; } static int nfsd4_check_conflicting_opens(struct nfs4_client *clp, struct nfs4_file *fp) { struct nfs4_ol_stateid *st; struct file *f = fp->fi_deleg_file->nf_file; struct inode *ino = file_inode(f); int writes; writes = atomic_read(&ino->i_writecount); if (!writes) return 0; /* * There could be multiple filehandles (hence multiple * nfs4_files) referencing this file, but that's not too * common; let's just give up in that case rather than * trying to go look up all the clients using that other * nfs4_file as well: */ if (fp->fi_aliased) return -EAGAIN; /* * If there's a close in progress, make sure that we see it * clear any fi_fds[] entries before we see it decrement * i_writecount: */ smp_mb__after_atomic(); if (fp->fi_fds[O_WRONLY]) writes--; if (fp->fi_fds[O_RDWR]) writes--; if (writes > 0) return -EAGAIN; /* There may be non-NFSv4 writers */ /* * It's possible there are non-NFSv4 write opens in progress, * but if they haven't incremented i_writecount yet then they * also haven't called break lease yet; so, they'll break this * lease soon enough. So, all that's left to check for is NFSv4 * opens: */ spin_lock(&fp->fi_lock); list_for_each_entry(st, &fp->fi_stateids, st_perfile) { if (st->st_openstp == NULL /* it's an open */ && access_permit_write(st) && st->st_stid.sc_client != clp) { spin_unlock(&fp->fi_lock); return -EAGAIN; } } spin_unlock(&fp->fi_lock); /* * There's a small chance that we could be racing with another * NFSv4 open. However, any open that hasn't added itself to * the fi_stateids list also hasn't called break_lease yet; so, * they'll break this lease soon enough. */ return 0; } /* * It's possible that between opening the dentry and setting the delegation, * that it has been renamed or unlinked. Redo the lookup to verify that this * hasn't happened. */ static int nfsd4_verify_deleg_dentry(struct nfsd4_open *open, struct nfs4_file *fp, struct svc_fh *parent) { struct svc_export *exp; struct dentry *child; __be32 err; err = nfsd_lookup_dentry(open->op_rqstp, parent, open->op_fname, open->op_fnamelen, &exp, &child); if (err) return -EAGAIN; exp_put(exp); dput(child); if (child != file_dentry(fp->fi_deleg_file->nf_file)) return -EAGAIN; return 0; } /* * We avoid breaking delegations held by a client due to its own activity, but * clearing setuid/setgid bits on a write is an implicit activity and the client * may not notice and continue using the old mode. Avoid giving out a delegation * on setuid/setgid files when the client is requesting an open for write. */ static int nfsd4_verify_setuid_write(struct nfsd4_open *open, struct nfsd_file *nf) { struct inode *inode = file_inode(nf->nf_file); if ((open->op_share_access & NFS4_SHARE_ACCESS_WRITE) && (inode->i_mode & (S_ISUID|S_ISGID))) return -EAGAIN; return 0; } static struct nfs4_delegation * nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *parent) { bool deleg_ts = open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; struct nfs4_client *clp = stp->st_stid.sc_client; struct nfs4_file *fp = stp->st_stid.sc_file; struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; struct nfs4_delegation *dp; struct nfsd_file *nf = NULL; struct file_lease *fl; int status = 0; u32 dl_type; /* * The fi_had_conflict and nfs_get_existing_delegation checks * here are just optimizations; we'll need to recheck them at * the end: */ if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); /* * Try for a write delegation first. RFC8881 section 10.4 says: * * "An OPEN_DELEGATE_WRITE delegation allows the client to handle, * on its own, all opens." * * Furthermore the client can use a write delegation for most READ * operations as well, so we require a O_RDWR file here. * * Offer a write delegation in the case of a BOTH open, and ensure * we get the O_RDWR descriptor. */ if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == NFS4_SHARE_ACCESS_BOTH) { nf = find_rw_file(fp); dl_type = deleg_ts ? OPEN_DELEGATE_WRITE_ATTRS_DELEG : OPEN_DELEGATE_WRITE; } /* * If the file is being opened O_RDONLY or we couldn't get a O_RDWR * file for some reason, then try for a read delegation instead. */ if (!nf && (open->op_share_access & NFS4_SHARE_ACCESS_READ)) { nf = find_readable_file(fp); dl_type = deleg_ts ? OPEN_DELEGATE_READ_ATTRS_DELEG : OPEN_DELEGATE_READ; } if (!nf) return ERR_PTR(-EAGAIN); spin_lock(&state_lock); spin_lock(&fp->fi_lock); if (nfs4_delegation_exists(clp, fp)) status = -EAGAIN; else if (nfsd4_verify_setuid_write(open, nf)) status = -EAGAIN; else if (!fp->fi_deleg_file) { fp->fi_deleg_file = nf; /* increment early to prevent fi_deleg_file from being * cleared */ fp->fi_delegees = 1; nf = NULL; } else fp->fi_delegees++; spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); if (nf) nfsd_file_put(nf); if (status) return ERR_PTR(status); status = -ENOMEM; dp = alloc_init_deleg(clp, fp, odstate, dl_type); if (!dp) goto out_delegees; fl = nfs4_alloc_init_lease(dp); if (!fl) goto out_clnt_odstate; status = kernel_setlease(fp->fi_deleg_file->nf_file, fl->c.flc_type, &fl, NULL); if (fl) locks_free_lease(fl); if (status) goto out_clnt_odstate; if (parent) { status = nfsd4_verify_deleg_dentry(open, fp, parent); if (status) goto out_unlock; } status = nfsd4_check_conflicting_opens(clp, fp); if (status) goto out_unlock; /* * Now that the deleg is set, check again to ensure that nothing * raced in and changed the mode while we weren't looking. */ status = nfsd4_verify_setuid_write(open, fp->fi_deleg_file); if (status) goto out_unlock; status = -EAGAIN; if (fp->fi_had_conflict) goto out_unlock; spin_lock(&state_lock); spin_lock(&clp->cl_lock); spin_lock(&fp->fi_lock); status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&clp->cl_lock); spin_unlock(&state_lock); if (status) goto out_unlock; return dp; out_unlock: kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); out_clnt_odstate: put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_stid(&dp->dl_stid); out_delegees: put_deleg_file(fp); return ERR_PTR(status); } static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) { open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; if (status == -EAGAIN) open->op_why_no_deleg = WND4_CONTENTION; else { open->op_why_no_deleg = WND4_RESOURCE; switch (open->op_deleg_want) { case OPEN4_SHARE_ACCESS_WANT_READ_DELEG: case OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG: case OPEN4_SHARE_ACCESS_WANT_ANY_DELEG: break; case OPEN4_SHARE_ACCESS_WANT_CANCEL: open->op_why_no_deleg = WND4_CANCELLED; break; case OPEN4_SHARE_ACCESS_WANT_NO_DELEG: WARN_ON_ONCE(1); } } } static bool nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh, struct kstat *stat) { struct nfsd_file *nf = find_rw_file(dp->dl_stid.sc_file); struct path path; int rc; if (!nf) return false; path.mnt = currentfh->fh_export->ex_path.mnt; path.dentry = file_dentry(nf->nf_file); rc = vfs_getattr(&path, stat, (STATX_MODE | STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE), AT_STATX_SYNC_AS_STAT); nfsd_file_put(nf); return rc == 0; } /* * The Linux NFS server does not offer write delegations to NFSv4.0 * clients in order to avoid conflicts between write delegations and * GETATTRs requesting CHANGE or SIZE attributes. * * With NFSv4.1 and later minorversions, the SEQUENCE operation that * begins each COMPOUND contains a client ID. Delegation recall can * be avoided when the server recognizes the client sending a * GETATTR also holds write delegation it conflicts with. * * However, the NFSv4.0 protocol does not enable a server to * determine that a GETATTR originated from the client holding the * conflicting delegation versus coming from some other client. Per * RFC 7530 Section 16.7.5, the server must recall or send a * CB_GETATTR even when the GETATTR originates from the client that * holds the conflicting delegation. * * An NFSv4.0 client can trigger a pathological situation if it * always sends a DELEGRETURN preceded by a conflicting GETATTR in * the same COMPOUND. COMPOUND execution will always stop at the * GETATTR and the DELEGRETURN will never get executed. The server * eventually revokes the delegation, which can result in loss of * open or lock state. */ static void nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *currentfh) { bool deleg_ts = open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; struct nfs4_openowner *oo = openowner(stp->st_stateowner); struct nfs4_client *clp = stp->st_stid.sc_client; struct svc_fh *parent = NULL; struct nfs4_delegation *dp; struct kstat stat; int status = 0; int cb_up; cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); open->op_recall = false; switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_PREVIOUS: if (!cb_up) open->op_recall = true; break; case NFS4_OPEN_CLAIM_NULL: parent = currentfh; fallthrough; case NFS4_OPEN_CLAIM_FH: /* * Let's not give out any delegations till everyone's * had the chance to reclaim theirs, *and* until * NLM locks have all been reclaimed: */ if (locks_in_grace(clp->net)) goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out_no_deleg; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE && !clp->cl_minorversion) goto out_no_deleg; break; default: goto out_no_deleg; } dp = nfs4_set_delegation(open, stp, parent); if (IS_ERR(dp)) goto out_no_deleg; memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { if (!nfs4_delegation_stat(dp, currentfh, &stat)) { nfs4_put_stid(&dp->dl_stid); destroy_delegation(dp); goto out_no_deleg; } open->op_delegate_type = deleg_ts ? OPEN_DELEGATE_WRITE_ATTRS_DELEG : OPEN_DELEGATE_WRITE; dp->dl_cb_fattr.ncf_cur_fsize = stat.size; dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat); trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); } else { open->op_delegate_type = deleg_ts ? OPEN_DELEGATE_READ_ATTRS_DELEG : OPEN_DELEGATE_READ; trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); } nfs4_put_stid(&dp->dl_stid); return; out_no_deleg: open->op_delegate_type = OPEN_DELEGATE_NONE; if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && open->op_delegate_type != OPEN_DELEGATE_NONE) { dprintk("NFSD: WARNING: refusing delegation reclaim\n"); open->op_recall = true; } /* 4.1 client asking for a delegation? */ if (open->op_deleg_want) nfsd4_open_deleg_none_ext(open, status); return; } static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, struct nfs4_delegation *dp) { if (deleg_is_write(dp->dl_type)) { if (open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_READ_DELEG) { open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE; } else if (open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG) { open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE; } } /* Otherwise the client must be confused wanting a delegation * it already has, therefore we don't return * OPEN_DELEGATE_NONE_EXT and reason. */ } /* Are we returning only a delegation stateid? */ static bool open_xor_delegation(struct nfsd4_open *open) { if (!(open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION)) return false; /* Did we actually get a delegation? */ if (!deleg_is_read(open->op_delegate_type) && !deleg_is_write(open->op_delegate_type)) return false; return true; } /** * nfsd4_process_open2 - finish open processing * @rqstp: the RPC transaction being executed * @current_fh: NFSv4 COMPOUND's current filehandle * @open: OPEN arguments * * If successful, (1) truncate the file if open->op_truncate was * set, (2) set open->op_stateid, (3) set open->op_delegation. * * Returns %nfs_ok on success; otherwise an nfs4stat value in * network byte order is returned. */ __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; struct nfs4_file *fp = NULL; struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; bool new_stp = false; /* * Lookup file; if found, lookup stateid and check open request, * and check for delegations in the process of being recalled. * If not found, create the nfs4_file struct */ fp = nfsd4_file_hash_insert(open->op_file, current_fh); if (unlikely(!fp)) return nfserr_jukebox; if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; stp = nfsd4_find_and_lock_existing_open(fp, open); } else { open->op_file = NULL; status = nfserr_bad_stateid; if (nfsd4_is_deleg_cur(open)) goto out; } if (!stp) { stp = init_open_stateid(fp, open); if (!stp) { status = nfserr_jukebox; goto out; } if (!open->op_stp) new_stp = true; } /* * OPEN the file, or upgrade an existing OPEN. * If truncate fails, the OPEN fails. * * stp is already locked. */ if (!new_stp) { /* Stateid was found, this is an OPEN upgrade */ status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) { mutex_unlock(&stp->st_mutex); goto out; } } else { status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true); if (status) { release_open_stateid(stp); mutex_unlock(&stp->st_mutex); goto out; } stp->st_clnt_odstate = find_or_hash_clnt_odstate(fp, open->op_odstate); if (stp->st_clnt_odstate == open->op_odstate) open->op_odstate = NULL; } nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); mutex_unlock(&stp->st_mutex); if (nfsd4_has_session(&resp->cstate)) { if (open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_NO_DELEG) { open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_WANTED; goto nodeleg; } } /* * Attempt to hand out a delegation. No error return, because the * OPEN succeeds even if we fail. */ nfs4_open_delegation(open, stp, &resp->cstate.current_fh); /* * If there is an existing open stateid, it must be updated and * returned. Only respect WANT_OPEN_XOR_DELEGATION when a new * open stateid would have to be created. */ if (new_stp && open_xor_delegation(open)) { memcpy(&open->op_stateid, &zero_stateid, sizeof(open->op_stateid)); open->op_rflags |= OPEN4_RESULT_NO_OPEN_STATEID; release_open_stateid(stp); } nodeleg: status = nfs_ok; trace_nfsd_open(&stp->st_stid.sc_stateid); out: /* 4.1 client trying to upgrade/downgrade delegation? */ if (open->op_delegate_type == OPEN_DELEGATE_NONE && dp && open->op_deleg_want) nfsd4_deleg_xgrade_none_ext(open, dp); if (fp) put_nfs4_file(fp); if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; /* * To finish the open response, we just need to set the rflags. */ open->op_rflags |= NFS4_OPEN_RESULT_LOCKTYPE_POSIX; if (nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK; else if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; if (dp) nfs4_put_stid(&dp->dl_stid); if (stp) nfs4_put_stid(&stp->st_stid); return status; } void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { if (open->op_openowner) nfs4_put_stateowner(&open->op_openowner->oo_owner); if (open->op_file) kmem_cache_free(file_slab, open->op_file); if (open->op_stp) nfs4_put_stid(&open->op_stp->st_stid); if (open->op_odstate) kmem_cache_free(odstate_slab, open->op_odstate); } __be32 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { clientid_t *clid = &u->renew; struct nfs4_client *clp; __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); trace_nfsd_clid_renew(clid); status = set_client(clid, cstate, nn); if (status) return status; clp = cstate->clp; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != NFSD4_CB_UP) return nfserr_cb_path_down; return nfs_ok; } void nfsd4_end_grace(struct nfsd_net *nn) { /* do nothing if grace period already ended */ if (nn->grace_ended) return; trace_nfsd_grace_complete(nn); nn->grace_ended = true; /* * If the server goes down again right now, an NFSv4 * client will still be allowed to reclaim after it comes back up, * even if it hasn't yet had a chance to reclaim state this time. * */ nfsd4_record_grace_done(nn); /* * At this point, NFSv4 clients can still reclaim. But if the * server crashes, any that have not yet reclaimed will be out * of luck on the next boot. * * (NFSv4.1+ clients are considered to have reclaimed once they * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to * have reclaimed after their first OPEN.) */ locks_end_grace(&nn->nfsd4_manager); /* * At this point, and once lockd and/or any other containers * exit their grace period, further reclaims will fail and * regular locking can resume. */ } /* * If we've waited a lease period but there are still clients trying to * reclaim, wait a little longer to give them a chance to finish. */ static bool clients_still_reclaiming(struct nfsd_net *nn) { time64_t double_grace_period_end = nn->boot_time + 2 * nn->nfsd4_lease; if (nn->track_reclaim_completes && atomic_read(&nn->nr_reclaim_complete) == nn->reclaim_str_hashtbl_size) return false; if (!nn->somebody_reclaimed) return false; nn->somebody_reclaimed = false; /* * If we've given them *two* lease times to reclaim, and they're * still not done, give up: */ if (ktime_get_boottime_seconds() > double_grace_period_end) return false; return true; } struct laundry_time { time64_t cutoff; time64_t new_timeo; }; static bool state_expired(struct laundry_time *lt, time64_t last_refresh) { time64_t time_remaining; if (last_refresh < lt->cutoff) return true; time_remaining = last_refresh - lt->cutoff; lt->new_timeo = min(lt->new_timeo, time_remaining); return false; } #ifdef CONFIG_NFSD_V4_2_INTER_SSC void nfsd4_ssc_init_umount_work(struct nfsd_net *nn) { spin_lock_init(&nn->nfsd_ssc_lock); INIT_LIST_HEAD(&nn->nfsd_ssc_mount_list); init_waitqueue_head(&nn->nfsd_ssc_waitq); } /* * This is called when nfsd is being shutdown, after all inter_ssc * cleanup were done, to destroy the ssc delayed unmount list. */ static void nfsd4_ssc_shutdown_umount(struct nfsd_net *nn) { struct nfsd4_ssc_umount_item *ni = NULL; struct nfsd4_ssc_umount_item *tmp; spin_lock(&nn->nfsd_ssc_lock); list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { list_del(&ni->nsui_list); spin_unlock(&nn->nfsd_ssc_lock); mntput(ni->nsui_vfsmount); kfree(ni); spin_lock(&nn->nfsd_ssc_lock); } spin_unlock(&nn->nfsd_ssc_lock); } static void nfsd4_ssc_expire_umount(struct nfsd_net *nn) { bool do_wakeup = false; struct nfsd4_ssc_umount_item *ni = NULL; struct nfsd4_ssc_umount_item *tmp; spin_lock(&nn->nfsd_ssc_lock); list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { if (time_after(jiffies, ni->nsui_expire)) { if (refcount_read(&ni->nsui_refcnt) > 1) continue; /* mark being unmount */ ni->nsui_busy = true; spin_unlock(&nn->nfsd_ssc_lock); mntput(ni->nsui_vfsmount); spin_lock(&nn->nfsd_ssc_lock); /* waiters need to start from begin of list */ list_del(&ni->nsui_list); kfree(ni); /* wakeup ssc_connect waiters */ do_wakeup = true; continue; } break; } if (do_wakeup) wake_up_all(&nn->nfsd_ssc_waitq); spin_unlock(&nn->nfsd_ssc_lock); } #endif /* Check if any lock belonging to this lockowner has any blockers */ static bool nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo) { struct file_lock_context *ctx; struct nfs4_ol_stateid *stp; struct nfs4_file *nf; list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { nf = stp->st_stid.sc_file; ctx = locks_inode_context(nf->fi_inode); if (!ctx) continue; if (locks_owner_has_blockers(ctx, lo)) return true; } return false; } static bool nfs4_anylock_blockers(struct nfs4_client *clp) { int i; struct nfs4_stateowner *so; struct nfs4_lockowner *lo; if (atomic_read(&clp->cl_delegs_in_recall)) return true; spin_lock(&clp->cl_lock); for (i = 0; i < OWNER_HASH_SIZE; i++) { list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[i], so_strhash) { if (so->so_is_open_owner) continue; lo = lockowner(so); if (nfs4_lockowner_has_blockers(lo)) { spin_unlock(&clp->cl_lock); return true; } } } spin_unlock(&clp->cl_lock); return false; } static void nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist, struct laundry_time *lt) { unsigned int maxreap, reapcnt = 0; struct list_head *pos, *next; struct nfs4_client *clp; maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ? NFSD_CLIENT_MAX_TRIM_PER_RUN : 0; INIT_LIST_HEAD(reaplist); spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (clp->cl_state == NFSD4_EXPIRABLE) goto exp_client; if (!state_expired(lt, clp->cl_time)) break; if (!atomic_read(&clp->cl_rpc_users)) { if (clp->cl_state == NFSD4_ACTIVE) atomic_inc(&nn->nfsd_courtesy_clients); clp->cl_state = NFSD4_COURTESY; } if (!client_has_state(clp)) goto exp_client; if (!nfs4_anylock_blockers(clp)) if (reapcnt >= maxreap) continue; exp_client: if (!mark_client_expired_locked(clp)) { list_add(&clp->cl_lru, reaplist); reapcnt++; } } spin_unlock(&nn->client_lock); } static void nfs4_get_courtesy_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist) { unsigned int maxreap = 0, reapcnt = 0; struct list_head *pos, *next; struct nfs4_client *clp; maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN; INIT_LIST_HEAD(reaplist); spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (clp->cl_state == NFSD4_ACTIVE) break; if (reapcnt >= maxreap) break; if (!mark_client_expired_locked(clp)) { list_add(&clp->cl_lru, reaplist); reapcnt++; } } spin_unlock(&nn->client_lock); } static void nfs4_process_client_reaplist(struct list_head *reaplist) { struct list_head *pos, *next; struct nfs4_client *clp; list_for_each_safe(pos, next, reaplist) { clp = list_entry(pos, struct nfs4_client, cl_lru); trace_nfsd_clid_purged(&clp->cl_clientid); list_del_init(&clp->cl_lru); expire_client(clp); } } static void nfs40_clean_admin_revoked(struct nfsd_net *nn, struct laundry_time *lt) { struct nfs4_client *clp; spin_lock(&nn->client_lock); if (nn->nfs40_last_revoke == 0 || nn->nfs40_last_revoke > lt->cutoff) { spin_unlock(&nn->client_lock); return; } nn->nfs40_last_revoke = 0; retry: list_for_each_entry(clp, &nn->client_lru, cl_lru) { unsigned long id, tmp; struct nfs4_stid *stid; if (atomic_read(&clp->cl_admin_revoked) == 0) continue; spin_lock(&clp->cl_lock); idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { refcount_inc(&stid->sc_count); spin_unlock(&nn->client_lock); /* this function drops ->cl_lock */ nfsd4_drop_revoked_stid(stid); nfs4_put_stid(stid); spin_lock(&nn->client_lock); goto retry; } spin_unlock(&clp->cl_lock); } spin_unlock(&nn->client_lock); } static time64_t nfs4_laundromat(struct nfsd_net *nn) { struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct nfs4_ol_stateid *stp; struct nfsd4_blocked_lock *nbl; struct list_head *pos, *next, reaplist; struct laundry_time lt = { .cutoff = ktime_get_boottime_seconds() - nn->nfsd4_lease, .new_timeo = nn->nfsd4_lease }; struct nfs4_cpntf_state *cps; copy_stateid_t *cps_t; int i; if (clients_still_reclaiming(nn)) { lt.new_timeo = 0; goto out; } nfsd4_end_grace(nn); spin_lock(&nn->s2s_cp_lock); idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) { cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid); if (cps->cp_stateid.cs_type == NFS4_COPYNOTIFY_STID && state_expired(&lt, cps->cpntf_time)) _free_cpntf_state_locked(nn, cps); } spin_unlock(&nn->s2s_cp_lock); nfsd4_async_copy_reaper(nn); nfs4_get_client_reaplist(nn, &reaplist, &lt); nfs4_process_client_reaplist(&reaplist); nfs40_clean_admin_revoked(nn, &lt); spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(&lt, dp->dl_time)) break; refcount_inc(&dp->dl_stid.sc_count); unhash_delegation_locked(dp, SC_STATUS_REVOKED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); while (!list_empty(&reaplist)) { dp = list_first_entry(&reaplist, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); revoke_delegation(dp); } spin_lock(&nn->client_lock); while (!list_empty(&nn->close_lru)) { oo = list_first_entry(&nn->close_lru, struct nfs4_openowner, oo_close_lru); if (!state_expired(&lt, oo->oo_time)) break; list_del_init(&oo->oo_close_lru); stp = oo->oo_last_closed_stid; oo->oo_last_closed_stid = NULL; spin_unlock(&nn->client_lock); nfs4_put_stid(&stp->st_stid); spin_lock(&nn->client_lock); } spin_unlock(&nn->client_lock); /* * It's possible for a client to try and acquire an already held lock * that is being held for a long time, and then lose interest in it. * So, we clean out any un-revisited request after a lease period * under the assumption that the client is no longer interested. * * RFC5661, sec. 9.6 states that the client must not rely on getting * notifications and must continue to poll for locks, even when the * server supports them. Thus this shouldn't lead to clients blocking * indefinitely once the lock does become free. */ BUG_ON(!list_empty(&reaplist)); spin_lock(&nn->blocked_locks_lock); while (!list_empty(&nn->blocked_locks_lru)) { nbl = list_first_entry(&nn->blocked_locks_lru, struct nfsd4_blocked_lock, nbl_lru); if (!state_expired(&lt, nbl->nbl_time)) break; list_move(&nbl->nbl_lru, &reaplist); list_del_init(&nbl->nbl_list); } spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); free_blocked_lock(nbl); } #ifdef CONFIG_NFSD_V4_2_INTER_SSC /* service the server-to-server copy delayed unmount list */ nfsd4_ssc_expire_umount(nn); #endif if (atomic_long_read(&num_delegations) >= max_delegations) deleg_reaper(nn); out: return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); } static void laundromat_main(struct work_struct *); static void laundromat_main(struct work_struct *laundry) { time64_t t; struct delayed_work *dwork = to_delayed_work(laundry); struct nfsd_net *nn = container_of(dwork, struct nfsd_net, laundromat_work); t = nfs4_laundromat(nn); queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ); } static void courtesy_client_reaper(struct nfsd_net *nn) { struct list_head reaplist; nfs4_get_courtesy_client_reaplist(nn, &reaplist); nfs4_process_client_reaplist(&reaplist); } static void deleg_reaper(struct nfsd_net *nn) { struct list_head *pos, *next; struct nfs4_client *clp; LIST_HEAD(cblist); spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (clp->cl_state != NFSD4_ACTIVE || list_empty(&clp->cl_delegations) || atomic_read(&clp->cl_delegs_in_recall) || test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) || (ktime_get_boottime_seconds() - clp->cl_ra_time < 5)) { continue; } list_add(&clp->cl_ra_cblist, &cblist); /* release in nfsd4_cb_recall_any_release */ kref_get(&clp->cl_nfsdfs.cl_ref); set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); clp->cl_ra_time = ktime_get_boottime_seconds(); } spin_unlock(&nn->client_lock); while (!list_empty(&cblist)) { clp = list_first_entry(&cblist, struct nfs4_client, cl_ra_cblist); list_del_init(&clp->cl_ra_cblist); clp->cl_ra->ra_keep = 0; clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) | BIT(RCA4_TYPE_MASK_WDATA_DLG); trace_nfsd_cb_recall_any(clp->cl_ra); nfsd4_run_cb(&clp->cl_ra->ra_cb); } } static void nfsd4_state_shrinker_worker(struct work_struct *work) { struct nfsd_net *nn = container_of(work, struct nfsd_net, nfsd_shrinker_work); courtesy_client_reaper(nn); deleg_reaper(nn); } static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) { if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) return nfserr_bad_stateid; return nfs_ok; } static __be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags) { __be32 status = nfserr_openmode; /* For lock stateid's, we test the parent open, not the lock: */ if (stp->st_openstp) stp = stp->st_openstp; if ((flags & WR_STATE) && !access_permit_write(stp)) goto out; if ((flags & RD_STATE) && !access_permit_read(stp)) goto out; status = nfs_ok; out: return status; } static inline __be32 check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, int flags) { if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; else if (opens_in_grace(net)) { /* Answer in remaining cases depends on existence of * conflicting state; so we must wait out the grace period. */ return nfserr_grace; } else if (flags & WR_STATE) return nfs4_share_conflict(current_fh, NFS4_SHARE_DENY_WRITE); else /* (flags & RD_STATE) && ZERO_STATEID(stateid) */ return nfs4_share_conflict(current_fh, NFS4_SHARE_DENY_READ); } static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* * When sessions are used the stateid generation number is ignored * when it is zero. */ if (has_session && in->si_generation == 0) return nfs_ok; if (in->si_generation == ref->si_generation) return nfs_ok; /* If the client sends us a stateid from the future, it's buggy: */ if (nfsd4_stateid_generation_after(in, ref)) return nfserr_bad_stateid; /* * However, we could see a stateid from the past, even from a * non-buggy client. For example, if the client sends a lock * while some IO is outstanding, the lock may bump si_generation * while the IO is still in flight. The client could avoid that * situation by waiting for responses on all the IO requests, * but better performance may result in retrying IO that * receives an old_stateid error if requests are rarely * reordered in flight: */ return nfserr_old_stateid; } static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_stid *s, bool has_session) { __be32 ret; spin_lock(&s->sc_lock); ret = nfsd4_verify_open_stid(s); if (ret == nfs_ok) ret = check_stateid_generation(in, &s->sc_stateid, has_session); spin_unlock(&s->sc_lock); if (ret == nfserr_admin_revoked) nfsd40_drop_revoked_stid(s->sc_client, &s->sc_stateid); return ret; } static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols) { if (ols->st_stateowner->so_is_open_owner && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) return nfserr_bad_stateid; return nfs_ok; } static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { struct nfs4_stid *s; __be32 status = nfserr_bad_stateid; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return status; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); if (!s) goto out_unlock; status = nfsd4_stid_check_stateid_generation(stateid, s, 1); if (status) goto out_unlock; status = nfsd4_verify_open_stid(s); if (status) goto out_unlock; switch (s->sc_type) { case SC_TYPE_DELEG: status = nfs_ok; break; case SC_TYPE_OPEN: case SC_TYPE_LOCK: status = nfsd4_check_openowner_confirmed(openlockstateid(s)); break; default: printk("unknown stateid type %x\n", s->sc_type); status = nfserr_bad_stateid; } out_unlock: spin_unlock(&cl->cl_lock); if (status == nfserr_admin_revoked) nfsd40_drop_revoked_stid(cl, stateid); return status; } __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned short typemask, unsigned short statusmask, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; struct nfs4_stid *stid; bool return_revoked = false; /* * only return revoked delegations if explicitly asked. * otherwise we report revoked or bad_stateid status. */ if (statusmask & SC_STATUS_REVOKED) return_revoked = true; if (typemask & SC_TYPE_DELEG) /* Always allow REVOKED for DELEG so we can * retturn the appropriate error. */ statusmask |= SC_STATUS_REVOKED; statusmask |= SC_STATUS_ADMIN_REVOKED; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return nfserr_bad_stateid; status = set_client(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) { if (cstate->session) return nfserr_bad_stateid; return nfserr_stale_stateid; } if (status) return status; stid = find_stateid_by_type(cstate->clp, stateid, typemask, statusmask); if (!stid) return nfserr_bad_stateid; if ((stid->sc_status & SC_STATUS_REVOKED) && !return_revoked) { nfs4_put_stid(stid); return nfserr_deleg_revoked; } if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { nfsd40_drop_revoked_stid(cstate->clp, stateid); nfs4_put_stid(stid); return nfserr_admin_revoked; } *s = stid; return nfs_ok; } static struct nfsd_file * nfs4_find_file(struct nfs4_stid *s, int flags) { struct nfsd_file *ret = NULL; if (!s || s->sc_status) return NULL; switch (s->sc_type) { case SC_TYPE_DELEG: spin_lock(&s->sc_file->fi_lock); ret = nfsd_file_get(s->sc_file->fi_deleg_file); spin_unlock(&s->sc_file->fi_lock); break; case SC_TYPE_OPEN: case SC_TYPE_LOCK: if (flags & RD_STATE) ret = find_readable_file(s->sc_file); else ret = find_writeable_file(s->sc_file); } return ret; } static __be32 nfs4_check_olstateid(struct nfs4_ol_stateid *ols, int flags) { __be32 status; status = nfsd4_check_openowner_confirmed(ols); if (status) return status; return nfs4_check_openmode(ols, flags); } static __be32 nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, struct nfsd_file **nfp, int flags) { int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE; struct nfsd_file *nf; __be32 status; nf = nfs4_find_file(s, flags); if (nf) { status = nfsd_permission(&rqstp->rq_cred, fhp->fh_export, fhp->fh_dentry, acc | NFSD_MAY_OWNER_OVERRIDE); if (status) { nfsd_file_put(nf); goto out; } } else { status = nfsd_file_acquire(rqstp, fhp, acc, &nf); if (status) return status; } *nfp = nf; out: return status; } static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps) { WARN_ON_ONCE(cps->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID); if (!refcount_dec_and_test(&cps->cp_stateid.cs_count)) return; list_del(&cps->cp_list); idr_remove(&nn->s2s_cp_stateids, cps->cp_stateid.cs_stid.si_opaque.so_id); kfree(cps); } /* * A READ from an inter server to server COPY will have a * copy stateid. Look up the copy notify stateid from the * idr structure and take a reference on it. */ __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st, struct nfs4_client *clp, struct nfs4_cpntf_state **cps) { copy_stateid_t *cps_t; struct nfs4_cpntf_state *state = NULL; if (st->si_opaque.so_clid.cl_id != nn->s2s_cp_cl_id) return nfserr_bad_stateid; spin_lock(&nn->s2s_cp_lock); cps_t = idr_find(&nn->s2s_cp_stateids, st->si_opaque.so_id); if (cps_t) { state = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid); if (state->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID) { state = NULL; goto unlock; } if (!clp) refcount_inc(&state->cp_stateid.cs_count); else _free_cpntf_state_locked(nn, state); } unlock: spin_unlock(&nn->s2s_cp_lock); if (!state) return nfserr_bad_stateid; if (!clp) *cps = state; return 0; } static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st, struct nfs4_stid **stid) { __be32 status; struct nfs4_cpntf_state *cps = NULL; struct nfs4_client *found; status = manage_cpntf_state(nn, st, NULL, &cps); if (status) return status; cps->cpntf_time = ktime_get_boottime_seconds(); status = nfserr_expired; found = lookup_clientid(&cps->cp_p_clid, true, nn); if (!found) goto out; *stid = find_stateid_by_type(found, &cps->cp_p_stateid, SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, 0); if (*stid) status = nfs_ok; else status = nfserr_bad_stateid; put_client_renew(found); out: nfs4_put_cpntf_state(nn, cps); return status; } void nfs4_put_cpntf_state(struct nfsd_net *nn, struct nfs4_cpntf_state *cps) { spin_lock(&nn->s2s_cp_lock); _free_cpntf_state_locked(nn, cps); spin_unlock(&nn->s2s_cp_lock); } /** * nfs4_preprocess_stateid_op - find and prep stateid for an operation * @rqstp: incoming request from client * @cstate: current compound state * @fhp: filehandle associated with requested stateid * @stateid: stateid (provided by client) * @flags: flags describing type of operation to be done * @nfp: optional nfsd_file return pointer (may be NULL) * @cstid: optional returned nfs4_stid pointer (may be NULL) * * Given info from the client, look up a nfs4_stid for the operation. On * success, it returns a reference to the nfs4_stid and/or the nfsd_file * associated with it. */ __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, stateid_t *stateid, int flags, struct nfsd_file **nfp, struct nfs4_stid **cstid) { struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct nfs4_stid *s = NULL; __be32 status; if (nfp) *nfp = NULL; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { status = check_special_stateids(net, fhp, stateid, flags); goto done; } status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, 0, &s, nn); if (status == nfserr_bad_stateid) status = find_cpntf_state(nn, stateid, &s); if (status) return status; status = nfsd4_stid_check_stateid_generation(stateid, s, nfsd4_has_session(cstate)); if (status) goto out; switch (s->sc_type) { case SC_TYPE_DELEG: status = nfs4_check_delegmode(delegstateid(s), flags); break; case SC_TYPE_OPEN: case SC_TYPE_LOCK: status = nfs4_check_olstateid(openlockstateid(s), flags); break; } if (status) goto out; status = nfs4_check_fh(fhp, s); done: if (status == nfs_ok && nfp) status = nfs4_check_file(rqstp, fhp, s, nfp, flags); out: if (s) { if (!status && cstid) *cstid = s; else nfs4_put_stid(s); } return status; } /* * Test if the stateid is valid */ __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct nfsd4_test_stateid_id *stateid; struct nfs4_client *cl = cstate->clp; list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) stateid->ts_id_status = nfsd4_validate_stateid(cl, &stateid->ts_id_stateid); return nfs_ok; } static __be32 nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) { struct nfs4_ol_stateid *stp = openlockstateid(s); __be32 ret; ret = nfsd4_lock_ol_stateid(stp); if (ret) goto out_put_stid; ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) goto out; ret = nfserr_locks_held; if (check_for_locks(stp->st_stid.sc_file, lockowner(stp->st_stateowner))) goto out; release_lock_stateid(stp); ret = nfs_ok; out: mutex_unlock(&stp->st_mutex); out_put_stid: nfs4_put_stid(s); return ret; } __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_free_stateid *free_stateid = &u->free_stateid; stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; struct nfs4_delegation *dp; struct nfs4_client *cl = cstate->clp; __be32 ret = nfserr_bad_stateid; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); if (!s || s->sc_status & SC_STATUS_CLOSED) goto out_unlock; if (s->sc_status & SC_STATUS_ADMIN_REVOKED) { nfsd4_drop_revoked_stid(s); ret = nfs_ok; goto out; } spin_lock(&s->sc_lock); switch (s->sc_type) { case SC_TYPE_DELEG: if (s->sc_status & SC_STATUS_REVOKED) { s->sc_status |= SC_STATUS_CLOSED; spin_unlock(&s->sc_lock); dp = delegstateid(s); if (s->sc_status & SC_STATUS_FREEABLE) list_del_init(&dp->dl_recall_lru); s->sc_status |= SC_STATUS_FREED; spin_unlock(&cl->cl_lock); nfs4_put_stid(s); ret = nfs_ok; goto out; } ret = nfserr_locks_held; break; case SC_TYPE_OPEN: ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) break; ret = nfserr_locks_held; break; case SC_TYPE_LOCK: spin_unlock(&s->sc_lock); refcount_inc(&s->sc_count); spin_unlock(&cl->cl_lock); ret = nfsd4_free_lock_stateid(stateid, s); goto out; } spin_unlock(&s->sc_lock); out_unlock: spin_unlock(&cl->cl_lock); out: return ret; } static inline int setlkflg (int type) { return (type == NFS4_READW_LT || type == NFS4_READ_LT) ? RD_STATE : WR_STATE; } static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp) { struct svc_fh *current_fh = &cstate->current_fh; struct nfs4_stateowner *sop = stp->st_stateowner; __be32 status; status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; status = nfsd4_lock_ol_stateid(stp); if (status != nfs_ok) return status; status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status == nfs_ok) status = nfs4_check_fh(current_fh, &stp->st_stid); if (status != nfs_ok) mutex_unlock(&stp->st_mutex); return status; } /** * nfs4_preprocess_seqid_op - find and prep an ol_stateid for a seqid-morphing op * @cstate: compund state * @seqid: seqid (provided by client) * @stateid: stateid (provided by client) * @typemask: mask of allowable types for this operation * @statusmask: mask of allowed states: 0 or STID_CLOSED * @stpp: return pointer for the stateid found * @nn: net namespace for request * * Given a stateid+seqid from a client, look up an nfs4_ol_stateid and * return it in @stpp. On a nfs_ok return, the returned stateid will * have its st_mutex locked. */ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, unsigned short typemask, unsigned short statusmask, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn) { __be32 status; struct nfs4_stid *s; struct nfs4_ol_stateid *stp = NULL; trace_nfsd_preprocess(seqid, stateid); *stpp = NULL; retry: status = nfsd4_lookup_stateid(cstate, stateid, typemask, statusmask, &s, nn); if (status) return status; stp = openlockstateid(s); if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) { nfs4_put_stateowner(stp->st_stateowner); goto retry; } status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp); if (!status) *stpp = stp; else nfs4_put_stid(&stp->st_stid); return status; } static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn) { __be32 status; struct nfs4_openowner *oo; struct nfs4_ol_stateid *stp; status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, SC_TYPE_OPEN, 0, &stp, nn); if (status) return status; oo = openowner(stp->st_stateowner); if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); return nfserr_bad_stateid; } *stpp = stp; return nfs_ok; } __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_open_confirm *oc = &u->open_confirm; __be32 status; struct nfs4_openowner *oo; struct nfs4_ol_stateid *stp; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); dprintk("NFSD: nfsd4_open_confirm on file %pd\n", cstate->current_fh.fh_dentry); status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); if (status) return status; status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, SC_TYPE_OPEN, 0, &stp, nn); if (status) goto out; oo = openowner(stp->st_stateowner); status = nfserr_bad_stateid; if (oo->oo_flags & NFS4_OO_CONFIRMED) { mutex_unlock(&stp->st_mutex); goto put_stateid; } oo->oo_flags |= NFS4_OO_CONFIRMED; nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid); mutex_unlock(&stp->st_mutex); trace_nfsd_open_confirm(oc->oc_seqid, &stp->st_stid.sc_stateid); nfsd4_client_record_create(oo->oo_owner.so_client); status = nfs_ok; put_stateid: nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); return status; } static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access) { if (!test_access(access, stp)) return; nfs4_file_put_access(stp->st_stid.sc_file, access); clear_access(access, stp); } static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access) { switch (to_access) { case NFS4_SHARE_ACCESS_READ: nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE); nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); break; case NFS4_SHARE_ACCESS_WRITE: nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ); nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); break; case NFS4_SHARE_ACCESS_BOTH: break; default: WARN_ON_ONCE(1); } } __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_open_downgrade *od = &u->open_downgrade; __be32 status; struct nfs4_ol_stateid *stp; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); dprintk("NFSD: nfsd4_open_downgrade on file %pd\n", cstate->current_fh.fh_dentry); /* We don't yet support WANT bits: */ if (od->od_deleg_want) dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__, od->od_deleg_want); status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, &od->od_stateid, &stp, nn); if (status) goto out; status = nfserr_inval; if (!test_access(od->od_share_access, stp)) { dprintk("NFSD: access not a subset of current bitmap: 0x%hhx, input access=%08x\n", stp->st_access_bmap, od->od_share_access); goto put_stateid; } if (!test_deny(od->od_share_deny, stp)) { dprintk("NFSD: deny not a subset of current bitmap: 0x%hhx, input deny=%08x\n", stp->st_deny_bmap, od->od_share_deny); goto put_stateid; } nfs4_stateid_downgrade(stp, od->od_share_access); reset_union_bmap_deny(od->od_share_deny, stp); nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid); status = nfs_ok; put_stateid: mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); return status; } static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) { struct nfs4_client *clp = s->st_stid.sc_client; bool unhashed; LIST_HEAD(reaplist); struct nfs4_ol_stateid *stp; spin_lock(&clp->cl_lock); unhashed = unhash_open_stateid(s, &reaplist); if (clp->cl_minorversion) { if (unhashed) put_ol_stateid_locked(s, &reaplist); spin_unlock(&clp->cl_lock); list_for_each_entry(stp, &reaplist, st_locks) nfs4_free_cpntf_statelist(clp->net, &stp->st_stid); free_ol_stateid_reaplist(&reaplist); return false; } else { spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); return unhashed; } } /* * nfs4_unlock_state() called after encode */ __be32 nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_close *close = &u->close; __be32 status; struct nfs4_ol_stateid *stp; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); bool need_move_to_close_list; dprintk("NFSD: nfsd4_close on file %pd\n", cstate->current_fh.fh_dentry); status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, &close->cl_stateid, SC_TYPE_OPEN, SC_STATUS_CLOSED, &stp, nn); nfsd4_bump_seqid(cstate, status); if (status) goto out; spin_lock(&stp->st_stid.sc_client->cl_lock); stp->st_stid.sc_status |= SC_STATUS_CLOSED; spin_unlock(&stp->st_stid.sc_client->cl_lock); /* * Technically we don't _really_ have to increment or copy it, since * it should just be gone after this operation and we clobber the * copied value below, but we continue to do so here just to ensure * that racing ops see that there was a state change. */ nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); need_move_to_close_list = nfsd4_close_open_stateid(stp); mutex_unlock(&stp->st_mutex); if (need_move_to_close_list) move_to_close_lru(stp, net); /* v4.1+ suggests that we send a special stateid in here, since the * clients should just ignore this anyway. Since this is not useful * for v4.0 clients either, we set it to the special close_stateid * universally. * * See RFC5661 section 18.2.4, and RFC7530 section 16.2.5 */ memcpy(&close->cl_stateid, &close_stateid, sizeof(close->cl_stateid)); /* put reference from nfs4_preprocess_seqid_op */ nfs4_put_stid(&stp->st_stid); out: return status; } __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_delegreturn *dr = &u->delegreturn; struct nfs4_delegation *dp; stateid_t *stateid = &dr->dr_stateid; struct nfs4_stid *s; __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, SC_STATUS_REVOKED | SC_STATUS_FREEABLE, &s, nn); if (status) goto out; dp = delegstateid(s); status = nfsd4_stid_check_stateid_generation(stateid, &dp->dl_stid, nfsd4_has_session(cstate)); if (status) goto put_stateid; trace_nfsd_deleg_return(stateid); destroy_delegation(dp); smp_mb__after_atomic(); wake_up_var(d_inode(cstate->current_fh.fh_dentry)); put_stateid: nfs4_put_stid(&dp->dl_stid); out: return status; } /* last octet in a range */ static inline u64 last_byte_offset(u64 start, u64 len) { u64 end; WARN_ON_ONCE(!len); end = start + len; return end > start ? end - 1: NFS4_MAX_UINT64; } /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that * we can't properly handle lock requests that go beyond the (2^63 - 1)-th * byte, because of sign extension problems. Since NFSv4 calls for 64-bit * locking, this prevents us from being completely protocol-compliant. The * real solution to this problem is to start using unsigned file offsets in * the VFS, but this is a very deep change! */ static inline void nfs4_transform_lock_offset(struct file_lock *lock) { if (lock->fl_start < 0) lock->fl_start = OFFSET_MAX; if (lock->fl_end < 0) lock->fl_end = OFFSET_MAX; } static fl_owner_t nfsd4_lm_get_owner(fl_owner_t owner) { struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; nfs4_get_stateowner(&lo->lo_owner); return owner; } static void nfsd4_lm_put_owner(fl_owner_t owner) { struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; if (lo) nfs4_put_stateowner(&lo->lo_owner); } /* return pointer to struct nfs4_client if client is expirable */ static bool nfsd4_lm_lock_expirable(struct file_lock *cfl) { struct nfs4_lockowner *lo = (struct nfs4_lockowner *) cfl->c.flc_owner; struct nfs4_client *clp = lo->lo_owner.so_client; struct nfsd_net *nn; if (try_to_expire_client(clp)) { nn = net_generic(clp->net, nfsd_net_id); mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); return true; } return false; } /* schedule laundromat to run immediately and wait for it to complete */ static void nfsd4_lm_expire_lock(void) { flush_workqueue(laundry_wq); } static void nfsd4_lm_notify(struct file_lock *fl) { struct nfs4_lockowner *lo = (struct nfs4_lockowner *) fl->c.flc_owner; struct net *net = lo->lo_owner.so_client->net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct nfsd4_blocked_lock *nbl = container_of(fl, struct nfsd4_blocked_lock, nbl_lock); bool queue = false; /* An empty list means that something else is going to be using it */ spin_lock(&nn->blocked_locks_lock); if (!list_empty(&nbl->nbl_list)) { list_del_init(&nbl->nbl_list); list_del_init(&nbl->nbl_lru); queue = true; } spin_unlock(&nn->blocked_locks_lock); if (queue) { trace_nfsd_cb_notify_lock(lo, nbl); nfsd4_run_cb(&nbl->nbl_cb); } } static const struct lock_manager_operations nfsd_posix_mng_ops = { .lm_mod_owner = THIS_MODULE, .lm_notify = nfsd4_lm_notify, .lm_get_owner = nfsd4_lm_get_owner, .lm_put_owner = nfsd4_lm_put_owner, .lm_lock_expirable = nfsd4_lm_lock_expirable, .lm_expire_lock = nfsd4_lm_expire_lock, }; static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { struct nfs4_lockowner *lo; if (fl->fl_lmops == &nfsd_posix_mng_ops) { lo = (struct nfs4_lockowner *) fl->c.flc_owner; xdr_netobj_dup(&deny->ld_owner, &lo->lo_owner.so_owner, GFP_KERNEL); if (!deny->ld_owner.data) /* We just don't care that much */ goto nevermind; deny->ld_clientid = lo->lo_owner.so_client->cl_clientid; } else { nevermind: deny->ld_owner.len = 0; deny->ld_owner.data = NULL; deny->ld_clientid.cl_boot = 0; deny->ld_clientid.cl_id = 0; } deny->ld_start = fl->fl_start; deny->ld_length = NFS4_MAX_UINT64; if (fl->fl_end != NFS4_MAX_UINT64) deny->ld_length = fl->fl_end - fl->fl_start + 1; deny->ld_type = NFS4_READ_LT; if (fl->c.flc_type != F_RDLCK) deny->ld_type = NFS4_WRITE_LT; } static struct nfs4_lockowner * find_lockowner_str_locked(struct nfs4_client *clp, struct xdr_netobj *owner) { unsigned int strhashval = ownerstr_hashval(owner); struct nfs4_stateowner *so; lockdep_assert_held(&clp->cl_lock); list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[strhashval], so_strhash) { if (so->so_is_open_owner) continue; if (same_owner_str(so, owner)) return lockowner(nfs4_get_stateowner(so)); } return NULL; } static struct nfs4_lockowner * find_lockowner_str(struct nfs4_client *clp, struct xdr_netobj *owner) { struct nfs4_lockowner *lo; spin_lock(&clp->cl_lock); lo = find_lockowner_str_locked(clp, owner); spin_unlock(&clp->cl_lock); return lo; } static void nfs4_unhash_lockowner(struct nfs4_stateowner *sop) { unhash_lockowner_locked(lockowner(sop)); } static void nfs4_free_lockowner(struct nfs4_stateowner *sop) { struct nfs4_lockowner *lo = lockowner(sop); kmem_cache_free(lockowner_slab, lo); } static const struct nfs4_stateowner_operations lockowner_ops = { .so_unhash = nfs4_unhash_lockowner, .so_free = nfs4_free_lockowner, }; /* * Alloc a lock owner structure. * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has * occurred. * * strhashval = ownerstr_hashval */ static struct nfs4_lockowner * alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) { struct nfs4_lockowner *lo, *ret; lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); if (!lo) return NULL; INIT_LIST_HEAD(&lo->lo_blocked); INIT_LIST_HEAD(&lo->lo_owner.so_stateids); lo->lo_owner.so_is_open_owner = 0; lo->lo_owner.so_seqid = lock->lk_new_lock_seqid; lo->lo_owner.so_ops = &lockowner_ops; spin_lock(&clp->cl_lock); ret = find_lockowner_str_locked(clp, &lock->lk_new_owner); if (ret == NULL) { list_add(&lo->lo_owner.so_strhash, &clp->cl_ownerstr_hashtbl[strhashval]); ret = lo; } else nfs4_free_stateowner(&lo->lo_owner); spin_unlock(&clp->cl_lock); return ret; } static struct nfs4_ol_stateid * find_lock_stateid(const struct nfs4_lockowner *lo, const struct nfs4_ol_stateid *ost) { struct nfs4_ol_stateid *lst; lockdep_assert_held(&ost->st_stid.sc_client->cl_lock); /* If ost is not hashed, ost->st_locks will not be valid */ if (!nfs4_ol_stateid_unhashed(ost)) list_for_each_entry(lst, &ost->st_locks, st_locks) { if (lst->st_stateowner == &lo->lo_owner) { refcount_inc(&lst->st_stid.sc_count); return lst; } } return NULL; } static struct nfs4_ol_stateid * init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, struct nfs4_file *fp, struct inode *inode, struct nfs4_ol_stateid *open_stp) { struct nfs4_client *clp = lo->lo_owner.so_client; struct nfs4_ol_stateid *retstp; mutex_init(&stp->st_mutex); mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); retry: spin_lock(&clp->cl_lock); if (nfs4_ol_stateid_unhashed(open_stp)) goto out_close; retstp = find_lock_stateid(lo, open_stp); if (retstp) goto out_found; refcount_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = SC_TYPE_LOCK; stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; spin_lock(&fp->fi_lock); list_add(&stp->st_locks, &open_stp->st_locks); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); spin_unlock(&fp->fi_lock); spin_unlock(&clp->cl_lock); return stp; out_found: spin_unlock(&clp->cl_lock); if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { nfs4_put_stid(&retstp->st_stid); goto retry; } /* To keep mutex tracking happy */ mutex_unlock(&stp->st_mutex); return retstp; out_close: spin_unlock(&clp->cl_lock); mutex_unlock(&stp->st_mutex); return NULL; } static struct nfs4_ol_stateid * find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, struct inode *inode, struct nfs4_ol_stateid *ost, bool *new) { struct nfs4_stid *ns = NULL; struct nfs4_ol_stateid *lst; struct nfs4_openowner *oo = openowner(ost->st_stateowner); struct nfs4_client *clp = oo->oo_owner.so_client; *new = false; spin_lock(&clp->cl_lock); lst = find_lock_stateid(lo, ost); spin_unlock(&clp->cl_lock); if (lst != NULL) { if (nfsd4_lock_ol_stateid(lst) == nfs_ok) goto out; nfs4_put_stid(&lst->st_stid); } ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); if (ns == NULL) return NULL; lst = init_lock_stateid(openlockstateid(ns), lo, fi, inode, ost); if (lst == openlockstateid(ns)) *new = true; else nfs4_put_stid(ns); out: return lst; } static int check_lock_length(u64 offset, u64 length) { return ((length == 0) || ((length != NFS4_MAX_UINT64) && (length > ~offset))); } static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) { struct nfs4_file *fp = lock_stp->st_stid.sc_file; lockdep_assert_held(&fp->fi_lock); if (test_access(access, lock_stp)) return; __nfs4_file_get_access(fp, access); set_access(access, lock_stp); } static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **plst, bool *new) { __be32 status; struct nfs4_file *fi = ost->st_stid.sc_file; struct nfs4_openowner *oo = openowner(ost->st_stateowner); struct nfs4_client *cl = oo->oo_owner.so_client; struct inode *inode = d_inode(cstate->current_fh.fh_dentry); struct nfs4_lockowner *lo; struct nfs4_ol_stateid *lst; unsigned int strhashval; lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { strhashval = ownerstr_hashval(&lock->lk_new_owner); lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); if (lo == NULL) return nfserr_jukebox; } else { /* with an existing lockowner, seqids must be the same */ status = nfserr_bad_seqid; if (!cstate->minorversion && lock->lk_new_lock_seqid != lo->lo_owner.so_seqid) goto out; } lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); if (lst == NULL) { status = nfserr_jukebox; goto out; } status = nfs_ok; *plst = lst; out: nfs4_put_stateowner(&lo->lo_owner); return status; } /* * LOCK operation */ __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_lock *lock = &u->lock; struct nfs4_openowner *open_sop = NULL; struct nfs4_lockowner *lock_sop = NULL; struct nfs4_ol_stateid *lock_stp = NULL; struct nfs4_ol_stateid *open_stp = NULL; struct nfs4_file *fp; struct nfsd_file *nf = NULL; struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; __be32 status = 0; int lkflg; int err; bool new = false; unsigned char type; unsigned int flags = FL_POSIX; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", (long long) lock->lk_offset, (long long) lock->lk_length); if (check_lock_length(lock->lk_offset, lock->lk_length)) return nfserr_inval; status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); if (status != nfs_ok) return status; if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) /* See rfc 5661 18.10.3: given clientid is ignored: */ memcpy(&lock->lk_new_clientid, &cstate->clp->cl_clientid, sizeof(clientid_t)); /* validate and update open stateid and open seqid */ status = nfs4_preprocess_confirmed_seqid_op(cstate, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, &open_stp, nn); if (status) goto out; mutex_unlock(&open_stp->st_mutex); open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, &lock->lk_new_clientid)) goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new); } else { status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, SC_TYPE_LOCK, 0, &lock_stp, nn); } if (status) goto out; lock_sop = lockowner(lock_stp->st_stateowner); lkflg = setlkflg(lock->lk_type); status = nfs4_check_openmode(lock_stp, lkflg); if (status) goto out; status = nfserr_grace; if (locks_in_grace(net) && !lock->lk_reclaim) goto out; status = nfserr_no_grace; if (!locks_in_grace(net) && lock->lk_reclaim) goto out; if (lock->lk_reclaim) flags |= FL_RECLAIM; fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: fallthrough; case NFS4_READ_LT: spin_lock(&fp->fi_lock); nf = find_readable_file_locked(fp); if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); spin_unlock(&fp->fi_lock); type = F_RDLCK; break; case NFS4_WRITEW_LT: fallthrough; case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); nf = find_writeable_file_locked(fp); if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); spin_unlock(&fp->fi_lock); type = F_WRLCK; break; default: status = nfserr_inval; goto out; } if (!nf) { status = nfserr_openmode; goto out; } if (lock->lk_type & (NFS4_READW_LT | NFS4_WRITEW_LT) && nfsd4_has_session(cstate) && locks_can_async_lock(nf->nf_file->f_op)) flags |= FL_SLEEP; nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); if (!nbl) { dprintk("NFSD: %s: unable to allocate block!\n", __func__); status = nfserr_jukebox; goto out; } file_lock = &nbl->nbl_lock; file_lock->c.flc_type = type; file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); file_lock->c.flc_pid = current->tgid; file_lock->c.flc_file = nf->nf_file; file_lock->c.flc_flags = flags; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = lock->lk_offset; file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); nfs4_transform_lock_offset(file_lock); conflock = locks_alloc_lock(); if (!conflock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; goto out; } if (flags & FL_SLEEP) { nbl->nbl_time = ktime_get_boottime_seconds(); spin_lock(&nn->blocked_locks_lock); list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru); kref_get(&nbl->nbl_kref); spin_unlock(&nn->blocked_locks_lock); } err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, conflock); switch (err) { case 0: /* success! */ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); status = 0; if (lock->lk_reclaim) nn->somebody_reclaimed = true; break; case FILE_LOCK_DEFERRED: kref_put(&nbl->nbl_kref, free_nbl); nbl = NULL; fallthrough; case -EAGAIN: /* conflock holds conflicting lock */ status = nfserr_denied; dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); nfs4_set_lock_denied(conflock, &lock->lk_denied); break; case -EDEADLK: status = nfserr_deadlock; break; default: dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); status = nfserrno(err); break; } out: if (nbl) { /* dequeue it if we queued it before */ if (flags & FL_SLEEP) { spin_lock(&nn->blocked_locks_lock); if (!list_empty(&nbl->nbl_list) && !list_empty(&nbl->nbl_lru)) { list_del_init(&nbl->nbl_list); list_del_init(&nbl->nbl_lru); kref_put(&nbl->nbl_kref, free_nbl); } /* nbl can use one of lists to be linked to reaplist */ spin_unlock(&nn->blocked_locks_lock); } free_blocked_lock(nbl); } if (nf) nfsd_file_put(nf); if (lock_stp) { /* Bump seqid manually if the 4.0 replay owner is openowner */ if (cstate->replay_owner && cstate->replay_owner != &lock_sop->lo_owner && seqid_mutating_err(ntohl(status))) lock_sop->lo_owner.so_seqid++; /* * If this is a new, never-before-used stateid, and we are * returning an error, then just go ahead and release it. */ if (status && new) release_lock_stateid(lock_stp); mutex_unlock(&lock_stp->st_mutex); nfs4_put_stid(&lock_stp->st_stid); } if (open_stp) nfs4_put_stid(&open_stp->st_stid); nfsd4_bump_seqid(cstate, status); if (conflock) locks_free_lock(conflock); return status; } void nfsd4_lock_release(union nfsd4_op_u *u) { struct nfsd4_lock *lock = &u->lock; struct nfsd4_lock_denied *deny = &lock->lk_denied; kfree(deny->ld_owner.data); } /* * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, * so we do a temporary open here just to get an open file to pass to * vfs_test_lock. */ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { struct nfsd_file *nf; struct inode *inode; __be32 err; err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); if (err) return err; inode = fhp->fh_dentry->d_inode; inode_lock(inode); /* to block new leases till after test_lock: */ err = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (err) goto out; lock->c.flc_file = nf->nf_file; err = nfserrno(vfs_test_lock(nf->nf_file, lock)); lock->c.flc_file = NULL; out: inode_unlock(inode); nfsd_file_put(nf); return err; } /* * LOCKT operation */ __be32 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_lockt *lockt = &u->lockt; struct file_lock *file_lock = NULL; struct nfs4_lockowner *lo = NULL; __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (locks_in_grace(SVC_NET(rqstp))) return nfserr_grace; if (check_lock_length(lockt->lt_offset, lockt->lt_length)) return nfserr_inval; if (!nfsd4_has_session(cstate)) { status = set_client(&lockt->lt_clientid, cstate, nn); if (status) goto out; } if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) goto out; file_lock = locks_alloc_lock(); if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; goto out; } switch (lockt->lt_type) { case NFS4_READ_LT: case NFS4_READW_LT: file_lock->c.flc_type = F_RDLCK; break; case NFS4_WRITE_LT: case NFS4_WRITEW_LT: file_lock->c.flc_type = F_WRLCK; break; default: dprintk("NFSD: nfs4_lockt: bad lock type!\n"); status = nfserr_inval; goto out; } lo = find_lockowner_str(cstate->clp, &lockt->lt_owner); if (lo) file_lock->c.flc_owner = (fl_owner_t)lo; file_lock->c.flc_pid = current->tgid; file_lock->c.flc_flags = FL_POSIX; file_lock->fl_start = lockt->lt_offset; file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); nfs4_transform_lock_offset(file_lock); status = nfsd_test_lock(rqstp, &cstate->current_fh, file_lock); if (status) goto out; if (file_lock->c.flc_type != F_UNLCK) { status = nfserr_denied; nfs4_set_lock_denied(file_lock, &lockt->lt_denied); } out: if (lo) nfs4_put_stateowner(&lo->lo_owner); if (file_lock) locks_free_lock(file_lock); return status; } void nfsd4_lockt_release(union nfsd4_op_u *u) { struct nfsd4_lockt *lockt = &u->lockt; struct nfsd4_lock_denied *deny = &lockt->lt_denied; kfree(deny->ld_owner.data); } __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_locku *locku = &u->locku; struct nfs4_ol_stateid *stp; struct nfsd_file *nf = NULL; struct file_lock *file_lock = NULL; __be32 status; int err; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n", (long long) locku->lu_offset, (long long) locku->lu_length); if (check_lock_length(locku->lu_offset, locku->lu_length)) return nfserr_inval; status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, &locku->lu_stateid, SC_TYPE_LOCK, 0, &stp, nn); if (status) goto out; nf = find_any_file(stp->st_stid.sc_file); if (!nf) { status = nfserr_lock_range; goto put_stateid; } file_lock = locks_alloc_lock(); if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; goto put_file; } file_lock->c.flc_type = F_UNLCK; file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); file_lock->c.flc_pid = current->tgid; file_lock->c.flc_file = nf->nf_file; file_lock->c.flc_flags = FL_POSIX; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = locku->lu_offset; file_lock->fl_end = last_byte_offset(locku->lu_offset, locku->lu_length); nfs4_transform_lock_offset(file_lock); err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, NULL); if (err) { dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); goto out_nfserr; } nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid); put_file: nfsd_file_put(nf); put_stateid: mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); if (file_lock) locks_free_lock(file_lock); return status; out_nfserr: status = nfserrno(err); goto put_file; } /* * returns * true: locks held by lockowner * false: no locks held by lockowner */ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock *fl; int status = false; struct nfsd_file *nf; struct inode *inode; struct file_lock_context *flctx; spin_lock(&fp->fi_lock); nf = find_any_file_locked(fp); if (!nf) { /* Any valid lock stateid should have some sort of access */ WARN_ON_ONCE(1); goto out; } inode = file_inode(nf->nf_file); flctx = locks_inode_context(inode); if (flctx && !list_empty_careful(&flctx->flc_posix)) { spin_lock(&flctx->flc_lock); for_each_file_lock(fl, &flctx->flc_posix) { if (fl->c.flc_owner == (fl_owner_t)lowner) { status = true; break; } } spin_unlock(&flctx->flc_lock); } out: spin_unlock(&fp->fi_lock); return status; } /** * nfsd4_release_lockowner - process NFSv4.0 RELEASE_LOCKOWNER operations * @rqstp: RPC transaction * @cstate: NFSv4 COMPOUND state * @u: RELEASE_LOCKOWNER arguments * * Check if there are any locks still held and if not, free the lockowner * and any lock state that is owned. * * Return values: * %nfs_ok: lockowner released or not found * %nfserr_locks_held: lockowner still in use * %nfserr_stale_clientid: clientid no longer active * %nfserr_expired: clientid not recognized */ __be32 nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); clientid_t *clid = &rlockowner->rl_clientid; struct nfs4_ol_stateid *stp; struct nfs4_lockowner *lo; struct nfs4_client *clp; LIST_HEAD(reaplist); __be32 status; dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); status = set_client(clid, cstate, nn); if (status) return status; clp = cstate->clp; spin_lock(&clp->cl_lock); lo = find_lockowner_str_locked(clp, &rlockowner->rl_owner); if (!lo) { spin_unlock(&clp->cl_lock); return nfs_ok; } list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { if (check_for_locks(stp->st_stid.sc_file, lo)) { spin_unlock(&clp->cl_lock); nfs4_put_stateowner(&lo->lo_owner); return nfserr_locks_held; } } unhash_lockowner_locked(lo); while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); unhash_lock_stateid(stp); put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); remove_blocked_locks(lo); nfs4_put_stateowner(&lo->lo_owner); return nfs_ok; } static inline struct nfs4_client_reclaim * alloc_reclaim(void) { return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL); } bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn) { struct nfs4_client_reclaim *crp; crp = nfsd4_find_reclaim_client(name, nn); return (crp && crp->cr_clp); } /* * failure => all reset bets are off, nfserr_no_grace... * * The caller is responsible for freeing name.data if NULL is returned (it * will be freed in nfs4_remove_reclaim_record in the normal case). */ struct nfs4_client_reclaim * nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, struct nfsd_net *nn) { unsigned int strhashval; struct nfs4_client_reclaim *crp; crp = alloc_reclaim(); if (crp) { strhashval = clientstr_hashval(name); INIT_LIST_HEAD(&crp->cr_strhash); list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]); crp->cr_name.data = name.data; crp->cr_name.len = name.len; crp->cr_princhash.data = princhash.data; crp->cr_princhash.len = princhash.len; crp->cr_clp = NULL; nn->reclaim_str_hashtbl_size++; } return crp; } void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn) { list_del(&crp->cr_strhash); kfree(crp->cr_name.data); kfree(crp->cr_princhash.data); kfree(crp); nn->reclaim_str_hashtbl_size--; } void nfs4_release_reclaim(struct nfsd_net *nn) { struct nfs4_client_reclaim *crp = NULL; int i; for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&nn->reclaim_str_hashtbl[i])) { crp = list_entry(nn->reclaim_str_hashtbl[i].next, struct nfs4_client_reclaim, cr_strhash); nfs4_remove_reclaim_record(crp, nn); } } WARN_ON_ONCE(nn->reclaim_str_hashtbl_size); } /* * called from OPEN, CLAIM_PREVIOUS with a new clientid. */ struct nfs4_client_reclaim * nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn) { unsigned int strhashval; struct nfs4_client_reclaim *crp = NULL; strhashval = clientstr_hashval(name); list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) { if (compare_blob(&crp->cr_name, &name) == 0) { return crp; } } return NULL; } __be32 nfs4_check_open_reclaim(struct nfs4_client *clp) { if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags)) return nfserr_no_grace; if (nfsd4_client_record_check(clp)) return nfserr_reclaim_bad; return nfs_ok; } /* * Since the lifetime of a delegation isn't limited to that of an open, a * client may quite reasonably hang on to a delegation as long as it has * the inode cached. This becomes an obvious problem the first time a * client's inode cache approaches the size of the server's total memory. * * For now we avoid this problem by imposing a hard limit on the number * of delegations, which varies according to the server's memory size. */ static void set_max_delegations(void) { /* * Allow at most 4 delegations per megabyte of RAM. Quick * estimates suggest that in the worst case (where every delegation * is for a different inode), a delegation could take about 1.5K, * giving a worst case usage of about 6% of memory. */ max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT); } static int nfs4_state_create_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; nn->conf_id_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!nn->conf_id_hashtbl) goto err; nn->unconf_id_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!nn->unconf_id_hashtbl) goto err_unconf_id; nn->sessionid_hashtbl = kmalloc_array(SESSION_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!nn->sessionid_hashtbl) goto err_sessionid; for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]); } for (i = 0; i < SESSION_HASH_SIZE; i++) INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; nn->boot_time = ktime_get_real_seconds(); nn->grace_ended = false; nn->nfsd4_manager.block_opens = true; INIT_LIST_HEAD(&nn->nfsd4_manager.list); INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); spin_lock_init(&nn->client_lock); spin_lock_init(&nn->s2s_cp_lock); idr_init(&nn->s2s_cp_stateids); atomic_set(&nn->pending_async_copies, 0); spin_lock_init(&nn->blocked_locks_lock); INIT_LIST_HEAD(&nn->blocked_locks_lru); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); nn->nfsd_client_shrinker = shrinker_alloc(0, "nfsd-client"); if (!nn->nfsd_client_shrinker) goto err_shrinker; nn->nfsd_client_shrinker->scan_objects = nfsd4_state_shrinker_scan; nn->nfsd_client_shrinker->count_objects = nfsd4_state_shrinker_count; nn->nfsd_client_shrinker->private_data = nn; shrinker_register(nn->nfsd_client_shrinker); return 0; err_shrinker: put_net(net); kfree(nn->sessionid_hashtbl); err_sessionid: kfree(nn->unconf_id_hashtbl); err_unconf_id: kfree(nn->conf_id_hashtbl); err: return -ENOMEM; } static void nfs4_state_destroy_net(struct net *net) { int i; struct nfs4_client *clp = NULL; struct nfsd_net *nn = net_generic(net, nfsd_net_id); for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&nn->conf_id_hashtbl[i])) { clp = list_entry(nn->conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); destroy_client(clp); } } WARN_ON(!list_empty(&nn->blocked_locks_lru)); for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&nn->unconf_id_hashtbl[i])) { clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); destroy_client(clp); } } kfree(nn->sessionid_hashtbl); kfree(nn->unconf_id_hashtbl); kfree(nn->conf_id_hashtbl); put_net(net); } int nfs4_state_start_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; ret = nfs4_state_create_net(net); if (ret) return ret; locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0) goto skip_grace; printk(KERN_INFO "NFSD: starting %lld-second grace period (net %x)\n", nn->nfsd4_grace, net->ns.inum); trace_nfsd_grace_start(nn); queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); return 0; skip_grace: printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n", net->ns.inum); queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ); nfsd4_end_grace(nn); return 0; } /* initialization to perform when the nfsd service is started: */ int nfs4_state_start(void) { int ret; ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params); if (ret) return ret; nfsd_slot_shrinker = shrinker_alloc(0, "nfsd-DRC-slot"); if (!nfsd_slot_shrinker) { rhltable_destroy(&nfs4_file_rhltable); return -ENOMEM; } nfsd_slot_shrinker->count_objects = nfsd_slot_count; nfsd_slot_shrinker->scan_objects = nfsd_slot_scan; shrinker_register(nfsd_slot_shrinker); set_max_delegations(); return 0; } void nfs4_state_shutdown_net(struct net *net) { struct nfs4_delegation *dp = NULL; struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); shrinker_free(nn->nfsd_client_shrinker); cancel_work_sync(&nn->nfsd_shrinker_work); cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); INIT_LIST_HEAD(&reaplist); spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); destroy_unhashed_deleg(dp); } nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_shutdown_umount(nn); #endif } void nfs4_state_shutdown(void) { rhltable_destroy(&nfs4_file_rhltable); shrinker_free(nfsd_slot_shrinker); } static void get_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid) { if (HAS_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG) && CURRENT_STATEID(stateid)) memcpy(stateid, &cstate->current_stateid, sizeof(stateid_t)); } static void put_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid) { if (cstate->minorversion) { memcpy(&cstate->current_stateid, stateid, sizeof(stateid_t)); SET_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG); } } void clear_current_stateid(struct nfsd4_compound_state *cstate) { CLEAR_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG); } /* * functions to set current state id */ void nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { put_stateid(cstate, &u->open_downgrade.od_stateid); } void nfsd4_set_openstateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { put_stateid(cstate, &u->open.op_stateid); } void nfsd4_set_closestateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { put_stateid(cstate, &u->close.cl_stateid); } void nfsd4_set_lockstateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { put_stateid(cstate, &u->lock.lk_resp_stateid); } /* * functions to consume current state id */ void nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->open_downgrade.od_stateid); } void nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->delegreturn.dr_stateid); } void nfsd4_get_freestateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->free_stateid.fr_stateid); } void nfsd4_get_setattrstateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->setattr.sa_stateid); } void nfsd4_get_closestateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->close.cl_stateid); } void nfsd4_get_lockustateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->locku.lu_stateid); } void nfsd4_get_readstateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->read.rd_stateid); } void nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { get_stateid(cstate, &u->write.wr_stateid); } /** * set_cb_time - vet and set the timespec for a cb_getattr update * @cb: timestamp from the CB_GETATTR response * @orig: original timestamp in the inode * @now: current time * * Given a timestamp in a CB_GETATTR response, check it against the * current timestamp in the inode and the current time. Returns true * if the inode's timestamp needs to be updated, and false otherwise. * @cb may also be changed if the timestamp needs to be clamped. */ static bool set_cb_time(struct timespec64 *cb, const struct timespec64 *orig, const struct timespec64 *now) { /* * "When the time presented is before the original time, then the * update is ignored." Also no need to update if there is no change. */ if (timespec64_compare(cb, orig) <= 0) return false; /* * "When the time presented is in the future, the server can either * clamp the new time to the current time, or it may * return NFS4ERR_DELAY to the client, allowing it to retry." */ if (timespec64_compare(cb, now) > 0) { /* clamp it */ *cb = *now; } return true; } static int cb_getattr_update_times(struct dentry *dentry, struct nfs4_delegation *dp) { struct inode *inode = d_inode(dentry); struct timespec64 now = current_time(inode); struct nfs4_cb_fattr *ncf = &dp->dl_cb_fattr; struct iattr attrs = { }; int ret; if (deleg_attrs_deleg(dp->dl_type)) { struct timespec64 atime = inode_get_atime(inode); struct timespec64 mtime = inode_get_mtime(inode); attrs.ia_atime = ncf->ncf_cb_atime; attrs.ia_mtime = ncf->ncf_cb_mtime; if (set_cb_time(&attrs.ia_atime, &atime, &now)) attrs.ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; if (set_cb_time(&attrs.ia_mtime, &mtime, &now)) { attrs.ia_valid |= ATTR_CTIME | ATTR_MTIME | ATTR_MTIME_SET; attrs.ia_ctime = attrs.ia_mtime; } } else { attrs.ia_valid |= ATTR_MTIME | ATTR_CTIME; attrs.ia_mtime = attrs.ia_ctime = now; } if (!attrs.ia_valid) return 0; attrs.ia_valid |= ATTR_DELEG; inode_lock(inode); ret = notify_change(&nop_mnt_idmap, dentry, &attrs, NULL); inode_unlock(inode); return ret; } /** * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict * @rqstp: RPC transaction context * @dentry: dentry of inode to be checked for a conflict * @pdp: returned WRITE delegation, if one was found * * This function is called when there is a conflict between a write * delegation and a change/size GETATTR from another client. The server * must either use the CB_GETATTR to get the current values of the * attributes from the client that holds the delegation or recall the * delegation before replying to the GETATTR. See RFC 8881 section * 18.7.4. * * Returns 0 if there is no conflict; otherwise an nfs_stat * code is returned. If @pdp is set to a non-NULL value, then the * caller must put the reference. */ __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_delegation **pdp) { __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file_lock_context *ctx; struct nfs4_delegation *dp = NULL; struct file_lease *fl; struct nfs4_cb_fattr *ncf; struct inode *inode = d_inode(dentry); ctx = locks_inode_context(inode); if (!ctx) return nfs_ok; #define NON_NFSD_LEASE ((void *)1) spin_lock(&ctx->flc_lock); for_each_file_lock(fl, &ctx->flc_lease) { if (fl->c.flc_flags == FL_LAYOUT) continue; if (fl->c.flc_type == F_WRLCK) { if (fl->fl_lmops == &nfsd_lease_mng_ops) dp = fl->c.flc_owner; else dp = NON_NFSD_LEASE; } break; } if (dp == NULL || dp == NON_NFSD_LEASE || dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { spin_unlock(&ctx->flc_lock); if (dp == NON_NFSD_LEASE) { status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (status != nfserr_jukebox || !nfsd_wait_for_delegreturn(rqstp, inode)) return status; } return 0; } nfsd_stats_wdeleg_getattr_inc(nn); refcount_inc(&dp->dl_stid.sc_count); ncf = &dp->dl_cb_fattr; nfs4_cb_getattr(&dp->dl_cb_fattr); spin_unlock(&ctx->flc_lock); wait_on_bit_timeout(&ncf->ncf_cb_flags, CB_GETATTR_BUSY, TASK_INTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT); if (ncf->ncf_cb_status) { /* Recall delegation only if client didn't respond */ status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (status != nfserr_jukebox || !nfsd_wait_for_delegreturn(rqstp, inode)) goto out_status; } if (!ncf->ncf_file_modified && (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) ncf->ncf_file_modified = true; if (ncf->ncf_file_modified) { int err; /* * Per section 10.4.3 of RFC 8881, the server would * not update the file's metadata with the client's * modified size */ err = cb_getattr_update_times(dentry, dp); if (err) { status = nfserrno(err); goto out_status; } ncf->ncf_cur_fsize = ncf->ncf_cb_fsize; *pdp = dp; return nfs_ok; } status = nfs_ok; out_status: nfs4_put_stid(&dp->dl_stid); return status; }
171 171 171 171 10 10 10 10 10 10 10 10 64 64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "%s: " fmt, __func__ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/percpu-refcount.h> /* * Initially, a percpu refcount is just a set of percpu counters. Initially, we * don't try to detect the ref hitting 0 - which means that get/put can just * increment or decrement the local counter. Note that the counter on a * particular cpu can (and will) wrap - this is fine, when we go to shutdown the * percpu counters will all sum to the correct value * * (More precisely: because modular arithmetic is commutative the sum of all the * percpu_count vars will be equal to what it would have been if all the gets * and puts were done to a single integer, even if some of the percpu integers * overflow or underflow). * * The real trick to implementing percpu refcounts is shutdown. We can't detect * the ref hitting 0 on every put - this would require global synchronization * and defeat the whole purpose of using percpu refs. * * What we do is require the user to keep track of the initial refcount; we know * the ref can't hit 0 before the user drops the initial ref, so as long as we * convert to non percpu mode before the initial ref is dropped everything * works. * * Converting to non percpu mode is done with some RCUish stuff in * percpu_ref_kill. Additionally, we need a bias value so that the * atomic_long_t can't hit 0 before we've added up all the percpu refs. */ #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) static DEFINE_SPINLOCK(percpu_ref_switch_lock); static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { return (unsigned long __percpu *) (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); } /** * percpu_ref_init - initialize a percpu refcount * @ref: percpu_ref to initialize * @release: function which will be called when refcount hits 0 * @flags: PERCPU_REF_INIT_* flags * @gfp: allocation mask to use * * Initializes @ref. @ref starts out in percpu mode with a refcount of 1 unless * @flags contains PERCPU_REF_INIT_ATOMIC or PERCPU_REF_INIT_DEAD. These flags * change the start state to atomic with the latter setting the initial refcount * to 0. See the definitions of PERCPU_REF_INIT_* flags for flag behaviors. * * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). */ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, unsigned int flags, gfp_t gfp) { size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, __alignof__(unsigned long)); unsigned long start_count = 0; struct percpu_ref_data *data; ref->percpu_count_ptr = (unsigned long) __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); if (!ref->percpu_count_ptr) return -ENOMEM; data = kzalloc(sizeof(*ref->data), gfp); if (!data) { free_percpu((void __percpu *)ref->percpu_count_ptr); ref->percpu_count_ptr = 0; return -ENOMEM; } data->force_atomic = flags & PERCPU_REF_INIT_ATOMIC; data->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT; if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) { ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; data->allow_reinit = true; } else { start_count += PERCPU_COUNT_BIAS; } if (flags & PERCPU_REF_INIT_DEAD) ref->percpu_count_ptr |= __PERCPU_REF_DEAD; else start_count++; atomic_long_set(&data->count, start_count); data->release = release; data->confirm_switch = NULL; data->ref = ref; ref->data = data; return 0; } EXPORT_SYMBOL_GPL(percpu_ref_init); static void __percpu_ref_exit(struct percpu_ref *ref) { unsigned long __percpu *percpu_count = percpu_count_ptr(ref); if (percpu_count) { /* non-NULL confirm_switch indicates switching in progress */ WARN_ON_ONCE(ref->data && ref->data->confirm_switch); free_percpu(percpu_count); ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; } } /** * percpu_ref_exit - undo percpu_ref_init() * @ref: percpu_ref to exit * * This function exits @ref. The caller is responsible for ensuring that * @ref is no longer in active use. The usual places to invoke this * function from are the @ref->release() callback or in init failure path * where percpu_ref_init() succeeded but other parts of the initialization * of the embedding object failed. */ void percpu_ref_exit(struct percpu_ref *ref) { struct percpu_ref_data *data = ref->data; unsigned long flags; __percpu_ref_exit(ref); if (!data) return; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->percpu_count_ptr |= atomic_long_read(&ref->data->count) << __PERCPU_REF_FLAG_BITS; ref->data = NULL; spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); kfree(data); } EXPORT_SYMBOL_GPL(percpu_ref_exit); static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu) { struct percpu_ref_data *data = container_of(rcu, struct percpu_ref_data, rcu); struct percpu_ref *ref = data->ref; data->confirm_switch(ref); data->confirm_switch = NULL; wake_up_all(&percpu_ref_switch_waitq); if (!data->allow_reinit) __percpu_ref_exit(ref); /* drop ref from percpu_ref_switch_to_atomic() */ percpu_ref_put(ref); } static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) { struct percpu_ref_data *data = container_of(rcu, struct percpu_ref_data, rcu); struct percpu_ref *ref = data->ref; unsigned long __percpu *percpu_count = percpu_count_ptr(ref); static atomic_t underflows; unsigned long count = 0; int cpu; for_each_possible_cpu(cpu) count += *per_cpu_ptr(percpu_count, cpu); pr_debug("global %lu percpu %lu\n", atomic_long_read(&data->count), count); /* * It's crucial that we sum the percpu counters _before_ adding the sum * to &ref->count; since gets could be happening on one cpu while puts * happen on another, adding a single cpu's count could cause * @ref->count to hit 0 before we've got a consistent value - but the * sum of all the counts will be consistent and correct. * * Subtracting the bias value then has to happen _after_ adding count to * &ref->count; we need the bias value to prevent &ref->count from * reaching 0 before we add the percpu counts. But doing it at the same * time is equivalent and saves us atomic operations: */ atomic_long_add((long)count - PERCPU_COUNT_BIAS, &data->count); if (WARN_ONCE(atomic_long_read(&data->count) <= 0, "percpu ref (%ps) <= 0 (%ld) after switching to atomic", data->release, atomic_long_read(&data->count)) && atomic_inc_return(&underflows) < 4) { pr_err("%s(): percpu_ref underflow", __func__); mem_dump_obj(data); } /* @ref is viewed as dead on all CPUs, send out switch confirmation */ percpu_ref_call_confirm_rcu(rcu); } static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) { } static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) { if (confirm_switch) confirm_switch(ref); return; } /* switching from percpu to atomic */ ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; /* * Non-NULL ->confirm_switch is used to indicate that switching is * in progress. Use noop one if unspecified. */ ref->data->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch; percpu_ref_get(ref); /* put after confirmation */ call_rcu_hurry(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu); } static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) { unsigned long __percpu *percpu_count = percpu_count_ptr(ref); int cpu; BUG_ON(!percpu_count); if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) return; if (WARN_ON_ONCE(!ref->data->allow_reinit)) return; atomic_long_add(PERCPU_COUNT_BIAS, &ref->data->count); /* * Restore per-cpu operation. smp_store_release() is paired * with READ_ONCE() in __ref_is_percpu() and guarantees that the * zeroing is visible to all percpu accesses which can see the * following __PERCPU_REF_ATOMIC clearing. */ for_each_possible_cpu(cpu) *per_cpu_ptr(percpu_count, cpu) = 0; smp_store_release(&ref->percpu_count_ptr, ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } static void __percpu_ref_switch_mode(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { struct percpu_ref_data *data = ref->data; lockdep_assert_held(&percpu_ref_switch_lock); /* * If the previous ATOMIC switching hasn't finished yet, wait for * its completion. If the caller ensures that ATOMIC switching * isn't in progress, this function can be called from any context. */ wait_event_lock_irq(percpu_ref_switch_waitq, !data->confirm_switch, percpu_ref_switch_lock); if (data->force_atomic || percpu_ref_is_dying(ref)) __percpu_ref_switch_to_atomic(ref, confirm_switch); else __percpu_ref_switch_to_percpu(ref); } /** * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode * @ref: percpu_ref to switch to atomic mode * @confirm_switch: optional confirmation callback * * There's no reason to use this function for the usual reference counting. * Use percpu_ref_kill[_and_confirm](). * * Schedule switching of @ref to atomic mode. All its percpu counts will * be collected to the main atomic counter. On completion, when all CPUs * are guaraneed to be in atomic mode, @confirm_switch, which may not * block, is invoked. This function may be invoked concurrently with all * the get/put operations and can safely be mixed with kill and reinit * operations. Note that @ref will stay in atomic mode across kill/reinit * cycles until percpu_ref_switch_to_percpu() is called. * * This function may block if @ref is in the process of switching to atomic * mode. If the caller ensures that @ref is not in the process of * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->data->force_atomic = true; __percpu_ref_switch_mode(ref, confirm_switch); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic); /** * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode * @ref: percpu_ref to switch to atomic mode * * Schedule switching the ref to atomic mode, and wait for the * switch to complete. Caller must ensure that no other thread * will switch back to percpu mode. */ void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref) { percpu_ref_switch_to_atomic(ref, NULL); wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync); /** * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode * @ref: percpu_ref to switch to percpu mode * * There's no reason to use this function for the usual reference counting. * To re-use an expired ref, use percpu_ref_reinit(). * * Switch @ref to percpu mode. This function may be invoked concurrently * with all the get/put operations and can safely be mixed with kill and * reinit operations. This function reverses the sticky atomic state set * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic(). If @ref is * dying or dead, the actual switching takes place on the following * percpu_ref_reinit(). * * This function may block if @ref is in the process of switching to atomic * mode. If the caller ensures that @ref is not in the process of * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->data->force_atomic = false; __percpu_ref_switch_mode(ref, NULL); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu); /** * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation * @ref: percpu_ref to kill * @confirm_kill: optional confirmation callback * * Equivalent to percpu_ref_kill() but also schedules kill confirmation if * @confirm_kill is not NULL. @confirm_kill, which may not block, will be * called after @ref is seen as dead from all CPUs at which point all * further invocations of percpu_ref_tryget_live() will fail. See * percpu_ref_tryget_live() for details. * * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the * process of switching to atomic mode by percpu_ref_switch_to_atomic(). * * There are no implied RCU grace periods between kill and release. */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ONCE(percpu_ref_is_dying(ref), "%s called more than once on %ps!", __func__, ref->data->release); ref->percpu_count_ptr |= __PERCPU_REF_DEAD; __percpu_ref_switch_mode(ref, confirm_kill); percpu_ref_put(ref); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); /** * percpu_ref_is_zero - test whether a percpu refcount reached zero * @ref: percpu_ref to test * * Returns %true if @ref reached zero. * * This function is safe to call as long as @ref is between init and exit. */ bool percpu_ref_is_zero(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; unsigned long count, flags; if (__ref_is_percpu(ref, &percpu_count)) return false; /* protect us from being destroyed */ spin_lock_irqsave(&percpu_ref_switch_lock, flags); if (ref->data) count = atomic_long_read(&ref->data->count); else count = ref->percpu_count_ptr >> __PERCPU_REF_FLAG_BITS; spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); return count == 0; } EXPORT_SYMBOL_GPL(percpu_ref_is_zero); /** * percpu_ref_reinit - re-initialize a percpu refcount * @ref: perpcu_ref to re-initialize * * Re-initialize @ref so that it's in the same state as when it finished * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD. @ref must have been * initialized successfully and reached 0 but not exited. * * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while * this function is in progress. */ void percpu_ref_reinit(struct percpu_ref *ref) { WARN_ON_ONCE(!percpu_ref_is_zero(ref)); percpu_ref_resurrect(ref); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); /** * percpu_ref_resurrect - modify a percpu refcount from dead to live * @ref: perpcu_ref to resurrect * * Modify @ref so that it's in the same state as before percpu_ref_kill() was * called. @ref must be dead but must not yet have exited. * * If @ref->release() frees @ref then the caller is responsible for * guaranteeing that @ref->release() does not get called while this * function is in progress. * * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while * this function is in progress. */ void percpu_ref_resurrect(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ON_ONCE(!percpu_ref_is_dying(ref)); WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count)); ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); __percpu_ref_switch_mode(ref, NULL); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_resurrect);
2 270 235 105 229 143 143 106 106 103 269 286 7 7 7 7 289 1 285 7 289 7 7 289 1 3 1 1 114 112 9 115 114 248 294 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp stream queue/scheduling. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> /* First Come First Serve (a.k.a. FIFO) * RFC DRAFT ndata Section 3.1 */ static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid, __u16 value, gfp_t gfp) { return 0; } static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { *value = 0; return 0; } static int sctp_sched_fcfs_init(struct sctp_stream *stream) { return 0; } static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { return 0; } static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid) { } static void sctp_sched_fcfs_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { } static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_chunk *ch = NULL; struct list_head *entry; if (list_empty(&q->out_chunk_list)) goto out; if (stream->out_curr) { ch = list_entry(stream->out_curr->ext->outq.next, struct sctp_chunk, stream_list); } else { entry = q->out_chunk_list.next; ch = list_entry(entry, struct sctp_chunk, list); } sctp_sched_dequeue_common(q, ch); out: return ch; } static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q, struct sctp_chunk *chunk) { } static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream) { } static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream) { } static struct sctp_sched_ops sctp_sched_fcfs = { .set = sctp_sched_fcfs_set, .get = sctp_sched_fcfs_get, .init = sctp_sched_fcfs_init, .init_sid = sctp_sched_fcfs_init_sid, .free_sid = sctp_sched_fcfs_free_sid, .enqueue = sctp_sched_fcfs_enqueue, .dequeue = sctp_sched_fcfs_dequeue, .dequeue_done = sctp_sched_fcfs_dequeue_done, .sched_all = sctp_sched_fcfs_sched_all, .unsched_all = sctp_sched_fcfs_unsched_all, }; static void sctp_sched_ops_fcfs_init(void) { sctp_sched_ops_register(SCTP_SS_FCFS, &sctp_sched_fcfs); } /* API to other parts of the stack */ static struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1]; void sctp_sched_ops_register(enum sctp_sched_type sched, struct sctp_sched_ops *sched_ops) { sctp_sched_ops[sched] = sched_ops; } void sctp_sched_ops_init(void) { sctp_sched_ops_fcfs_init(); sctp_sched_ops_prio_init(); sctp_sched_ops_rr_init(); sctp_sched_ops_fc_init(); sctp_sched_ops_wfq_init(); } static void sctp_sched_free_sched(struct sctp_stream *stream) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *soute; int i; sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) { soute = SCTP_SO(stream, i)->ext; if (!soute) continue; sched->free_sid(stream, i); /* Give the next scheduler a clean slate. */ memset_after(soute, 0, outq); } } int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) { struct sctp_sched_ops *old = asoc->outqueue.sched; struct sctp_datamsg *msg = NULL; struct sctp_sched_ops *n; struct sctp_chunk *ch; int i, ret = 0; if (sched > SCTP_SS_MAX) return -EINVAL; n = sctp_sched_ops[sched]; if (old == n) return ret; if (old) sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = n; n->init(&asoc->stream); for (i = 0; i < asoc->stream.outcnt; i++) { if (!SCTP_SO(&asoc->stream, i)->ext) continue; ret = n->init_sid(&asoc->stream, i, GFP_ATOMIC); if (ret) goto err; } /* We have to requeue all chunks already queued. */ list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { if (ch->msg == msg) continue; msg = ch->msg; n->enqueue(&asoc->outqueue, msg); } return ret; err: sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ return ret; } int sctp_sched_get_sched(struct sctp_association *asoc) { int i; for (i = 0; i <= SCTP_SS_MAX; i++) if (asoc->outqueue.sched == sctp_sched_ops[i]) return i; return 0; } int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, __u16 value, gfp_t gfp) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) { int ret; ret = sctp_stream_init_ext(&asoc->stream, sid); if (ret) return ret; } return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp); } int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, __u16 *value) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) return 0; return asoc->outqueue.sched->get(&asoc->stream, sid, value); } void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { if (!list_is_last(&ch->frag_list, &ch->msg->chunks) && !q->asoc->peer.intl_capable) { struct sctp_stream_out *sout; __u16 sid; /* datamsg is not finish, so save it as current one, * in case application switch scheduler or a higher * priority stream comes in. */ sid = sctp_chunk_stream_no(ch); sout = SCTP_SO(&q->asoc->stream, sid); q->asoc->stream.out_curr = sout; return; } q->asoc->stream.out_curr = NULL; q->sched->dequeue_done(q, ch); } /* Auxiliary functions for the schedulers */ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) { list_del_init(&ch->list); list_del_init(&ch->stream_list); q->out_qlen -= ch->skb->len; } int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext; INIT_LIST_HEAD(&ext->outq); return sched->init_sid(stream, sid, gfp); } struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); return asoc->outqueue.sched; }
243 243 243 243 243 243 243 243 243 243 242 243 243 243 243 243 243 243 243 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org> * * Fixes from William Schumacher incorporated on 15 March 2001. * (Reported by Charles Bertsch, <CBertsch@microtest.com>). */ /* * This file contains generic functions for manipulating * POSIX 1003.1e draft standard 17 ACLs. */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/atomic.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/mnt_idmapping.h> #include <linux/iversion.h> #include <linux/security.h> #include <linux/fsnotify.h> #include <linux/filelock.h> #include "internal.h" static struct posix_acl **acl_by_type(struct inode *inode, int type) { switch (type) { case ACL_TYPE_ACCESS: return &inode->i_acl; case ACL_TYPE_DEFAULT: return &inode->i_default_acl; default: BUG(); } } struct posix_acl *get_cached_acl(struct inode *inode, int type) { struct posix_acl **p = acl_by_type(inode, type); struct posix_acl *acl; for (;;) { rcu_read_lock(); acl = rcu_dereference(*p); if (!acl || is_uncached_acl(acl) || refcount_inc_not_zero(&acl->a_refcount)) break; rcu_read_unlock(); cpu_relax(); } rcu_read_unlock(); return acl; } EXPORT_SYMBOL(get_cached_acl); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) { struct posix_acl *acl = rcu_dereference(*acl_by_type(inode, type)); if (acl == ACL_DONT_CACHE) { struct posix_acl *ret; ret = inode->i_op->get_inode_acl(inode, type, LOOKUP_RCU); if (!IS_ERR(ret)) acl = ret; } return acl; } EXPORT_SYMBOL(get_cached_acl_rcu); void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) { struct posix_acl **p = acl_by_type(inode, type); struct posix_acl *old; old = xchg(p, posix_acl_dup(acl)); if (!is_uncached_acl(old)) posix_acl_release(old); } EXPORT_SYMBOL(set_cached_acl); static void __forget_cached_acl(struct posix_acl **p) { struct posix_acl *old; old = xchg(p, ACL_NOT_CACHED); if (!is_uncached_acl(old)) posix_acl_release(old); } void forget_cached_acl(struct inode *inode, int type) { __forget_cached_acl(acl_by_type(inode, type)); } EXPORT_SYMBOL(forget_cached_acl); void forget_all_cached_acls(struct inode *inode) { __forget_cached_acl(&inode->i_acl); __forget_cached_acl(&inode->i_default_acl); } EXPORT_SYMBOL(forget_all_cached_acls); static struct posix_acl *__get_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, int type) { struct posix_acl *sentinel; struct posix_acl **p; struct posix_acl *acl; /* * The sentinel is used to detect when another operation like * set_cached_acl() or forget_cached_acl() races with get_inode_acl(). * It is guaranteed that is_uncached_acl(sentinel) is true. */ acl = get_cached_acl(inode, type); if (!is_uncached_acl(acl)) return acl; if (!IS_POSIXACL(inode)) return NULL; sentinel = uncached_acl_sentinel(current); p = acl_by_type(inode, type); /* * If the ACL isn't being read yet, set our sentinel. Otherwise, the * current value of the ACL will not be ACL_NOT_CACHED and so our own * sentinel will not be set; another task will update the cache. We * could wait for that other task to complete its job, but it's easier * to just call ->get_inode_acl to fetch the ACL ourself. (This is * going to be an unlikely race.) */ cmpxchg(p, ACL_NOT_CACHED, sentinel); /* * Normally, the ACL returned by ->get{_inode}_acl will be cached. * A filesystem can prevent that by calling * forget_cached_acl(inode, type) in ->get{_inode}_acl. * * If the filesystem doesn't have a get{_inode}_ acl() function at all, * we'll just create the negative cache entry. */ if (dentry && inode->i_op->get_acl) { acl = inode->i_op->get_acl(idmap, dentry, type); } else if (inode->i_op->get_inode_acl) { acl = inode->i_op->get_inode_acl(inode, type, false); } else { set_cached_acl(inode, type, NULL); return NULL; } if (IS_ERR(acl)) { /* * Remove our sentinel so that we don't block future attempts * to cache the ACL. */ cmpxchg(p, sentinel, ACL_NOT_CACHED); return acl; } /* * Cache the result, but only if our sentinel is still in place. */ posix_acl_dup(acl); if (unlikely(!try_cmpxchg(p, &sentinel, acl))) posix_acl_release(acl); return acl; } struct posix_acl *get_inode_acl(struct inode *inode, int type) { return __get_acl(&nop_mnt_idmap, NULL, inode, type); } EXPORT_SYMBOL(get_inode_acl); /* * Init a fresh posix_acl */ void posix_acl_init(struct posix_acl *acl, int count) { refcount_set(&acl->a_refcount, 1); acl->a_count = count; } EXPORT_SYMBOL(posix_acl_init); /* * Allocate a new ACL with the specified number of entries. */ struct posix_acl * posix_acl_alloc(unsigned int count, gfp_t flags) { struct posix_acl *acl; acl = kmalloc(struct_size(acl, a_entries, count), flags); if (acl) posix_acl_init(acl, count); return acl; } EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. */ struct posix_acl * posix_acl_clone(const struct posix_acl *acl, gfp_t flags) { struct posix_acl *clone = NULL; if (acl) { clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), flags); if (clone) refcount_set(&clone->a_refcount, 1); } return clone; } EXPORT_SYMBOL_GPL(posix_acl_clone); /* * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. */ int posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; int needs_mask = 0; FOREACH_ACL_ENTRY(pa, acl, pe) { if (pa->e_perm & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) return -EINVAL; switch (pa->e_tag) { case ACL_USER_OBJ: if (state == ACL_USER_OBJ) { state = ACL_USER; break; } return -EINVAL; case ACL_USER: if (state != ACL_USER) return -EINVAL; if (!kuid_has_mapping(user_ns, pa->e_uid)) return -EINVAL; needs_mask = 1; break; case ACL_GROUP_OBJ: if (state == ACL_USER) { state = ACL_GROUP; break; } return -EINVAL; case ACL_GROUP: if (state != ACL_GROUP) return -EINVAL; if (!kgid_has_mapping(user_ns, pa->e_gid)) return -EINVAL; needs_mask = 1; break; case ACL_MASK: if (state != ACL_GROUP) return -EINVAL; state = ACL_OTHER; break; case ACL_OTHER: if (state == ACL_OTHER || (state == ACL_GROUP && !needs_mask)) { state = 0; break; } return -EINVAL; default: return -EINVAL; } } if (state == 0) return 0; return -EINVAL; } EXPORT_SYMBOL(posix_acl_valid); /* * Returns 0 if the acl can be exactly represented in the traditional * file mode permission bits, or else 1. Returns -E... on error. */ int posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) { const struct posix_acl_entry *pa, *pe; umode_t mode = 0; int not_equiv = 0; /* * A null ACL can always be presented as mode bits. */ if (!acl) return 0; FOREACH_ACL_ENTRY(pa, acl, pe) { switch (pa->e_tag) { case ACL_USER_OBJ: mode |= (pa->e_perm & S_IRWXO) << 6; break; case ACL_GROUP_OBJ: mode |= (pa->e_perm & S_IRWXO) << 3; break; case ACL_OTHER: mode |= pa->e_perm & S_IRWXO; break; case ACL_MASK: mode = (mode & ~S_IRWXG) | ((pa->e_perm & S_IRWXO) << 3); not_equiv = 1; break; case ACL_USER: case ACL_GROUP: not_equiv = 1; break; default: return -EINVAL; } } if (mode_p) *mode_p = (*mode_p & ~S_IRWXUGO) | mode; return not_equiv; } EXPORT_SYMBOL(posix_acl_equiv_mode); /* * Create an ACL representing the file mode permission bits of an inode. */ struct posix_acl * posix_acl_from_mode(umode_t mode, gfp_t flags) { struct posix_acl *acl = posix_acl_alloc(3, flags); if (!acl) return ERR_PTR(-ENOMEM); acl->a_entries[0].e_tag = ACL_USER_OBJ; acl->a_entries[0].e_perm = (mode & S_IRWXU) >> 6; acl->a_entries[1].e_tag = ACL_GROUP_OBJ; acl->a_entries[1].e_perm = (mode & S_IRWXG) >> 3; acl->a_entries[2].e_tag = ACL_OTHER; acl->a_entries[2].e_perm = (mode & S_IRWXO); return acl; } EXPORT_SYMBOL(posix_acl_from_mode); /* * Return 0 if current is granted want access to the inode * by the acl. Returns -E... otherwise. */ int posix_acl_permission(struct mnt_idmap *idmap, struct inode *inode, const struct posix_acl *acl, int want) { const struct posix_acl_entry *pa, *pe, *mask_obj; struct user_namespace *fs_userns = i_user_ns(inode); int found = 0; vfsuid_t vfsuid; vfsgid_t vfsgid; want &= MAY_READ | MAY_WRITE | MAY_EXEC; FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto check_perm; break; case ACL_USER: vfsuid = make_vfsuid(idmap, fs_userns, pa->e_uid); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_GROUP: vfsgid = make_vfsgid(idmap, fs_userns, pa->e_gid); if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_MASK: break; case ACL_OTHER: if (found) return -EACCES; else goto check_perm; default: return -EIO; } } return -EIO; mask: for (mask_obj = pa+1; mask_obj != pe; mask_obj++) { if (mask_obj->e_tag == ACL_MASK) { if ((pa->e_perm & mask_obj->e_perm & want) == want) return 0; return -EACCES; } } check_perm: if ((pa->e_perm & want) == want) return 0; return -EACCES; } /* * Modify acl when creating a new inode. The caller must ensure the acl is * only referenced once. * * mode_p initially must contain the mode parameter to the open() / creat() * system calls. All permissions that are not granted by the acl are removed. * The permissions in the acl are changed to reflect the mode_p parameter. */ static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) { struct posix_acl_entry *pa, *pe; struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; umode_t mode = *mode_p; int not_equiv = 0; /* assert(atomic_read(acl->a_refcount) == 1); */ FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: pa->e_perm &= (mode >> 6) | ~S_IRWXO; mode &= (pa->e_perm << 6) | ~S_IRWXU; break; case ACL_USER: case ACL_GROUP: not_equiv = 1; break; case ACL_GROUP_OBJ: group_obj = pa; break; case ACL_OTHER: pa->e_perm &= mode | ~S_IRWXO; mode &= pa->e_perm | ~S_IRWXO; break; case ACL_MASK: mask_obj = pa; not_equiv = 1; break; default: return -EIO; } } if (mask_obj) { mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO; mode &= (mask_obj->e_perm << 3) | ~S_IRWXG; } else { if (!group_obj) return -EIO; group_obj->e_perm &= (mode >> 3) | ~S_IRWXO; mode &= (group_obj->e_perm << 3) | ~S_IRWXG; } *mode_p = (*mode_p & ~S_IRWXUGO) | mode; return not_equiv; } /* * Modify the ACL for the chmod syscall. */ static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) { struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; struct posix_acl_entry *pa, *pe; /* assert(atomic_read(acl->a_refcount) == 1); */ FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: pa->e_perm = (mode & S_IRWXU) >> 6; break; case ACL_USER: case ACL_GROUP: break; case ACL_GROUP_OBJ: group_obj = pa; break; case ACL_MASK: mask_obj = pa; break; case ACL_OTHER: pa->e_perm = (mode & S_IRWXO); break; default: return -EIO; } } if (mask_obj) { mask_obj->e_perm = (mode & S_IRWXG) >> 3; } else { if (!group_obj) return -EIO; group_obj->e_perm = (mode & S_IRWXG) >> 3; } return 0; } int __posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; if (clone) { err = posix_acl_create_masq(clone, mode_p); if (err < 0) { posix_acl_release(clone); clone = NULL; } } posix_acl_release(*acl); *acl = clone; return err; } EXPORT_SYMBOL(__posix_acl_create); int __posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; if (clone) { err = __posix_acl_chmod_masq(clone, mode); if (err) { posix_acl_release(clone); clone = NULL; } } posix_acl_release(*acl); *acl = clone; return err; } EXPORT_SYMBOL(__posix_acl_chmod); /** * posix_acl_chmod - chmod a posix acl * * @idmap: idmap of the mount @inode was found from * @dentry: dentry to check permissions on * @mode: the new mode of @inode * * If the dentry has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { struct inode *inode = d_inode(dentry); struct posix_acl *acl; int ret = 0; if (!IS_POSIXACL(inode)) return 0; if (!inode->i_op->set_acl) return -EOPNOTSUPP; acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR_OR_NULL(acl)) { if (acl == ERR_PTR(-EOPNOTSUPP)) return 0; return PTR_ERR(acl); } ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) return ret; ret = inode->i_op->set_acl(idmap, dentry, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); return ret; } EXPORT_SYMBOL(posix_acl_chmod); int posix_acl_create(struct inode *dir, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { struct posix_acl *p; struct posix_acl *clone; int ret; *acl = NULL; *default_acl = NULL; if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) return 0; p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (!p || p == ERR_PTR(-EOPNOTSUPP)) { *mode &= ~current_umask(); return 0; } if (IS_ERR(p)) return PTR_ERR(p); ret = -ENOMEM; clone = posix_acl_clone(p, GFP_NOFS); if (!clone) goto err_release; ret = posix_acl_create_masq(clone, mode); if (ret < 0) goto err_release_clone; if (ret == 0) posix_acl_release(clone); else *acl = clone; if (!S_ISDIR(*mode)) posix_acl_release(p); else *default_acl = p; return 0; err_release_clone: posix_acl_release(clone); err_release: posix_acl_release(p); return ret; } EXPORT_SYMBOL_GPL(posix_acl_create); /** * posix_acl_update_mode - update mode in set_acl * @idmap: idmap of the mount @inode was found from * @inode: target inode * @mode_p: mode (pointer) for update * @acl: acl pointer * * Update the file mode when setting an ACL: compute the new file permission * bits based on the ACL. In addition, if the ACL is equivalent to the new * file mode, set *@acl to NULL to indicate that no ACL should be set. * * As with chmod, clear the setgid bit if the caller is not in the owning group * or capable of CAP_FSETID (see inode_change_ok). * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * Called from set_acl inode operations. */ int posix_acl_update_mode(struct mnt_idmap *idmap, struct inode *inode, umode_t *mode_p, struct posix_acl **acl) { umode_t mode = inode->i_mode; int error; error = posix_acl_equiv_mode(*acl, &mode); if (error < 0) return error; if (error == 0) *acl = NULL; if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; *mode_p = mode; return 0; } EXPORT_SYMBOL(posix_acl_update_mode); /* * Fix up the uids and gids in posix acl extended attributes in place. */ static int posix_acl_fix_xattr_common(const void *value, size_t size) { const struct posix_acl_xattr_header *header = value; int count; if (!header) return -EINVAL; if (size < sizeof(struct posix_acl_xattr_header)) return -EINVAL; if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) return -EOPNOTSUPP; count = posix_acl_xattr_count(size); if (count < 0) return -EINVAL; if (count == 0) return 0; return count; } /** * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format * @userns: the filesystem's idmapping * @value: the uapi representation of POSIX ACLs * @size: the size of @void * * Filesystems that store POSIX ACLs in the unaltered uapi format should use * posix_acl_from_xattr() when reading them from the backing store and * converting them into the struct posix_acl VFS format. The helper is * specifically intended to be called from the acl inode operation. * * The posix_acl_from_xattr() function will map the raw {g,u}id values stored * in ACL_{GROUP,USER} entries into idmapping in @userns. * * Note that posix_acl_from_xattr() does not take idmapped mounts into account. * If it did it calling it from the get acl inode operation would return POSIX * ACLs mapped according to an idmapped mount which would mean that the value * couldn't be cached for the filesystem. Idmapped mounts are taken into * account on the fly during permission checking or right at the VFS - * userspace boundary before reporting them to the user. * * Return: Allocated struct posix_acl on success, NULL for a valid header but * without actual POSIX ACL entries, or ERR_PTR() encoded error code. */ struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns, const void *value, size_t size) { const struct posix_acl_xattr_header *header = value; const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end; int count; struct posix_acl *acl; struct posix_acl_entry *acl_e; count = posix_acl_fix_xattr_common(value, size); if (count < 0) return ERR_PTR(count); if (count == 0) return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); acl_e = acl->a_entries; for (end = entry + count; entry != end; acl_e++, entry++) { acl_e->e_tag = le16_to_cpu(entry->e_tag); acl_e->e_perm = le16_to_cpu(entry->e_perm); switch(acl_e->e_tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: break; case ACL_USER: acl_e->e_uid = make_kuid(userns, le32_to_cpu(entry->e_id)); if (!uid_valid(acl_e->e_uid)) goto fail; break; case ACL_GROUP: acl_e->e_gid = make_kgid(userns, le32_to_cpu(entry->e_id)); if (!gid_valid(acl_e->e_gid)) goto fail; break; default: goto fail; } } return acl; fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } EXPORT_SYMBOL (posix_acl_from_xattr); /* * Convert from in-memory to extended attribute representation. */ int posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, void *buffer, size_t size) { struct posix_acl_xattr_header *ext_acl = buffer; struct posix_acl_xattr_entry *ext_entry; int real_size, n; real_size = posix_acl_xattr_size(acl->a_count); if (!buffer) return real_size; if (real_size > size) return -ERANGE; ext_entry = (void *)(ext_acl + 1); ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); for (n=0; n < acl->a_count; n++, ext_entry++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); switch(acl_e->e_tag) { case ACL_USER: ext_entry->e_id = cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); break; case ACL_GROUP: ext_entry->e_id = cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); break; default: ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); break; } } return real_size; } EXPORT_SYMBOL (posix_acl_to_xattr); /** * vfs_posix_acl_to_xattr - convert from kernel to userspace representation * @idmap: idmap of the mount * @inode: inode the posix acls are set on * @acl: the posix acls as represented by the vfs * @buffer: the buffer into which to convert @acl * @size: size of @buffer * * This converts @acl from the VFS representation in the filesystem idmapping * to the uapi form reportable to userspace. And mount and caller idmappings * are handled appropriately. * * Return: On success, the size of the stored uapi posix acls, on error a * negative errno. */ static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, struct inode *inode, const struct posix_acl *acl, void *buffer, size_t size) { struct posix_acl_xattr_header *ext_acl = buffer; struct posix_acl_xattr_entry *ext_entry; struct user_namespace *fs_userns, *caller_userns; ssize_t real_size, n; vfsuid_t vfsuid; vfsgid_t vfsgid; real_size = posix_acl_xattr_size(acl->a_count); if (!buffer) return real_size; if (real_size > size) return -ERANGE; ext_entry = (void *)(ext_acl + 1); ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); fs_userns = i_user_ns(inode); caller_userns = current_user_ns(); for (n=0; n < acl->a_count; n++, ext_entry++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); switch(acl_e->e_tag) { case ACL_USER: vfsuid = make_vfsuid(idmap, fs_userns, acl_e->e_uid); ext_entry->e_id = cpu_to_le32(from_kuid( caller_userns, vfsuid_into_kuid(vfsuid))); break; case ACL_GROUP: vfsgid = make_vfsgid(idmap, fs_userns, acl_e->e_gid); ext_entry->e_id = cpu_to_le32(from_kgid( caller_userns, vfsgid_into_kgid(vfsgid))); break; default: ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); break; } } return real_size; } int set_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type, struct posix_acl *acl) { struct inode *inode = d_inode(dentry); if (!IS_POSIXACL(inode)) return -EOPNOTSUPP; if (!inode->i_op->set_acl) return -EOPNOTSUPP; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (acl) { int ret = posix_acl_valid(inode->i_sb->s_user_ns, acl); if (ret) return ret; } return inode->i_op->set_acl(idmap, dentry, acl, type); } EXPORT_SYMBOL(set_posix_acl); int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size) { int err; if (!IS_POSIXACL(inode)) return 0; if (inode->i_acl) { err = xattr_list_one(buffer, remaining_size, XATTR_NAME_POSIX_ACL_ACCESS); if (err) return err; } if (inode->i_default_acl) { err = xattr_list_one(buffer, remaining_size, XATTR_NAME_POSIX_ACL_DEFAULT); if (err) return err; } return 0; } static bool posix_acl_xattr_list(struct dentry *dentry) { return IS_POSIXACL(d_backing_inode(dentry)); } /* * nop_posix_acl_access - legacy xattr handler for access POSIX ACLs * * This is the legacy POSIX ACL access xattr handler. It is used by some * filesystems to implement their ->listxattr() inode operation. New code * should never use them. */ const struct xattr_handler nop_posix_acl_access = { .name = XATTR_NAME_POSIX_ACL_ACCESS, .list = posix_acl_xattr_list, }; EXPORT_SYMBOL_GPL(nop_posix_acl_access); /* * nop_posix_acl_default - legacy xattr handler for default POSIX ACLs * * This is the legacy POSIX ACL default xattr handler. It is used by some * filesystems to implement their ->listxattr() inode operation. New code * should never use them. */ const struct xattr_handler nop_posix_acl_default = { .name = XATTR_NAME_POSIX_ACL_DEFAULT, .list = posix_acl_xattr_list, }; EXPORT_SYMBOL_GPL(nop_posix_acl_default); int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error; struct inode *inode = d_inode(dentry); if (type == ACL_TYPE_ACCESS) { error = posix_acl_update_mode(idmap, inode, &inode->i_mode, &acl); if (error) return error; } inode_set_ctime_current(inode); if (IS_I_VERSION(inode)) inode_inc_iversion(inode); set_cached_acl(inode, type, acl); return 0; } int simple_acl_create(struct inode *dir, struct inode *inode) { struct posix_acl *default_acl, *acl; int error; error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); set_cached_acl(inode, ACL_TYPE_ACCESS, acl); if (default_acl) posix_acl_release(default_acl); if (acl) posix_acl_release(acl); return 0; } static int vfs_set_acl_idmapped_mnt(struct mnt_idmap *idmap, struct user_namespace *fs_userns, struct posix_acl *acl) { for (int n = 0; n < acl->a_count; n++) { struct posix_acl_entry *acl_e = &acl->a_entries[n]; switch (acl_e->e_tag) { case ACL_USER: acl_e->e_uid = from_vfsuid(idmap, fs_userns, VFSUIDT_INIT(acl_e->e_uid)); break; case ACL_GROUP: acl_e->e_gid = from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(acl_e->e_gid)); break; } } return 0; } /** * vfs_set_acl - set posix acls * @idmap: idmap of the mount * @dentry: the dentry based on which to set the posix acls * @acl_name: the name of the posix acl * @kacl: the posix acls in the appropriate VFS format * * This function sets @kacl. The caller must all posix_acl_release() on @kacl * afterwards. * * Return: On success 0, on error negative errno. */ int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { int acl_type; int error; struct inode *inode = d_inode(dentry); struct inode *delegated_inode = NULL; acl_type = posix_acl_type(acl_name); if (acl_type < 0) return -EINVAL; if (kacl) { /* * If we're on an idmapped mount translate from mount specific * vfs{g,u}id_t into global filesystem k{g,u}id_t. * Afterwards we can cache the POSIX ACLs filesystem wide and - * if this is a filesystem with a backing store - ultimately * translate them to backing store values. */ error = vfs_set_acl_idmapped_mnt(idmap, i_user_ns(inode), kacl); if (error) return error; } retry_deleg: inode_lock(inode); /* * We only care about restrictions the inode struct itself places upon * us otherwise POSIX ACLs aren't subject to any VFS restrictions. */ error = may_write_xattr(idmap, inode); if (error) goto out_inode_unlock; error = security_inode_set_acl(idmap, dentry, acl_name, kacl); if (error) goto out_inode_unlock; error = try_break_deleg(inode, &delegated_inode); if (error) goto out_inode_unlock; if (likely(!is_bad_inode(inode))) error = set_posix_acl(idmap, dentry, acl_type, kacl); else error = -EIO; if (!error) { fsnotify_xattr(dentry); security_inode_post_set_acl(dentry, acl_name, kacl); } out_inode_unlock: inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_set_acl); /** * vfs_get_acl - get posix acls * @idmap: idmap of the mount * @dentry: the dentry based on which to retrieve the posix acls * @acl_name: the name of the posix acl * * This function retrieves @kacl from the filesystem. The caller must all * posix_acl_release() on @kacl. * * Return: On success POSIX ACLs in VFS format, on error negative errno. */ struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { struct inode *inode = d_inode(dentry); struct posix_acl *acl; int acl_type, error; acl_type = posix_acl_type(acl_name); if (acl_type < 0) return ERR_PTR(-EINVAL); /* * The VFS has no restrictions on reading POSIX ACLs so calling * something like xattr_permission() isn't needed. Only LSMs get a say. */ error = security_inode_get_acl(idmap, dentry, acl_name); if (error) return ERR_PTR(error); if (!IS_POSIXACL(inode)) return ERR_PTR(-EOPNOTSUPP); if (S_ISLNK(inode->i_mode)) return ERR_PTR(-EOPNOTSUPP); acl = __get_acl(idmap, dentry, inode, acl_type); if (IS_ERR(acl)) return acl; if (!acl) return ERR_PTR(-ENODATA); return acl; } EXPORT_SYMBOL_GPL(vfs_get_acl); /** * vfs_remove_acl - remove posix acls * @idmap: idmap of the mount * @dentry: the dentry based on which to retrieve the posix acls * @acl_name: the name of the posix acl * * This function removes posix acls. * * Return: On success 0, on error negative errno. */ int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { int acl_type; int error; struct inode *inode = d_inode(dentry); struct inode *delegated_inode = NULL; acl_type = posix_acl_type(acl_name); if (acl_type < 0) return -EINVAL; retry_deleg: inode_lock(inode); /* * We only care about restrictions the inode struct itself places upon * us otherwise POSIX ACLs aren't subject to any VFS restrictions. */ error = may_write_xattr(idmap, inode); if (error) goto out_inode_unlock; error = security_inode_remove_acl(idmap, dentry, acl_name); if (error) goto out_inode_unlock; error = try_break_deleg(inode, &delegated_inode); if (error) goto out_inode_unlock; if (likely(!is_bad_inode(inode))) error = set_posix_acl(idmap, dentry, acl_type, NULL); else error = -EIO; if (!error) { fsnotify_xattr(dentry); security_inode_post_remove_acl(idmap, dentry, acl_name); } out_inode_unlock: inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_remove_acl); int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size) { int error; struct posix_acl *acl = NULL; if (size) { /* * Note that posix_acl_from_xattr() uses GFP_NOFS when it * probably doesn't need to here. */ acl = posix_acl_from_xattr(current_user_ns(), kvalue, size); if (IS_ERR(acl)) return PTR_ERR(acl); } error = vfs_set_acl(idmap, dentry, acl_name, acl); posix_acl_release(acl); return error; } ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size) { ssize_t error; struct posix_acl *acl; acl = vfs_get_acl(idmap, dentry, acl_name); if (IS_ERR(acl)) return PTR_ERR(acl); error = vfs_posix_acl_to_xattr(idmap, d_inode(dentry), acl, kvalue, size); posix_acl_release(acl); return error; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 /* SPDX-License-Identifier: GPL-2.0 */ /* Interface for implementing AF_XDP zero-copy support in drivers. * Copyright(c) 2020 Intel Corporation. */ #ifndef _LINUX_XDP_SOCK_DRV_H #define _LINUX_XDP_SOCK_DRV_H #include <net/xdp_sock.h> #include <net/xsk_buff_pool.h> #define XDP_UMEM_MIN_CHUNK_SHIFT 11 #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT) struct xsk_cb_desc { void *src; u8 off; u8 bytes; }; #ifdef CONFIG_XDP_SOCKETS void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id); void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool); void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool); void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool); void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool); bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool); static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { return XDP_PACKET_HEADROOM + pool->headroom; } static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return pool->chunk_size; } static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); } static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { xp_set_rxq_info(pool, rxq); } static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc) { xp_fill_cb(pool, desc); } static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { xp_dma_unmap(pool, attrs); } static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs) { struct xdp_umem *umem = pool->umem; return xp_dma_map(pool, dev, attrs, umem->pgs, umem->npgs); } static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); return xp_get_dma(xskb); } static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); return xp_get_frame_dma(xskb); } static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { return xp_alloc(pool); } static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return !xp_mb_desc(desc); } /* Returns as many entries as possible up to max. 0 <= N <= max. */ static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { return xp_alloc_batch(pool, xdp, max); } static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return xp_can_alloc(pool, count); } static inline void xsk_buff_free(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); struct list_head *xskb_list = &xskb->pool->xskb_list; struct xdp_buff_xsk *pos, *tmp; if (likely(!xdp_buff_has_frags(xdp))) goto out; list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { list_del(&pos->list_node); xp_free(pos); } xdp_get_shared_info_from_buff(xdp)->nr_frags = 0; out: xp_free(xskb); } static inline bool xsk_buff_add_frag(struct xdp_buff *head, struct xdp_buff *xdp) { const void *data = xdp->data; struct xdp_buff_xsk *frag; if (!__xdp_buff_add_frag(head, virt_to_netmem(data), offset_in_page(data), xdp->data_end - data, xdp->frame_sz, false)) return false; frag = container_of(xdp, struct xdp_buff_xsk, xdp); list_add_tail(&frag->list_node, &frag->pool->xskb_list); return true; } static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); struct xdp_buff *ret = NULL; struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, struct xdp_buff_xsk, list_node); if (frag) { list_del(&frag->list_node); ret = &frag->xdp; } return ret; } static inline void xsk_buff_del_tail(struct xdp_buff *tail) { struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); list_del(&xskb->list_node); } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); struct xdp_buff_xsk *frag; frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, list_node); return &frag->xdp; } static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; xdp->data_meta = xdp->data; xdp->data_end = xdp->data + size; xdp->flags = 0; } static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) { return xp_raw_get_dma(pool, addr); } static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { return xp_raw_get_data(pool, addr); } #define XDP_TXMD_FLAGS_VALID ( \ XDP_TXMD_FLAGS_TIMESTAMP | \ XDP_TXMD_FLAGS_CHECKSUM | \ 0) static inline bool xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta) { return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); } static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) { struct xsk_tx_metadata *meta; if (!pool->tx_metadata_len) return NULL; meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len; if (unlikely(!xsk_buff_valid_tx_metadata(meta))) return NULL; /* no way to signal the error to the user */ return meta; } static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); xp_dma_sync_for_cpu(xskb); } static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { xp_dma_sync_for_device(pool, dma, size); } #else static inline void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) { } static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { return false; } static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max) { return 0; } static inline void xsk_tx_release(struct xsk_buff_pool *pool) { } static inline struct xsk_buff_pool * xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { return NULL; } static inline void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { } static inline void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) { } static inline void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) { } static inline void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) { } static inline bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) { return false; } static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { return 0; } static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return 0; } static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return 0; } static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { } static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc) { } static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { } static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs) { return 0; } static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) { return 0; } static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) { return 0; } static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { return NULL; } static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return false; } static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { return 0; } static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return false; } static inline void xsk_buff_free(struct xdp_buff *xdp) { } static inline bool xsk_buff_add_frag(struct xdp_buff *head, struct xdp_buff *xdp) { return false; } static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { return NULL; } static inline void xsk_buff_del_tail(struct xdp_buff *tail) { } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) { return NULL; } static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { } static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) { return 0; } static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { return NULL; } static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) { return false; } static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) { return NULL; } static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { } static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { } #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_DRV_H */
12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #ifndef _NET_BATMAN_ADV_MAIN_H_ #define _NET_BATMAN_ADV_MAIN_H_ #define BATADV_DRIVER_AUTHOR "Marek Lindner <marek.lindner@mailbox.org>, " \ "Simon Wunderlich <sw@simonwunderlich.de>" #define BATADV_DRIVER_DESC "B.A.T.M.A.N. advanced" #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION #define BATADV_SOURCE_VERSION "2025.0" #endif /* B.A.T.M.A.N. parameters */ #define BATADV_TQ_MAX_VALUE 255 #define BATADV_THROUGHPUT_MAX_VALUE 0xFFFFFFFF #define BATADV_JITTER 20 /* Time To Live of broadcast messages */ #define BATADV_TTL 50 /* maximum sequence number age of broadcast messages */ #define BATADV_BCAST_MAX_AGE 64 /* purge originators after time in seconds if no valid packet comes in * -> TODO: check influence on BATADV_TQ_LOCAL_WINDOW_SIZE */ #define BATADV_PURGE_TIMEOUT 200000 /* 200 seconds */ #define BATADV_TT_LOCAL_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_CLIENT_ROAM_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */ #define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */ #define BATADV_MCAST_WORK_PERIOD 500 /* 0.5 seconds */ #define BATADV_DAT_ENTRY_TIMEOUT (5 * 60000) /* 5 mins in milliseconds */ /* sliding packet range of received originator messages in sequence numbers * (should be a multiple of our word size) */ #define BATADV_TQ_LOCAL_WINDOW_SIZE 64 /* milliseconds we have to keep pending tt_req */ #define BATADV_TT_REQUEST_TIMEOUT 3000 #define BATADV_TQ_GLOBAL_WINDOW_SIZE 5 #define BATADV_TQ_LOCAL_BIDRECT_SEND_MINIMUM 1 #define BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM 1 #define BATADV_TQ_TOTAL_BIDRECT_LIMIT 1 /* B.A.T.M.A.N. V */ #define BATADV_THROUGHPUT_DEFAULT_VALUE 10 /* 1 Mbps */ #define BATADV_ELP_PROBES_PER_NODE 2 #define BATADV_ELP_MIN_PROBE_SIZE 200 /* bytes */ #define BATADV_ELP_PROBE_MAX_TX_DIFF 100 /* milliseconds */ #define BATADV_ELP_MAX_AGE 64 #define BATADV_OGM_MAX_ORIGDIFF 5 #define BATADV_OGM_MAX_AGE 64 /* number of OGMs sent with the last tt diff */ #define BATADV_TT_OGM_APPEND_MAX 3 /* Time in which a client can roam at most ROAMING_MAX_COUNT times in * milliseconds */ #define BATADV_ROAMING_MAX_TIME 20000 #define BATADV_ROAMING_MAX_COUNT 5 #define BATADV_NO_FLAGS 0 #define BATADV_NULL_IFINDEX 0 /* dummy ifindex used to avoid iface checks */ #define BATADV_NO_MARK 0 /* default interface for multi interface operation. The default interface is * used for communication which originated locally (i.e. is not forwarded) * or where special forwarding is not desired/necessary. */ #define BATADV_IF_DEFAULT ((struct batadv_hard_iface *)NULL) #define BATADV_NUM_WORDS BITS_TO_LONGS(BATADV_TQ_LOCAL_WINDOW_SIZE) #define BATADV_LOG_BUF_LEN 8192 /* has to be a power of 2 */ /* number of packets to send for broadcasts on different interface types */ #define BATADV_NUM_BCASTS_DEFAULT 1 #define BATADV_NUM_BCASTS_WIRELESS 3 /* length of the single packet used by the TP meter */ #define BATADV_TP_PACKET_LEN ETH_DATA_LEN /* msecs after which an ARP_REQUEST is sent in broadcast as fallback */ #define ARP_REQ_DELAY 250 /* numbers of originator to contact for any PUT/GET DHT operation */ #define BATADV_DAT_CANDIDATES_NUM 3 /* BATADV_TQ_SIMILARITY_THRESHOLD - TQ points that a secondary metric can differ * at most from the primary one in order to be still considered acceptable */ #define BATADV_TQ_SIMILARITY_THRESHOLD 50 /* should not be bigger than 512 bytes or change the size of * forw_packet->direct_link_flags */ #define BATADV_MAX_AGGREGATION_BYTES 512 #define BATADV_MAX_AGGREGATION_MS 100 #define BATADV_BLA_PERIOD_LENGTH 10000 /* 10 seconds */ #define BATADV_BLA_BACKBONE_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 6) #define BATADV_BLA_CLAIM_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 10) #define BATADV_BLA_WAIT_PERIODS 3 #define BATADV_BLA_LOOPDETECT_PERIODS 6 #define BATADV_BLA_LOOPDETECT_TIMEOUT 3000 /* 3 seconds */ #define BATADV_DUPLIST_SIZE 16 #define BATADV_DUPLIST_TIMEOUT 500 /* 500 ms */ /* don't reset again within 30 seconds */ #define BATADV_RESET_PROTECTION_MS 30000 #define BATADV_EXPECTED_SEQNO_RANGE 65536 #define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */ /** * BATADV_TP_MAX_NUM - maximum number of simultaneously active tp sessions */ #define BATADV_TP_MAX_NUM 5 /** * enum batadv_mesh_state - State of a soft interface */ enum batadv_mesh_state { /** @BATADV_MESH_INACTIVE: soft interface is not yet running */ BATADV_MESH_INACTIVE, /** @BATADV_MESH_ACTIVE: interface is up and running */ BATADV_MESH_ACTIVE, /** @BATADV_MESH_DEACTIVATING: interface is getting shut down */ BATADV_MESH_DEACTIVATING, }; #define BATADV_BCAST_QUEUE_LEN 256 #define BATADV_BATMAN_QUEUE_LEN 256 /** * enum batadv_uev_action - action type of uevent */ enum batadv_uev_action { /** @BATADV_UEV_ADD: gateway was selected (after none was selected) */ BATADV_UEV_ADD = 0, /** * @BATADV_UEV_DEL: selected gateway was removed and none is selected * anymore */ BATADV_UEV_DEL, /** * @BATADV_UEV_CHANGE: a different gateway was selected as based gateway */ BATADV_UEV_CHANGE, /** * @BATADV_UEV_LOOPDETECT: loop was detected which cannot be handled by * bridge loop avoidance */ BATADV_UEV_LOOPDETECT, }; /** * enum batadv_uev_type - Type of uevent */ enum batadv_uev_type { /** @BATADV_UEV_GW: selected gateway was modified */ BATADV_UEV_GW = 0, /** @BATADV_UEV_BLA: bridge loop avoidance event */ BATADV_UEV_BLA, }; #define BATADV_GW_THRESHOLD 50 /* Number of fragment chains for each orig_node */ #define BATADV_FRAG_BUFFER_COUNT 8 /* Maximum number of fragments for one packet */ #define BATADV_FRAG_MAX_FRAGMENTS 16 /* Maxumim size of each fragment */ #define BATADV_FRAG_MAX_FRAG_SIZE 1280 /* Time to keep fragments while waiting for rest of the fragments */ #define BATADV_FRAG_TIMEOUT 10000 #define BATADV_DAT_CANDIDATE_NOT_FOUND 0 #define BATADV_DAT_CANDIDATE_ORIG 1 /* Debug Messages */ #ifdef pr_fmt #undef pr_fmt #endif /* Append 'batman-adv: ' before kernel messages */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* Kernel headers */ #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/etherdevice.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/netdevice.h> #include <linux/percpu.h> #include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> #include "types.h" #include "main.h" /** * batadv_print_vid() - return printable version of vid information * @vid: the VLAN identifier * * Return: -1 when no VLAN is used, VLAN id otherwise */ static inline int batadv_print_vid(unsigned short vid) { if (vid & BATADV_VLAN_HAS_TAG) return (int)(vid & VLAN_VID_MASK); else return -1; } extern struct list_head batadv_hardif_list; extern unsigned int batadv_hardif_generation; extern unsigned char batadv_broadcast_addr[]; extern struct workqueue_struct *batadv_event_workqueue; int batadv_mesh_init(struct net_device *soft_iface); void batadv_mesh_free(struct net_device *soft_iface); bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr); int batadv_max_header_len(void); void batadv_skb_set_priority(struct sk_buff *skb, int offset); int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev); int batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)); void batadv_recv_handler_unregister(u8 packet_type); __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr); /** * batadv_compare_eth() - Compare two not u16 aligned Ethernet addresses * @data1: Pointer to a six-byte array containing the Ethernet address * @data2: Pointer other six-byte array containing the Ethernet address * * note: can't use ether_addr_equal() as it requires aligned memory * * Return: true if they are the same ethernet addr */ static inline bool batadv_compare_eth(const void *data1, const void *data2) { return ether_addr_equal_unaligned(data1, data2); } /** * batadv_has_timed_out() - compares current time (jiffies) and timestamp + * timeout * @timestamp: base value to compare with (in jiffies) * @timeout: added to base value before comparing (in milliseconds) * * Return: true if current time is after timestamp + timeout */ static inline bool batadv_has_timed_out(unsigned long timestamp, unsigned int timeout) { return time_is_before_jiffies(timestamp + msecs_to_jiffies(timeout)); } /** * batadv_atomic_dec_not_zero() - Decrease unless the number is 0 * @v: pointer of type atomic_t * * Return: non-zero if v was not 0, and zero otherwise. */ #define batadv_atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0) /** * batadv_smallest_signed_int() - Returns the smallest signed integer in two's * complement with the sizeof x * @x: type of integer * * Return: smallest signed integer of type */ #define batadv_smallest_signed_int(x) (1u << (7u + 8u * (sizeof(x) - 1u))) /** * batadv_seq_before() - Checks if a sequence number x is a predecessor of y * @x: potential predecessor of @y * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a predecessor * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: * * * when adding nothing - it is neither a predecessor nor a successor * * before adding more than 127 to the starting value - it is a predecessor, * * when adding 128 - it is neither a predecessor nor a successor, * * after adding more than 127 to the starting value - it is a successor * * Return: true when x is a predecessor of y, false otherwise */ #define batadv_seq_before(x, y) ({ \ typeof(x)_d1 = (x); \ typeof(y)_d2 = (y); \ typeof(x)_dummy = (_d1 - _d2); \ (void)(&_d1 == &_d2); \ _dummy > batadv_smallest_signed_int(_dummy); \ }) /** * batadv_seq_after() - Checks if a sequence number x is a successor of y * @x: potential successor of @y * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a successor * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: * * * when adding nothing - it is neither a predecessor nor a successor * * before adding more than 127 to the starting value - it is a predecessor, * * when adding 128 - it is neither a predecessor nor a successor, * * after adding more than 127 to the starting value - it is a successor * * Return: true when x is a successor of y, false otherwise */ #define batadv_seq_after(x, y) batadv_seq_before(y, x) /** * batadv_add_counter() - Add to per cpu statistics counter of soft interface * @bat_priv: the bat priv with all the soft interface information * @idx: counter index which should be modified * @count: value to increase counter by * * Stop preemption on local cpu while incrementing the counter */ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx, size_t count) { this_cpu_add(bat_priv->bat_counters[idx], count); } /** * batadv_inc_counter() - Increase per cpu statistics counter of soft interface * @b: the bat priv with all the soft interface information * @i: counter index which should be modified */ #define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1) /** * BATADV_SKB_CB() - Get batadv_skb_cb from skb control buffer * @__skb: skb holding the control buffer * * The members of the control buffer are defined in struct batadv_skb_cb in * types.h. The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h. * * Return: pointer to the batadv_skb_cb of the skb */ #define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0])) unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len); bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid); int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, enum batadv_uev_action action, const char *data); #endif /* _NET_BATMAN_ADV_MAIN_H_ */
3 18 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 #ifndef __NET_TUN_PROTO_H #define __NET_TUN_PROTO_H #include <linux/if_ether.h> #include <linux/types.h> /* One byte protocol values as defined by VXLAN-GPE and NSH. These will * hopefully get a shared IANA registry. */ #define TUN_P_IPV4 0x01 #define TUN_P_IPV6 0x02 #define TUN_P_ETHERNET 0x03 #define TUN_P_NSH 0x04 #define TUN_P_MPLS_UC 0x05 static inline __be16 tun_p_to_eth_p(u8 proto) { switch (proto) { case TUN_P_IPV4: return htons(ETH_P_IP); case TUN_P_IPV6: return htons(ETH_P_IPV6); case TUN_P_ETHERNET: return htons(ETH_P_TEB); case TUN_P_NSH: return htons(ETH_P_NSH); case TUN_P_MPLS_UC: return htons(ETH_P_MPLS_UC); } return 0; } static inline u8 tun_p_from_eth_p(__be16 proto) { switch (proto) { case htons(ETH_P_IP): return TUN_P_IPV4; case htons(ETH_P_IPV6): return TUN_P_IPV6; case htons(ETH_P_TEB): return TUN_P_ETHERNET; case htons(ETH_P_NSH): return TUN_P_NSH; case htons(ETH_P_MPLS_UC): return TUN_P_MPLS_UC; } return 0; } #endif
132 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 /* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/xattr.h Extended attributes handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #ifndef _LINUX_XATTR_H #define _LINUX_XATTR_H #include <linux/slab.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/user_namespace.h> #include <uapi/linux/xattr.h> /* List of all open_how "versions". */ #define XATTR_ARGS_SIZE_VER0 16 /* sizeof first published struct */ #define XATTR_ARGS_SIZE_LATEST XATTR_ARGS_SIZE_VER0 struct inode; struct dentry; static inline bool is_posix_acl_xattr(const char *name) { return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); } /* * struct xattr_handler: When @name is set, match attributes with exactly that * name. When @prefix is set instead, match attributes with that prefix and * with a non-empty suffix. */ struct xattr_handler { const char *name; const char *prefix; int flags; /* fs private flags */ bool (*list)(struct dentry *dentry); int (*get)(const struct xattr_handler *, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags); }; /** * xattr_handler_can_list - check whether xattr can be listed * @handler: handler for this type of xattr * @dentry: dentry whose inode xattr to list * * Determine whether the xattr associated with @dentry can be listed given * @handler. * * Return: true if xattr can be listed, false if not. */ static inline bool xattr_handler_can_list(const struct xattr_handler *handler, struct dentry *dentry) { return handler && (!handler->list || handler->list(dentry)); } const char *xattr_full_name(const struct xattr_handler *, const char *); struct xattr { const char *name; void *value; size_t value_len; }; ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t); ssize_t vfs_getxattr(struct mnt_idmap *, struct dentry *, const char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); int __vfs_setxattr(struct mnt_idmap *, struct dentry *, struct inode *, const char *, const void *, size_t, int); int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int, struct inode **); int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *, const char *, struct inode **); int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); int xattr_supports_user_prefix(struct inode *inode); static inline const char *xattr_prefix(const struct xattr_handler *handler) { return handler->prefix ?: handler->name; } struct simple_xattrs { struct rb_root rb_root; rwlock_t lock; }; struct simple_xattr { struct rb_node rb_node; char *name; size_t size; char value[]; }; void simple_xattrs_init(struct simple_xattrs *xattrs); void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); #endif /* _LINUX_XATTR_H */
62 1319 1366 2 1361 1365 14 1361 1323 1321 1318 1320 1323 1320 1322 3 1319 1323 4 1320 1324 1322 1321 1323 1322 1321 1322 1318 1321 4 1319 2391 2392 48 48 48 48 48 48 1 47 48 48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/uaccess.h> #include <linux/fs_struct.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/prefetch.h> #include "mount.h" #include "internal.h" struct prepend_buffer { char *buf; int len; }; #define DECLARE_BUFFER(__name, __buf, __len) \ struct prepend_buffer __name = {.buf = __buf + __len, .len = __len} static char *extract_string(struct prepend_buffer *p) { if (likely(p->len >= 0)) return p->buf; return ERR_PTR(-ENAMETOOLONG); } static bool prepend_char(struct prepend_buffer *p, unsigned char c) { if (likely(p->len > 0)) { p->len--; *--p->buf = c; return true; } p->len = -1; return false; } /* * The source of the prepend data can be an optimistic load * of a dentry name and length. And because we don't hold any * locks, the length and the pointer to the name may not be * in sync if a concurrent rename happens, and the kernel * copy might fault as a result. * * The end result will correct itself when we check the * rename sequence count, but we need to be able to handle * the fault gracefully. */ static bool prepend_copy(void *dst, const void *src, int len) { if (unlikely(copy_from_kernel_nofault(dst, src, len))) { memset(dst, 'x', len); return false; } return true; } static bool prepend(struct prepend_buffer *p, const char *str, int namelen) { // Already overflowed? if (p->len < 0) return false; // Will overflow? if (p->len < namelen) { // Fill as much as possible from the end of the name str += namelen - p->len; p->buf -= p->len; prepend_copy(p->buf, str, p->len); p->len = -1; return false; } // Fits fully p->len -= namelen; p->buf -= namelen; return prepend_copy(p->buf, str, namelen); } /** * prepend_name - prepend a pathname in front of current buffer pointer * @p: prepend buffer which contains buffer pointer and allocated length * @name: name string and length qstr structure * * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are * fetched. However, there may be mismatch between length and pointer. * But since the length cannot be trusted, we need to copy the name very * carefully when doing the prepend_copy(). It also prepends "/" at * the beginning of the name. The sequence number check at the caller will * retry it again when a d_move() does happen. So any garbage in the buffer * due to mismatched pointer and length will be discarded. * * Load acquire is needed to make sure that we see the new name data even * if we might get the length wrong. */ static bool prepend_name(struct prepend_buffer *p, const struct qstr *name) { const char *dname = smp_load_acquire(&name->name); /* ^^^ */ u32 dlen = READ_ONCE(name->len); return prepend(p, dname, dlen) && prepend_char(p, '/'); } static int __prepend_path(const struct dentry *dentry, const struct mount *mnt, const struct path *root, struct prepend_buffer *p) { while (dentry != root->dentry || &mnt->mnt != root->mnt) { const struct dentry *parent = READ_ONCE(dentry->d_parent); if (dentry == mnt->mnt.mnt_root) { struct mount *m = READ_ONCE(mnt->mnt_parent); struct mnt_namespace *mnt_ns; if (likely(mnt != m)) { dentry = READ_ONCE(mnt->mnt_mountpoint); mnt = m; continue; } /* Global root */ mnt_ns = READ_ONCE(mnt->mnt_ns); /* open-coded is_mounted() to use local mnt_ns */ if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns)) return 1; // absolute root else return 2; // detached or not attached yet } if (unlikely(dentry == parent)) /* Escaped? */ return 3; prefetch(parent); if (!prepend_name(p, &dentry->d_name)) break; dentry = parent; } return 0; } /** * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry * @p: prepend buffer which contains buffer pointer and allocated length * * The function will first try to write out the pathname without taking any * lock other than the RCU read lock to make sure that dentries won't go away. * It only checks the sequence number of the global rename_lock as any change * in the dentry's d_seq will be preceded by changes in the rename_lock * sequence number. If the sequence number had been changed, it will restart * the whole pathname back-tracing sequence again by taking the rename_lock. * In this case, there is no need to take the RCU read lock as the recursive * parent pointer references will keep the dentry chain alive as long as no * rename operation is performed. */ static int prepend_path(const struct path *path, const struct path *root, struct prepend_buffer *p) { unsigned seq, m_seq = 0; struct prepend_buffer b; int error; rcu_read_lock(); restart_mnt: read_seqbegin_or_lock(&mount_lock, &m_seq); seq = 0; rcu_read_lock(); restart: b = *p; read_seqbegin_or_lock(&rename_lock, &seq); error = __prepend_path(path->dentry, real_mount(path->mnt), root, &b); if (!(seq & 1)) rcu_read_unlock(); if (need_seqretry(&rename_lock, seq)) { seq = 1; goto restart; } done_seqretry(&rename_lock, seq); if (!(m_seq & 1)) rcu_read_unlock(); if (need_seqretry(&mount_lock, m_seq)) { m_seq = 1; goto restart_mnt; } done_seqretry(&mount_lock, m_seq); if (unlikely(error == 3)) b = *p; if (b.len == p->len) prepend_char(&b, '/'); *p = b; return error; } /** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry * @buf: buffer to return value in * @buflen: buffer length * * Convert a dentry into an ASCII path name. * * Returns a pointer into the buffer or an error code if the * path was too long. * * "buflen" should be positive. * * If the path is not reachable from the supplied root, return %NULL. */ char *__d_path(const struct path *path, const struct path *root, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); if (unlikely(prepend_path(path, root, &b) > 0)) return NULL; return extract_string(&b); } char *d_absolute_path(const struct path *path, char *buf, int buflen) { struct path root = {}; DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); if (unlikely(prepend_path(path, &root, &b) > 1)) return ERR_PTR(-EINVAL); return extract_string(&b); } static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) { unsigned seq; do { seq = read_seqcount_begin(&fs->seq); *root = fs->root; } while (read_seqcount_retry(&fs->seq, seq)); } /** * d_path - return the path of a dentry * @path: path to report * @buf: buffer to return value in * @buflen: buffer length * * Convert a dentry into an ASCII path name. If the entry has been deleted * the string " (deleted)" is appended. Note that this is ambiguous. * * Returns a pointer into the buffer or an error code if the path was * too long. Note: Callers should use the returned pointer, not the passed * in buffer, to use the name! The implementation often starts at an offset * into the buffer, and may leave 0 bytes at the start. * * "buflen" should be positive. */ char *d_path(const struct path *path, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); struct path root; /* * We have various synthetic filesystems that never get mounted. On * these filesystems dentries are never used for lookup purposes, and * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: * * Some pseudo inodes are mountable. When they are mounted * path->dentry == path->mnt->mnt_root. In that case don't call d_dname * and instead have d_path return the mounted path. */ if (path->dentry->d_op && path->dentry->d_op->d_dname && (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); rcu_read_lock(); get_fs_root_rcu(current->fs, &root); if (unlikely(d_unlinked(path->dentry))) prepend(&b, " (deleted)", 11); else prepend_char(&b, 0); prepend_path(path, &root, &b); rcu_read_unlock(); return extract_string(&b); } EXPORT_SYMBOL(d_path); /* * Helper function for dentry_operations.d_dname() members */ char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...) { va_list args; char temp[64]; int sz; va_start(args, fmt); sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1; va_end(args); if (sz > sizeof(temp) || sz > buflen) return ERR_PTR(-ENAMETOOLONG); buffer += buflen - sz; return memcpy(buffer, temp, sz); } char *simple_dname(struct dentry *dentry, char *buffer, int buflen) { DECLARE_BUFFER(b, buffer, buflen); /* these dentries are never renamed, so d_lock is not needed */ prepend(&b, " (deleted)", 11); prepend(&b, dentry->d_name.name, dentry->d_name.len); prepend_char(&b, '/'); return extract_string(&b); } /* * Write full pathname from the root of the filesystem into the buffer. */ static char *__dentry_path(const struct dentry *d, struct prepend_buffer *p) { const struct dentry *dentry; struct prepend_buffer b; int seq = 0; rcu_read_lock(); restart: dentry = d; b = *p; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { const struct dentry *parent = dentry->d_parent; prefetch(parent); if (!prepend_name(&b, &dentry->d_name)) break; dentry = parent; } if (!(seq & 1)) rcu_read_unlock(); if (need_seqretry(&rename_lock, seq)) { seq = 1; goto restart; } done_seqretry(&rename_lock, seq); if (b.len == p->len) prepend_char(&b, '/'); return extract_string(&b); } char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); return __dentry_path(dentry, &b); } EXPORT_SYMBOL(dentry_path_raw); char *dentry_path(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); if (unlikely(d_unlinked(dentry))) prepend(&b, "//deleted", 10); else prepend_char(&b, 0); return __dentry_path(dentry, &b); } static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, struct path *pwd) { unsigned seq; do { seq = read_seqcount_begin(&fs->seq); *root = fs->root; *pwd = fs->pwd; } while (read_seqcount_retry(&fs->seq, seq)); } /* * NOTE! The user-level library version returns a * character pointer. The kernel system call just * returns the length of the buffer filled (which * includes the ending '\0' character), or a negative * error value. So libc would do something like * * char *getcwd(char * buf, size_t size) * { * int retval; * * retval = sys_getcwd(buf, size); * if (retval >= 0) * return buf; * errno = -retval; * return NULL; * } */ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) { int error; struct path pwd, root; char *page = __getname(); if (!page) return -ENOMEM; rcu_read_lock(); get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); if (unlikely(d_unlinked(pwd.dentry))) { rcu_read_unlock(); error = -ENOENT; } else { unsigned len; DECLARE_BUFFER(b, page, PATH_MAX); prepend_char(&b, 0); if (unlikely(prepend_path(&pwd, &root, &b) > 0)) prepend(&b, "(unreachable)", 13); rcu_read_unlock(); len = PATH_MAX - b.len; if (unlikely(len > PATH_MAX)) error = -ENAMETOOLONG; else if (unlikely(len > size)) error = -ERANGE; else if (copy_to_user(buf, b.buf, len)) error = -EFAULT; else error = len; } __putname(page); return error; }
53 53 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 /* SPDX-License-Identifier: GPL-2.0 * * IO cost model based controller. * * Copyright (C) 2019 Tejun Heo <tj@kernel.org> * Copyright (C) 2019 Andy Newell <newella@fb.com> * Copyright (C) 2019 Facebook * * One challenge of controlling IO resources is the lack of trivially * observable cost metric. This is distinguished from CPU and memory where * wallclock time and the number of bytes can serve as accurate enough * approximations. * * Bandwidth and iops are the most commonly used metrics for IO devices but * depending on the type and specifics of the device, different IO patterns * easily lead to multiple orders of magnitude variations rendering them * useless for the purpose of IO capacity distribution. While on-device * time, with a lot of clutches, could serve as a useful approximation for * non-queued rotational devices, this is no longer viable with modern * devices, even the rotational ones. * * While there is no cost metric we can trivially observe, it isn't a * complete mystery. For example, on a rotational device, seek cost * dominates while a contiguous transfer contributes a smaller amount * proportional to the size. If we can characterize at least the relative * costs of these different types of IOs, it should be possible to * implement a reasonable work-conserving proportional IO resource * distribution. * * 1. IO Cost Model * * IO cost model estimates the cost of an IO given its basic parameters and * history (e.g. the end sector of the last IO). The cost is measured in * device time. If a given IO is estimated to cost 10ms, the device should * be able to process ~100 of those IOs in a second. * * Currently, there's only one builtin cost model - linear. Each IO is * classified as sequential or random and given a base cost accordingly. * On top of that, a size cost proportional to the length of the IO is * added. While simple, this model captures the operational * characteristics of a wide varienty of devices well enough. Default * parameters for several different classes of devices are provided and the * parameters can be configured from userspace via * /sys/fs/cgroup/io.cost.model. * * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate * device-specific coefficients. * * 2. Control Strategy * * The device virtual time (vtime) is used as the primary control metric. * The control strategy is composed of the following three parts. * * 2-1. Vtime Distribution * * When a cgroup becomes active in terms of IOs, its hierarchical share is * calculated. Please consider the following hierarchy where the numbers * inside parentheses denote the configured weights. * * root * / \ * A (w:100) B (w:300) * / \ * A0 (w:100) A1 (w:100) * * If B is idle and only A0 and A1 are actively issuing IOs, as the two are * of equal weight, each gets 50% share. If then B starts issuing IOs, B * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest, * 12.5% each. The distribution mechanism only cares about these flattened * shares. They're called hweights (hierarchical weights) and always add * upto 1 (WEIGHT_ONE). * * A given cgroup's vtime runs slower in inverse proportion to its hweight. * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) * against the device vtime - an IO which takes 10ms on the underlying * device is considered to take 80ms on A0. * * This constitutes the basis of IO capacity distribution. Each cgroup's * vtime is running at a rate determined by its hweight. A cgroup tracks * the vtime consumed by past IOs and can issue a new IO if doing so * wouldn't outrun the current device vtime. Otherwise, the IO is * suspended until the vtime has progressed enough to cover it. * * 2-2. Vrate Adjustment * * It's unrealistic to expect the cost model to be perfect. There are too * many devices and even on the same device the overall performance * fluctuates depending on numerous factors such as IO mixture and device * internal garbage collection. The controller needs to adapt dynamically. * * This is achieved by adjusting the overall IO rate according to how busy * the device is. If the device becomes overloaded, we're sending down too * many IOs and should generally slow down. If there are waiting issuers * but the device isn't saturated, we're issuing too few and should * generally speed up. * * To slow down, we lower the vrate - the rate at which the device vtime * passes compared to the wall clock. For example, if the vtime is running * at the vrate of 75%, all cgroups added up would only be able to issue * 750ms worth of IOs per second, and vice-versa for speeding up. * * Device business is determined using two criteria - rq wait and * completion latencies. * * When a device gets saturated, the on-device and then the request queues * fill up and a bio which is ready to be issued has to wait for a request * to become available. When this delay becomes noticeable, it's a clear * indication that the device is saturated and we lower the vrate. This * saturation signal is fairly conservative as it only triggers when both * hardware and software queues are filled up, and is used as the default * busy signal. * * As devices can have deep queues and be unfair in how the queued commands * are executed, solely depending on rq wait may not result in satisfactory * control quality. For a better control quality, completion latency QoS * parameters can be configured so that the device is considered saturated * if N'th percentile completion latency rises above the set point. * * The completion latency requirements are a function of both the * underlying device characteristics and the desired IO latency quality of * service. There is an inherent trade-off - the tighter the latency QoS, * the higher the bandwidth lossage. Latency QoS is disabled by default * and can be set through /sys/fs/cgroup/io.cost.qos. * * 2-3. Work Conservation * * Imagine two cgroups A and B with equal weights. A is issuing a small IO * periodically while B is sending out enough parallel IOs to saturate the * device on its own. Let's say A's usage amounts to 100ms worth of IO * cost per second, i.e., 10% of the device capacity. The naive * distribution of half and half would lead to 60% utilization of the * device, a significant reduction in the total amount of work done * compared to free-for-all competition. This is too high a cost to pay * for IO control. * * To conserve the total amount of work done, we keep track of how much * each active cgroup is actually using and yield part of its weight if * there are other cgroups which can make use of it. In the above case, * A's weight will be lowered so that it hovers above the actual usage and * B would be able to use the rest. * * As we don't want to penalize a cgroup for donating its weight, the * surplus weight adjustment factors in a margin and has an immediate * snapback mechanism in case the cgroup needs more IO vtime for itself. * * Note that adjusting down surplus weights has the same effects as * accelerating vtime for other cgroups and work conservation can also be * implemented by adjusting vrate dynamically. However, squaring who can * donate and should take back how much requires hweight propagations * anyway making it easier to implement and understand as a separate * mechanism. * * 3. Monitoring * * Instead of debugfs or other clumsy monitoring mechanisms, this * controller uses a drgn based monitoring script - * tools/cgroup/iocost_monitor.py. For details on drgn, please see * https://github.com/osandov/drgn. The output looks like the following. * * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% * active weight hweight% inflt% dbt delay usages% * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033 * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077 * * - per : Timer period * - cur_per : Internal wall and device vtime clock * - vrate : Device virtual time rate against wall clock * - weight : Surplus-adjusted and configured weights * - hweight : Surplus-adjusted and configured hierarchical weights * - inflt : The percentage of in-flight IO cost at the end of last period * - del_ms : Deferred issuer delay induction level and duration * - usages : Usage history */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/time64.h> #include <linux/parser.h> #include <linux/sched/signal.h> #include <asm/local.h> #include <asm/local64.h> #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk-wbt.h" #include "blk-cgroup.h" #ifdef CONFIG_TRACEPOINTS /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */ #define TRACE_IOCG_PATH_LEN 1024 static DEFINE_SPINLOCK(trace_iocg_path_lock); static char trace_iocg_path[TRACE_IOCG_PATH_LEN]; #define TRACE_IOCG_PATH(type, iocg, ...) \ do { \ unsigned long flags; \ if (trace_iocost_##type##_enabled()) { \ spin_lock_irqsave(&trace_iocg_path_lock, flags); \ cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \ trace_iocg_path, TRACE_IOCG_PATH_LEN); \ trace_iocost_##type(iocg, trace_iocg_path, \ ##__VA_ARGS__); \ spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \ } \ } while (0) #else /* CONFIG_TRACE_POINTS */ #define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0) #endif /* CONFIG_TRACE_POINTS */ enum { MILLION = 1000000, /* timer period is calculated from latency requirements, bound it */ MIN_PERIOD = USEC_PER_MSEC, MAX_PERIOD = USEC_PER_SEC, /* * iocg->vtime is targeted at 50% behind the device vtime, which * serves as its IO credit buffer. Surplus weight adjustment is * immediately canceled if the vtime margin runs below 10%. */ MARGIN_MIN_PCT = 10, MARGIN_LOW_PCT = 20, MARGIN_TARGET_PCT = 50, INUSE_ADJ_STEP_PCT = 25, /* Have some play in timer operations */ TIMER_SLACK_PCT = 1, /* 1/64k is granular enough and can easily be handled w/ u32 */ WEIGHT_ONE = 1 << 16, }; enum { /* * As vtime is used to calculate the cost of each IO, it needs to * be fairly high precision. For example, it should be able to * represent the cost of a single page worth of discard with * suffificient accuracy. At the same time, it should be able to * represent reasonably long enough durations to be useful and * convenient during operation. * * 1s worth of vtime is 2^37. This gives us both sub-nanosecond * granularity and days of wrap-around time even at extreme vrates. */ VTIME_PER_SEC_SHIFT = 37, VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT, VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC, VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC, /* bound vrate adjustments within two orders of magnitude */ VRATE_MIN_PPM = 10000, /* 1% */ VRATE_MAX_PPM = 100000000, /* 10000% */ VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION, VRATE_CLAMP_ADJ_PCT = 4, /* switch iff the conditions are met for longer than this */ AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, }; enum { /* if IOs end up waiting for requests, issue less */ RQ_WAIT_BUSY_PCT = 5, /* unbusy hysterisis */ UNBUSY_THR_PCT = 75, /* * The effect of delay is indirect and non-linear and a huge amount of * future debt can accumulate abruptly while unthrottled. Linearly scale * up delay as debt is going up and then let it decay exponentially. * This gives us quick ramp ups while delay is accumulating and long * tails which can help reducing the frequency of debt explosions on * unthrottle. The parameters are experimentally determined. * * The delay mechanism provides adequate protection and behavior in many * cases. However, this is far from ideal and falls shorts on both * fronts. The debtors are often throttled too harshly costing a * significant level of fairness and possibly total work while the * protection against their impacts on the system can be choppy and * unreliable. * * The shortcoming primarily stems from the fact that, unlike for page * cache, the kernel doesn't have well-defined back-pressure propagation * mechanism and policies for anonymous memory. Fully addressing this * issue will likely require substantial improvements in the area. */ MIN_DELAY_THR_PCT = 500, MAX_DELAY_THR_PCT = 25000, MIN_DELAY = 250, MAX_DELAY = 250 * USEC_PER_MSEC, /* halve debts if avg usage over 100ms is under 50% */ DFGV_USAGE_PCT = 50, DFGV_PERIOD = 100 * USEC_PER_MSEC, /* don't let cmds which take a very long time pin lagging for too long */ MAX_LAGGING_PERIODS = 10, /* * Count IO size in 4k pages. The 12bit shift helps keeping * size-proportional components of cost calculation in closer * numbers of digits to per-IO cost components. */ IOC_PAGE_SHIFT = 12, IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT, IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT, /* if apart further than 16M, consider randio for linear model */ LCOEF_RANDIO_PAGES = 4096, }; enum ioc_running { IOC_IDLE, IOC_RUNNING, IOC_STOP, }; /* io.cost.qos controls including per-dev enable of the whole controller */ enum { QOS_ENABLE, QOS_CTRL, NR_QOS_CTRL_PARAMS, }; /* io.cost.qos params */ enum { QOS_RPPM, QOS_RLAT, QOS_WPPM, QOS_WLAT, QOS_MIN, QOS_MAX, NR_QOS_PARAMS, }; /* io.cost.model controls */ enum { COST_CTRL, COST_MODEL, NR_COST_CTRL_PARAMS, }; /* builtin linear cost model coefficients */ enum { I_LCOEF_RBPS, I_LCOEF_RSEQIOPS, I_LCOEF_RRANDIOPS, I_LCOEF_WBPS, I_LCOEF_WSEQIOPS, I_LCOEF_WRANDIOPS, NR_I_LCOEFS, }; enum { LCOEF_RPAGE, LCOEF_RSEQIO, LCOEF_RRANDIO, LCOEF_WPAGE, LCOEF_WSEQIO, LCOEF_WRANDIO, NR_LCOEFS, }; enum { AUTOP_INVALID, AUTOP_HDD, AUTOP_SSD_QD1, AUTOP_SSD_DFL, AUTOP_SSD_FAST, }; struct ioc_params { u32 qos[NR_QOS_PARAMS]; u64 i_lcoefs[NR_I_LCOEFS]; u64 lcoefs[NR_LCOEFS]; u32 too_fast_vrate_pct; u32 too_slow_vrate_pct; }; struct ioc_margins { s64 min; s64 low; s64 target; }; struct ioc_missed { local_t nr_met; local_t nr_missed; u32 last_met; u32 last_missed; }; struct ioc_pcpu_stat { struct ioc_missed missed[2]; local64_t rq_wait_ns; u64 last_rq_wait_ns; }; /* per device */ struct ioc { struct rq_qos rqos; bool enabled; struct ioc_params params; struct ioc_margins margins; u32 period_us; u32 timer_slack_ns; u64 vrate_min; u64 vrate_max; spinlock_t lock; struct timer_list timer; struct list_head active_iocgs; /* active cgroups */ struct ioc_pcpu_stat __percpu *pcpu_stat; enum ioc_running running; atomic64_t vtime_rate; u64 vtime_base_rate; s64 vtime_err; seqcount_spinlock_t period_seqcount; u64 period_at; /* wallclock starttime */ u64 period_at_vtime; /* vtime starttime */ atomic64_t cur_period; /* inc'd each period */ int busy_level; /* saturation history */ bool weights_updated; atomic_t hweight_gen; /* for lazy hweights */ /* debt forgivness */ u64 dfgv_period_at; u64 dfgv_period_rem; u64 dfgv_usage_us_sum; u64 autop_too_fast_at; u64 autop_too_slow_at; int autop_idx; bool user_qos_params:1; bool user_cost_model:1; }; struct iocg_pcpu_stat { local64_t abs_vusage; }; struct iocg_stat { u64 usage_us; u64 wait_us; u64 indebt_us; u64 indelay_us; }; /* per device-cgroup pair */ struct ioc_gq { struct blkg_policy_data pd; struct ioc *ioc; /* * A iocg can get its weight from two sources - an explicit * per-device-cgroup configuration or the default weight of the * cgroup. `cfg_weight` is the explicit per-device-cgroup * configuration. `weight` is the effective considering both * sources. * * When an idle cgroup becomes active its `active` goes from 0 to * `weight`. `inuse` is the surplus adjusted active weight. * `active` and `inuse` are used to calculate `hweight_active` and * `hweight_inuse`. * * `last_inuse` remembers `inuse` while an iocg is idle to persist * surplus adjustments. * * `inuse` may be adjusted dynamically during period. `saved_*` are used * to determine and track adjustments. */ u32 cfg_weight; u32 weight; u32 active; u32 inuse; u32 last_inuse; s64 saved_margin; sector_t cursor; /* to detect randio */ /* * `vtime` is this iocg's vtime cursor which progresses as IOs are * issued. If lagging behind device vtime, the delta represents * the currently available IO budget. If running ahead, the * overage. * * `vtime_done` is the same but progressed on completion rather * than issue. The delta behind `vtime` represents the cost of * currently in-flight IOs. */ atomic64_t vtime; atomic64_t done_vtime; u64 abs_vdebt; /* current delay in effect and when it started */ u64 delay; u64 delay_at; /* * The period this iocg was last active in. Used for deactivation * and invalidating `vtime`. */ atomic64_t active_period; struct list_head active_list; /* see __propagate_weights() and current_hweight() for details */ u64 child_active_sum; u64 child_inuse_sum; u64 child_adjusted_sum; int hweight_gen; u32 hweight_active; u32 hweight_inuse; u32 hweight_donating; u32 hweight_after_donation; struct list_head walk_list; struct list_head surplus_list; struct wait_queue_head waitq; struct hrtimer waitq_timer; /* timestamp at the latest activation */ u64 activated_at; /* statistics */ struct iocg_pcpu_stat __percpu *pcpu_stat; struct iocg_stat stat; struct iocg_stat last_stat; u64 last_stat_abs_vusage; u64 usage_delta_us; u64 wait_since; u64 indebt_since; u64 indelay_since; /* this iocg's depth in the hierarchy and ancestors including self */ int level; struct ioc_gq *ancestors[]; }; /* per cgroup */ struct ioc_cgrp { struct blkcg_policy_data cpd; unsigned int dfl_weight; }; struct ioc_now { u64 now_ns; u64 now; u64 vnow; }; struct iocg_wait { struct wait_queue_entry wait; struct bio *bio; u64 abs_cost; bool committed; }; struct iocg_wake_ctx { struct ioc_gq *iocg; u32 hw_inuse; s64 vbudget; }; static const struct ioc_params autop[] = { [AUTOP_HDD] = { .qos = { [QOS_RLAT] = 250000, /* 250ms */ [QOS_WLAT] = 250000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 174019176, [I_LCOEF_RSEQIOPS] = 41708, [I_LCOEF_RRANDIOPS] = 370, [I_LCOEF_WBPS] = 178075866, [I_LCOEF_WSEQIOPS] = 42705, [I_LCOEF_WRANDIOPS] = 378, }, }, [AUTOP_SSD_QD1] = { .qos = { [QOS_RLAT] = 25000, /* 25ms */ [QOS_WLAT] = 25000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 245855193, [I_LCOEF_RSEQIOPS] = 61575, [I_LCOEF_RRANDIOPS] = 6946, [I_LCOEF_WBPS] = 141365009, [I_LCOEF_WSEQIOPS] = 33716, [I_LCOEF_WRANDIOPS] = 26796, }, }, [AUTOP_SSD_DFL] = { .qos = { [QOS_RLAT] = 25000, /* 25ms */ [QOS_WLAT] = 25000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 488636629, [I_LCOEF_RSEQIOPS] = 8932, [I_LCOEF_RRANDIOPS] = 8518, [I_LCOEF_WBPS] = 427891549, [I_LCOEF_WSEQIOPS] = 28755, [I_LCOEF_WRANDIOPS] = 21940, }, .too_fast_vrate_pct = 500, }, [AUTOP_SSD_FAST] = { .qos = { [QOS_RLAT] = 5000, /* 5ms */ [QOS_WLAT] = 5000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 3102524156LLU, [I_LCOEF_RSEQIOPS] = 724816, [I_LCOEF_RRANDIOPS] = 778122, [I_LCOEF_WBPS] = 1742780862LLU, [I_LCOEF_WSEQIOPS] = 425702, [I_LCOEF_WRANDIOPS] = 443193, }, .too_slow_vrate_pct = 10, }, }; /* * vrate adjust percentages indexed by ioc->busy_level. We adjust up on * vtime credit shortage and down on device saturation. */ static const u32 vrate_adj_pct[] = { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 }; static struct blkcg_policy blkcg_policy_iocost; /* accessors and helpers */ static struct ioc *rqos_to_ioc(struct rq_qos *rqos) { return container_of(rqos, struct ioc, rqos); } static struct ioc *q_to_ioc(struct request_queue *q) { return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); } static const char __maybe_unused *ioc_name(struct ioc *ioc) { struct gendisk *disk = ioc->rqos.disk; if (!disk) return "<unknown>"; return disk->disk_name; } static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct ioc_gq, pd) : NULL; } static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg) { return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost)); } static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg) { return pd_to_blkg(&iocg->pd); } static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) { return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost), struct ioc_cgrp, cpd); } /* * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical * weight, the more expensive each IO. Must round up. */ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) { return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse); } /* * The inverse of abs_cost_to_cost(). Must round up. */ static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) { return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); } static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 abs_cost, u64 cost) { struct iocg_pcpu_stat *gcs; bio->bi_iocost_cost = cost; atomic64_add(cost, &iocg->vtime); gcs = get_cpu_ptr(iocg->pcpu_stat); local64_add(abs_cost, &gcs->abs_vusage); put_cpu_ptr(gcs); } static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags) { if (lock_ioc) { spin_lock_irqsave(&iocg->ioc->lock, *flags); spin_lock(&iocg->waitq.lock); } else { spin_lock_irqsave(&iocg->waitq.lock, *flags); } } static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags) { if (unlock_ioc) { spin_unlock(&iocg->waitq.lock); spin_unlock_irqrestore(&iocg->ioc->lock, *flags); } else { spin_unlock_irqrestore(&iocg->waitq.lock, *flags); } } #define CREATE_TRACE_POINTS #include <trace/events/iocost.h> static void ioc_refresh_margins(struct ioc *ioc) { struct ioc_margins *margins = &ioc->margins; u32 period_us = ioc->period_us; u64 vrate = ioc->vtime_base_rate; margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; } /* latency Qos params changed, update period_us and all the dependent params */ static void ioc_refresh_period_us(struct ioc *ioc) { u32 ppm, lat, multi, period_us; lockdep_assert_held(&ioc->lock); /* pick the higher latency target */ if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) { ppm = ioc->params.qos[QOS_RPPM]; lat = ioc->params.qos[QOS_RLAT]; } else { ppm = ioc->params.qos[QOS_WPPM]; lat = ioc->params.qos[QOS_WLAT]; } /* * We want the period to be long enough to contain a healthy number * of IOs while short enough for granular control. Define it as a * multiple of the latency target. Ideally, the multiplier should * be scaled according to the percentile so that it would nominally * contain a certain number of requests. Let's be simpler and * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50). */ if (ppm) multi = max_t(u32, (MILLION - ppm) / 50000, 2); else multi = 2; period_us = multi * lat; period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD); /* calculate dependent params */ ioc->period_us = period_us; ioc->timer_slack_ns = div64_u64( (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT, 100); ioc_refresh_margins(ioc); } /* * ioc->rqos.disk isn't initialized when this function is called from * the init path. */ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk) { int idx = ioc->autop_idx; const struct ioc_params *p = &autop[idx]; u32 vrate_pct; u64 now_ns; /* rotational? */ if (!blk_queue_nonrot(disk->queue)) return AUTOP_HDD; /* handle SATA SSDs w/ broken NCQ */ if (blk_queue_depth(disk->queue) == 1) return AUTOP_SSD_QD1; /* use one of the normal ssd sets */ if (idx < AUTOP_SSD_DFL) return AUTOP_SSD_DFL; /* if user is overriding anything, maintain what was there */ if (ioc->user_qos_params || ioc->user_cost_model) return idx; /* step up/down based on the vrate */ vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); now_ns = blk_time_get_ns(); if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { if (!ioc->autop_too_fast_at) ioc->autop_too_fast_at = now_ns; if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC) return idx + 1; } else { ioc->autop_too_fast_at = 0; } if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) { if (!ioc->autop_too_slow_at) ioc->autop_too_slow_at = now_ns; if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC) return idx - 1; } else { ioc->autop_too_slow_at = 0; } return idx; } /* * Take the followings as input * * @bps maximum sequential throughput * @seqiops maximum sequential 4k iops * @randiops maximum random 4k iops * * and calculate the linear model cost coefficients. * * *@page per-page cost 1s / (@bps / 4096) * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0) * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0) */ static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops, u64 *page, u64 *seqio, u64 *randio) { u64 v; *page = *seqio = *randio = 0; if (bps) { u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE); if (bps_pages) *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages); else *page = 1; } if (seqiops) { v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops); if (v > *page) *seqio = v - *page; } if (randiops) { v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops); if (v > *page) *randio = v - *page; } } static void ioc_refresh_lcoefs(struct ioc *ioc) { u64 *u = ioc->params.i_lcoefs; u64 *c = ioc->params.lcoefs; calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]); calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS], &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]); } /* * struct gendisk is required as an argument because ioc->rqos.disk * is not properly initialized when called from the init path. */ static bool ioc_refresh_params_disk(struct ioc *ioc, bool force, struct gendisk *disk) { const struct ioc_params *p; int idx; lockdep_assert_held(&ioc->lock); idx = ioc_autop_idx(ioc, disk); p = &autop[idx]; if (idx == ioc->autop_idx && !force) return false; if (idx != ioc->autop_idx) { atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); ioc->vtime_base_rate = VTIME_PER_USEC; } ioc->autop_idx = idx; ioc->autop_too_fast_at = 0; ioc->autop_too_slow_at = 0; if (!ioc->user_qos_params) memcpy(ioc->params.qos, p->qos, sizeof(p->qos)); if (!ioc->user_cost_model) memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs)); ioc_refresh_period_us(ioc); ioc_refresh_lcoefs(ioc); ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * VTIME_PER_USEC, MILLION); ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] * VTIME_PER_USEC, MILLION); return true; } static bool ioc_refresh_params(struct ioc *ioc, bool force) { return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk); } /* * When an iocg accumulates too much vtime or gets deactivated, we throw away * some vtime, which lowers the overall device utilization. As the exact amount * which is being thrown away is known, we can compensate by accelerating the * vrate accordingly so that the extra vtime generated in the current period * matches what got lost. */ static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) { s64 pleft = ioc->period_at + ioc->period_us - now->now; s64 vperiod = ioc->period_us * ioc->vtime_base_rate; s64 vcomp, vcomp_min, vcomp_max; lockdep_assert_held(&ioc->lock); /* we need some time left in this period */ if (pleft <= 0) goto done; /* * Calculate how much vrate should be adjusted to offset the error. * Limit the amount of adjustment and deduct the adjusted amount from * the error. */ vcomp = -div64_s64(ioc->vtime_err, pleft); vcomp_min = -(ioc->vtime_base_rate >> 1); vcomp_max = ioc->vtime_base_rate; vcomp = clamp(vcomp, vcomp_min, vcomp_max); ioc->vtime_err += vcomp * pleft; atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); done: /* bound how much error can accumulate */ ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); } static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, int nr_lagging, int nr_shortages, int prev_busy_level, u32 *missed_ppm) { u64 vrate = ioc->vtime_base_rate; u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { if (ioc->busy_level != prev_busy_level || nr_lagging) trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages); return; } /* * If vrate is out of bounds, apply clamp gradually as the * bounds can change abruptly. Otherwise, apply busy_level * based adjustment. */ if (vrate < vrate_min) { vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100); vrate = min(vrate, vrate_min); } else if (vrate > vrate_max) { vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); vrate = max(vrate, vrate_max); } else { int idx = min_t(int, abs(ioc->busy_level), ARRAY_SIZE(vrate_adj_pct) - 1); u32 adj_pct = vrate_adj_pct[idx]; if (ioc->busy_level > 0) adj_pct = 100 - adj_pct; else adj_pct = 100 + adj_pct; vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), vrate_min, vrate_max); } trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages); ioc->vtime_base_rate = vrate; ioc_refresh_margins(ioc); } /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { unsigned seq; u64 vrate; now->now_ns = blk_time_get_ns(); now->now = ktime_to_us(now->now_ns); vrate = atomic64_read(&ioc->vtime_rate); /* * The current vtime is * * vtime at period start + (wallclock time since the start) * vrate * * As a consistent snapshot of `period_at_vtime` and `period_at` is * needed, they're seqcount protected. */ do { seq = read_seqcount_begin(&ioc->period_seqcount); now->vnow = ioc->period_at_vtime + (now->now - ioc->period_at) * vrate; } while (read_seqcount_retry(&ioc->period_seqcount, seq)); } static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) { WARN_ON_ONCE(ioc->running != IOC_RUNNING); write_seqcount_begin(&ioc->period_seqcount); ioc->period_at = now->now; ioc->period_at_vtime = now->vnow; write_seqcount_end(&ioc->period_seqcount); ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us); add_timer(&ioc->timer); } /* * Update @iocg's `active` and `inuse` to @active and @inuse, update level * weight sums and propagate upwards accordingly. If @save, the current margin * is saved to be used as reference for later inuse in-period adjustments. */ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, bool save, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; int lvl; lockdep_assert_held(&ioc->lock); /* * For an active leaf node, its inuse shouldn't be zero or exceed * @active. An active internal node's inuse is solely determined by the * inuse to active ratio of its children regardless of @inuse. */ if (list_empty(&iocg->active_list) && iocg->child_active_sum) { inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, iocg->child_active_sum); } else { /* * It may be tempting to turn this into a clamp expression with * a lower limit of 1 but active may be 0, which cannot be used * as an upper limit in that situation. This expression allows * active to clamp inuse unless it is 0, in which case inuse * becomes 1. */ inuse = min(inuse, active) ?: 1; } iocg->last_inuse = iocg->inuse; if (save) iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); if (active == iocg->active && inuse == iocg->inuse) return; for (lvl = iocg->level - 1; lvl >= 0; lvl--) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; u32 parent_active = 0, parent_inuse = 0; /* update the level sums */ parent->child_active_sum += (s32)(active - child->active); parent->child_inuse_sum += (s32)(inuse - child->inuse); /* apply the updates */ child->active = active; child->inuse = inuse; /* * The delta between inuse and active sums indicates that * much of weight is being given away. Parent's inuse * and active should reflect the ratio. */ if (parent->child_active_sum) { parent_active = parent->weight; parent_inuse = DIV64_U64_ROUND_UP( parent_active * parent->child_inuse_sum, parent->child_active_sum); } /* do we need to keep walking up? */ if (parent_active == parent->active && parent_inuse == parent->inuse) break; active = parent_active; inuse = parent_inuse; } ioc->weights_updated = true; } static void commit_weights(struct ioc *ioc) { lockdep_assert_held(&ioc->lock); if (ioc->weights_updated) { /* paired with rmb in current_hweight(), see there */ smp_wmb(); atomic_inc(&ioc->hweight_gen); ioc->weights_updated = false; } } static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, bool save, struct ioc_now *now) { __propagate_weights(iocg, active, inuse, save, now); commit_weights(iocg->ioc); } static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep) { struct ioc *ioc = iocg->ioc; int lvl; u32 hwa, hwi; int ioc_gen; /* hot path - if uptodate, use cached */ ioc_gen = atomic_read(&ioc->hweight_gen); if (ioc_gen == iocg->hweight_gen) goto out; /* * Paired with wmb in commit_weights(). If we saw the updated * hweight_gen, all the weight updates from __propagate_weights() are * visible too. * * We can race with weight updates during calculation and get it * wrong. However, hweight_gen would have changed and a future * reader will recalculate and we're guaranteed to discard the * wrong result soon. */ smp_rmb(); hwa = hwi = WEIGHT_ONE; for (lvl = 0; lvl <= iocg->level - 1; lvl++) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; u64 active_sum = READ_ONCE(parent->child_active_sum); u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); u32 active = READ_ONCE(child->active); u32 inuse = READ_ONCE(child->inuse); /* we can race with deactivations and either may read as zero */ if (!active_sum || !inuse_sum) continue; active_sum = max_t(u64, active, active_sum); hwa = div64_u64((u64)hwa * active, active_sum); inuse_sum = max_t(u64, inuse, inuse_sum); hwi = div64_u64((u64)hwi * inuse, inuse_sum); } iocg->hweight_active = max_t(u32, hwa, 1); iocg->hweight_inuse = max_t(u32, hwi, 1); iocg->hweight_gen = ioc_gen; out: if (hw_activep) *hw_activep = iocg->hweight_active; if (hw_inusep) *hw_inusep = iocg->hweight_inuse; } /* * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the * other weights stay unchanged. */ static u32 current_hweight_max(struct ioc_gq *iocg) { u32 hwm = WEIGHT_ONE; u32 inuse = iocg->active; u64 child_inuse_sum; int lvl; lockdep_assert_held(&iocg->ioc->lock); for (lvl = iocg->level - 1; lvl >= 0; lvl--) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; hwm = div64_u64((u64)hwm * inuse, child_inuse_sum); inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, parent->child_active_sum); } return max_t(u32, hwm, 1); } static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg); u32 weight; lockdep_assert_held(&ioc->lock); weight = iocg->cfg_weight ?: iocc->dfl_weight; if (weight != iocg->weight && iocg->active) propagate_weights(iocg, weight, iocg->inuse, true, now); iocg->weight = weight; } static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 __maybe_unused last_period, cur_period; u64 vtime, vtarget; int i; /* * If seem to be already active, just update the stamp to tell the * timer that we're still active. We don't mind occassional races. */ if (!list_empty(&iocg->active_list)) { ioc_now(ioc, now); cur_period = atomic64_read(&ioc->cur_period); if (atomic64_read(&iocg->active_period) != cur_period) atomic64_set(&iocg->active_period, cur_period); return true; } /* racy check on internal node IOs, treat as root level IOs */ if (iocg->child_active_sum) return false; spin_lock_irq(&ioc->lock); ioc_now(ioc, now); /* update period */ cur_period = atomic64_read(&ioc->cur_period); last_period = atomic64_read(&iocg->active_period); atomic64_set(&iocg->active_period, cur_period); /* already activated or breaking leaf-only constraint? */ if (!list_empty(&iocg->active_list)) goto succeed_unlock; for (i = iocg->level - 1; i > 0; i--) if (!list_empty(&iocg->ancestors[i]->active_list)) goto fail_unlock; if (iocg->child_active_sum) goto fail_unlock; /* * Always start with the target budget. On deactivation, we throw away * anything above it. */ vtarget = now->vnow - ioc->margins.target; vtime = atomic64_read(&iocg->vtime); atomic64_add(vtarget - vtime, &iocg->vtime); atomic64_add(vtarget - vtime, &iocg->done_vtime); vtime = vtarget; /* * Activate, propagate weight and start period timer if not * running. Reset hweight_gen to avoid accidental match from * wrapping. */ iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; list_add(&iocg->active_list, &ioc->active_iocgs); propagate_weights(iocg, iocg->weight, iocg->last_inuse ?: iocg->weight, true, now); TRACE_IOCG_PATH(iocg_activate, iocg, now, last_period, cur_period, vtime); iocg->activated_at = now->now; if (ioc->running == IOC_IDLE) { ioc->running = IOC_RUNNING; ioc->dfgv_period_at = now->now; ioc->dfgv_period_rem = 0; ioc_start_period(ioc, now); } succeed_unlock: spin_unlock_irq(&ioc->lock); return true; fail_unlock: spin_unlock_irq(&ioc->lock); return false; } static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); u64 tdelta, delay, new_delay, shift; s64 vover, vover_pct; u32 hwa; lockdep_assert_held(&iocg->waitq.lock); /* * If the delay is set by another CPU, we may be in the past. No need to * change anything if so. This avoids decay calculation underflow. */ if (time_before64(now->now, iocg->delay_at)) return false; /* calculate the current delay in effect - 1/2 every second */ tdelta = now->now - iocg->delay_at; shift = div64_u64(tdelta, USEC_PER_SEC); if (iocg->delay && shift < BITS_PER_LONG) delay = iocg->delay >> shift; else delay = 0; /* calculate the new delay from the debt amount */ current_hweight(iocg, &hwa, NULL); vover = atomic64_read(&iocg->vtime) + abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; vover_pct = div64_s64(100 * vover, ioc->period_us * ioc->vtime_base_rate); if (vover_pct <= MIN_DELAY_THR_PCT) new_delay = 0; else if (vover_pct >= MAX_DELAY_THR_PCT) new_delay = MAX_DELAY; else new_delay = MIN_DELAY + div_u64((MAX_DELAY - MIN_DELAY) * (vover_pct - MIN_DELAY_THR_PCT), MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); /* pick the higher one and apply */ if (new_delay > delay) { iocg->delay = new_delay; iocg->delay_at = now->now; delay = new_delay; } if (delay >= MIN_DELAY) { if (!iocg->indelay_since) iocg->indelay_since = now->now; blkcg_set_delay(blkg, delay * NSEC_PER_USEC); return true; } else { if (iocg->indelay_since) { iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = 0; } iocg->delay = 0; blkcg_clear_delay(blkg); return false; } } static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost, struct ioc_now *now) { struct iocg_pcpu_stat *gcs; lockdep_assert_held(&iocg->ioc->lock); lockdep_assert_held(&iocg->waitq.lock); WARN_ON_ONCE(list_empty(&iocg->active_list)); /* * Once in debt, debt handling owns inuse. @iocg stays at the minimum * inuse donating all of it share to others until its debt is paid off. */ if (!iocg->abs_vdebt && abs_cost) { iocg->indebt_since = now->now; propagate_weights(iocg, iocg->active, 0, false, now); } iocg->abs_vdebt += abs_cost; gcs = get_cpu_ptr(iocg->pcpu_stat); local64_add(abs_cost, &gcs->abs_vusage); put_cpu_ptr(gcs); } static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, struct ioc_now *now) { lockdep_assert_held(&iocg->ioc->lock); lockdep_assert_held(&iocg->waitq.lock); /* * make sure that nobody messed with @iocg. Check iocg->pd.online * to avoid warn when removing blkcg or disk. */ WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online); WARN_ON_ONCE(iocg->inuse > 1); iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); /* if debt is paid in full, restore inuse */ if (!iocg->abs_vdebt) { iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = 0; propagate_weights(iocg, iocg->active, iocg->last_inuse, false, now); } } static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key) { struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); struct iocg_wake_ctx *ctx = key; u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); ctx->vbudget -= cost; if (ctx->vbudget < 0) return -1; iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); wait->committed = true; /* * autoremove_wake_function() removes the wait entry only when it * actually changed the task state. We want the wait always removed. * Remove explicitly and use default_wake_function(). Note that the * order of operations is important as finish_wait() tests whether * @wq_entry is removed without grabbing the lock. */ default_wake_function(wq_entry, mode, flags, key); list_del_init_careful(&wq_entry->entry); return 0; } /* * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in * addition to iocg->waitq.lock. */ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct iocg_wake_ctx ctx = { .iocg = iocg }; u64 vshortage, expires, oexpires; s64 vbudget; u32 hwa; lockdep_assert_held(&iocg->waitq.lock); current_hweight(iocg, &hwa, NULL); vbudget = now->vnow - atomic64_read(&iocg->vtime); /* pay off debt */ if (pay_debt && iocg->abs_vdebt && vbudget > 0) { u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa); u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); u64 vpay = abs_cost_to_cost(abs_vpay, hwa); lockdep_assert_held(&ioc->lock); atomic64_add(vpay, &iocg->vtime); atomic64_add(vpay, &iocg->done_vtime); iocg_pay_debt(iocg, abs_vpay, now); vbudget -= vpay; } if (iocg->abs_vdebt || iocg->delay) iocg_kick_delay(iocg, now); /* * Debt can still be outstanding if we haven't paid all yet or the * caller raced and called without @pay_debt. Shouldn't wake up waiters * under debt. Make sure @vbudget reflects the outstanding amount and is * not positive. */ if (iocg->abs_vdebt) { s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); vbudget = min_t(s64, 0, vbudget - vdebt); } /* * Wake up the ones which are due and see how much vtime we'll need for * the next one. As paying off debt restores hw_inuse, it must be read * after the above debt payment. */ ctx.vbudget = vbudget; current_hweight(iocg, NULL, &ctx.hw_inuse); __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); if (!waitqueue_active(&iocg->waitq)) { if (iocg->wait_since) { iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = 0; } return; } if (!iocg->wait_since) iocg->wait_since = now->now; if (WARN_ON_ONCE(ctx.vbudget >= 0)) return; /* determine next wakeup, add a timer margin to guarantee chunking */ vshortage = -ctx.vbudget; expires = now->now_ns + DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * NSEC_PER_USEC; expires += ioc->timer_slack_ns; /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); if (hrtimer_is_queued(&iocg->waitq_timer) && abs(oexpires - expires) <= ioc->timer_slack_ns) return; hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), ioc->timer_slack_ns, HRTIMER_MODE_ABS); } static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) { struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer); bool pay_debt = READ_ONCE(iocg->abs_vdebt); struct ioc_now now; unsigned long flags; ioc_now(iocg->ioc, &now); iocg_lock(iocg, pay_debt, &flags); iocg_kick_waitq(iocg, pay_debt, &now); iocg_unlock(iocg, pay_debt, &flags); return HRTIMER_NORESTART; } static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p) { u32 nr_met[2] = { }; u32 nr_missed[2] = { }; u64 rq_wait_ns = 0; int cpu, rw; for_each_online_cpu(cpu) { struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu); u64 this_rq_wait_ns; for (rw = READ; rw <= WRITE; rw++) { u32 this_met = local_read(&stat->missed[rw].nr_met); u32 this_missed = local_read(&stat->missed[rw].nr_missed); nr_met[rw] += this_met - stat->missed[rw].last_met; nr_missed[rw] += this_missed - stat->missed[rw].last_missed; stat->missed[rw].last_met = this_met; stat->missed[rw].last_missed = this_missed; } this_rq_wait_ns = local64_read(&stat->rq_wait_ns); rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; stat->last_rq_wait_ns = this_rq_wait_ns; } for (rw = READ; rw <= WRITE; rw++) { if (nr_met[rw] + nr_missed[rw]) missed_ppm_ar[rw] = DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION, nr_met[rw] + nr_missed[rw]); else missed_ppm_ar[rw] = 0; } *rq_wait_pct_p = div64_u64(rq_wait_ns * 100, ioc->period_us * NSEC_PER_USEC); } /* was iocg idle this period? */ static bool iocg_is_idle(struct ioc_gq *iocg) { struct ioc *ioc = iocg->ioc; /* did something get issued this period? */ if (atomic64_read(&iocg->active_period) == atomic64_read(&ioc->cur_period)) return false; /* is something in flight? */ if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime)) return false; return true; } /* * Call this function on the target leaf @iocg's to build pre-order traversal * list of all the ancestors in @inner_walk. The inner nodes are linked through * ->walk_list and the caller is responsible for dissolving the list after use. */ static void iocg_build_inner_walk(struct ioc_gq *iocg, struct list_head *inner_walk) { int lvl; WARN_ON_ONCE(!list_empty(&iocg->walk_list)); /* find the first ancestor which hasn't been visited yet */ for (lvl = iocg->level - 1; lvl >= 0; lvl--) { if (!list_empty(&iocg->ancestors[lvl]->walk_list)) break; } /* walk down and visit the inner nodes to get pre-order traversal */ while (++lvl <= iocg->level - 1) { struct ioc_gq *inner = iocg->ancestors[lvl]; /* record traversal order */ list_add_tail(&inner->walk_list, inner_walk); } } /* propagate the deltas to the parent */ static void iocg_flush_stat_upward(struct ioc_gq *iocg) { if (iocg->level > 0) { struct iocg_stat *parent_stat = &iocg->ancestors[iocg->level - 1]->stat; parent_stat->usage_us += iocg->stat.usage_us - iocg->last_stat.usage_us; parent_stat->wait_us += iocg->stat.wait_us - iocg->last_stat.wait_us; parent_stat->indebt_us += iocg->stat.indebt_us - iocg->last_stat.indebt_us; parent_stat->indelay_us += iocg->stat.indelay_us - iocg->last_stat.indelay_us; } iocg->last_stat = iocg->stat; } /* collect per-cpu counters and propagate the deltas to the parent */ static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 abs_vusage = 0; u64 vusage_delta; int cpu; lockdep_assert_held(&iocg->ioc->lock); /* collect per-cpu counters */ for_each_possible_cpu(cpu) { abs_vusage += local64_read( per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); } vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; iocg->last_stat_abs_vusage = abs_vusage; iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); iocg->stat.usage_us += iocg->usage_delta_us; iocg_flush_stat_upward(iocg); } /* get stat counters ready for reading on all active iocgs */ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) { LIST_HEAD(inner_walk); struct ioc_gq *iocg, *tiocg; /* flush leaves and build inner node walk list */ list_for_each_entry(iocg, target_iocgs, active_list) { iocg_flush_stat_leaf(iocg, now); iocg_build_inner_walk(iocg, &inner_walk); } /* keep flushing upwards by walking the inner list backwards */ list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { iocg_flush_stat_upward(iocg); list_del_init(&iocg->walk_list); } } /* * Determine what @iocg's hweight_inuse should be after donating unused * capacity. @hwm is the upper bound and used to signal no donation. This * function also throws away @iocg's excess budget. */ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm, u32 usage, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 vtime = atomic64_read(&iocg->vtime); s64 excess, delta, target, new_hwi; /* debt handling owns inuse for debtors */ if (iocg->abs_vdebt) return 1; /* see whether minimum margin requirement is met */ if (waitqueue_active(&iocg->waitq) || time_after64(vtime, now->vnow - ioc->margins.min)) return hwm; /* throw away excess above target */ excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { atomic64_add(excess, &iocg->vtime); atomic64_add(excess, &iocg->done_vtime); vtime += excess; ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); } /* * Let's say the distance between iocg's and device's vtimes as a * fraction of period duration is delta. Assuming that the iocg will * consume the usage determined above, we want to determine new_hwi so * that delta equals MARGIN_TARGET at the end of the next period. * * We need to execute usage worth of IOs while spending the sum of the * new budget (1 - MARGIN_TARGET) and the leftover from the last period * (delta): * * usage = (1 - MARGIN_TARGET + delta) * new_hwi * * Therefore, the new_hwi is: * * new_hwi = usage / (1 - MARGIN_TARGET + delta) */ delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), now->vnow - ioc->period_at_vtime); target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100; new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); return clamp_t(s64, new_hwi, 1, hwm); } /* * For work-conservation, an iocg which isn't using all of its share should * donate the leftover to other iocgs. There are two ways to achieve this - 1. * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight. * * #1 is mathematically simpler but has the drawback of requiring synchronous * global hweight_inuse updates when idle iocg's get activated or inuse weights * change due to donation snapbacks as it has the possibility of grossly * overshooting what's allowed by the model and vrate. * * #2 is inherently safe with local operations. The donating iocg can easily * snap back to higher weights when needed without worrying about impacts on * other nodes as the impacts will be inherently correct. This also makes idle * iocg activations safe. The only effect activations have is decreasing * hweight_inuse of others, the right solution to which is for those iocgs to * snap back to higher weights. * * So, we go with #2. The challenge is calculating how each donating iocg's * inuse should be adjusted to achieve the target donation amounts. This is done * using Andy's method described in the following pdf. * * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo * * Given the weights and target after-donation hweight_inuse values, Andy's * method determines how the proportional distribution should look like at each * sibling level to maintain the relative relationship between all non-donating * pairs. To roughly summarize, it divides the tree into donating and * non-donating parts, calculates global donation rate which is used to * determine the target hweight_inuse for each node, and then derives per-level * proportions. * * The following pdf shows that global distribution calculated this way can be * achieved by scaling inuse weights of donating leaves and propagating the * adjustments upwards proportionally. * * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE * * Combining the above two, we can determine how each leaf iocg's inuse should * be adjusted to achieve the target donation. * * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN * * The inline comments use symbols from the last pdf. * * b is the sum of the absolute budgets in the subtree. 1 for the root node. * f is the sum of the absolute budgets of non-donating nodes in the subtree. * t is the sum of the absolute budgets of donating nodes in the subtree. * w is the weight of the node. w = w_f + w_t * w_f is the non-donating portion of w. w_f = w * f / b * w_b is the donating portion of w. w_t = w * t / b * s is the sum of all sibling weights. s = Sum(w) for siblings * s_f and s_t are the non-donating and donating portions of s. * * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g. * w_pt is the donating portion of the parent's weight and w'_pt the same value * after adjustments. Subscript r denotes the root node's values. */ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) { LIST_HEAD(over_hwa); LIST_HEAD(inner_walk); struct ioc_gq *iocg, *tiocg, *root_iocg; u32 after_sum, over_sum, over_target, gamma; /* * It's pretty unlikely but possible for the total sum of * hweight_after_donation's to be higher than WEIGHT_ONE, which will * confuse the following calculations. If such condition is detected, * scale down everyone over its full share equally to keep the sum below * WEIGHT_ONE. */ after_sum = 0; over_sum = 0; list_for_each_entry(iocg, surpluses, surplus_list) { u32 hwa; current_hweight(iocg, &hwa, NULL); after_sum += iocg->hweight_after_donation; if (iocg->hweight_after_donation > hwa) { over_sum += iocg->hweight_after_donation; list_add(&iocg->walk_list, &over_hwa); } } if (after_sum >= WEIGHT_ONE) { /* * The delta should be deducted from the over_sum, calculate * target over_sum value. */ u32 over_delta = after_sum - (WEIGHT_ONE - 1); WARN_ON_ONCE(over_sum <= over_delta); over_target = over_sum - over_delta; } else { over_target = 0; } list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) { if (over_target) iocg->hweight_after_donation = div_u64((u64)iocg->hweight_after_donation * over_target, over_sum); list_del_init(&iocg->walk_list); } /* * Build pre-order inner node walk list and prepare for donation * adjustment calculations. */ list_for_each_entry(iocg, surpluses, surplus_list) { iocg_build_inner_walk(iocg, &inner_walk); } root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list); WARN_ON_ONCE(root_iocg->level > 0); list_for_each_entry(iocg, &inner_walk, walk_list) { iocg->child_adjusted_sum = 0; iocg->hweight_donating = 0; iocg->hweight_after_donation = 0; } /* * Propagate the donating budget (b_t) and after donation budget (b'_t) * up the hierarchy. */ list_for_each_entry(iocg, surpluses, surplus_list) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; parent->hweight_donating += iocg->hweight_donating; parent->hweight_after_donation += iocg->hweight_after_donation; } list_for_each_entry_reverse(iocg, &inner_walk, walk_list) { if (iocg->level > 0) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; parent->hweight_donating += iocg->hweight_donating; parent->hweight_after_donation += iocg->hweight_after_donation; } } /* * Calculate inner hwa's (b) and make sure the donation values are * within the accepted ranges as we're doing low res calculations with * roundups. */ list_for_each_entry(iocg, &inner_walk, walk_list) { if (iocg->level) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; iocg->hweight_active = DIV64_U64_ROUND_UP( (u64)parent->hweight_active * iocg->active, parent->child_active_sum); } iocg->hweight_donating = min(iocg->hweight_donating, iocg->hweight_active); iocg->hweight_after_donation = min(iocg->hweight_after_donation, iocg->hweight_donating - 1); if (WARN_ON_ONCE(iocg->hweight_active <= 1 || iocg->hweight_donating <= 1 || iocg->hweight_after_donation == 0)) { pr_warn("iocg: invalid donation weights in "); pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); pr_cont(": active=%u donating=%u after=%u\n", iocg->hweight_active, iocg->hweight_donating, iocg->hweight_after_donation); } } /* * Calculate the global donation rate (gamma) - the rate to adjust * non-donating budgets by. * * No need to use 64bit multiplication here as the first operand is * guaranteed to be smaller than WEIGHT_ONE (1<<16). * * We know that there are beneficiary nodes and the sum of the donating * hweights can't be whole; however, due to the round-ups during hweight * calculations, root_iocg->hweight_donating might still end up equal to * or greater than whole. Limit the range when calculating the divider. * * gamma = (1 - t_r') / (1 - t_r) */ gamma = DIV_ROUND_UP( (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1)); /* * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner * nodes. */ list_for_each_entry(iocg, &inner_walk, walk_list) { struct ioc_gq *parent; u32 inuse, wpt, wptp; u64 st, sf; if (iocg->level == 0) { /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */ iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), WEIGHT_ONE - iocg->hweight_after_donation); continue; } parent = iocg->ancestors[iocg->level - 1]; /* b' = gamma * b_f + b_t' */ iocg->hweight_inuse = DIV64_U64_ROUND_UP( (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), WEIGHT_ONE) + iocg->hweight_after_donation; /* w' = s' * b' / b'_p */ inuse = DIV64_U64_ROUND_UP( (u64)parent->child_adjusted_sum * iocg->hweight_inuse, parent->hweight_inuse); /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */ st = DIV64_U64_ROUND_UP( iocg->child_active_sum * iocg->hweight_donating, iocg->hweight_active); sf = iocg->child_active_sum - st; wpt = DIV64_U64_ROUND_UP( (u64)iocg->active * iocg->hweight_donating, iocg->hweight_active); wptp = DIV64_U64_ROUND_UP( (u64)inuse * iocg->hweight_after_donation, iocg->hweight_inuse); iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); } /* * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and * we can finally determine leaf adjustments. */ list_for_each_entry(iocg, surpluses, surplus_list) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; u32 inuse; /* * In-debt iocgs participated in the donation calculation with * the minimum target hweight_inuse. Configuring inuse * accordingly would work fine but debt handling expects * @iocg->inuse stay at the minimum and we don't wanna * interfere. */ if (iocg->abs_vdebt) { WARN_ON_ONCE(iocg->inuse > 1); continue; } /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */ inuse = DIV64_U64_ROUND_UP( parent->child_adjusted_sum * iocg->hweight_after_donation, parent->hweight_inuse); TRACE_IOCG_PATH(inuse_transfer, iocg, now, iocg->inuse, inuse, iocg->hweight_inuse, iocg->hweight_after_donation); __propagate_weights(iocg, iocg->active, inuse, true, now); } /* walk list should be dissolved after use */ list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list) list_del_init(&iocg->walk_list); } /* * A low weight iocg can amass a large amount of debt, for example, when * anonymous memory gets reclaimed aggressively. If the system has a lot of * memory paired with a slow IO device, the debt can span multiple seconds or * more. If there are no other subsequent IO issuers, the in-debt iocg may end * up blocked paying its debt while the IO device is idle. * * The following protects against such cases. If the device has been * sufficiently idle for a while, the debts are halved and delays are * recalculated. */ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, struct ioc_now *now) { struct ioc_gq *iocg; u64 dur, usage_pct, nr_cycles, nr_cycles_shift; /* if no debtor, reset the cycle */ if (!nr_debtors) { ioc->dfgv_period_at = now->now; ioc->dfgv_period_rem = 0; ioc->dfgv_usage_us_sum = 0; return; } /* * Debtors can pass through a lot of writes choking the device and we * don't want to be forgiving debts while the device is struggling from * write bursts. If we're missing latency targets, consider the device * fully utilized. */ if (ioc->busy_level > 0) usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us); ioc->dfgv_usage_us_sum += usage_us_sum; if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD)) return; /* * At least DFGV_PERIOD has passed since the last period. Calculate the * average usage and reset the period counters. */ dur = now->now - ioc->dfgv_period_at; usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur); ioc->dfgv_period_at = now->now; ioc->dfgv_usage_us_sum = 0; /* if was too busy, reset everything */ if (usage_pct > DFGV_USAGE_PCT) { ioc->dfgv_period_rem = 0; return; } /* * Usage is lower than threshold. Let's forgive some debts. Debt * forgiveness runs off of the usual ioc timer but its period usually * doesn't match ioc's. Compensate the difference by performing the * reduction as many times as would fit in the duration since the last * run and carrying over the left-over duration in @ioc->dfgv_period_rem * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive * reductions is doubled. */ nr_cycles = dur + ioc->dfgv_period_rem; ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD); list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { u64 __maybe_unused old_debt, __maybe_unused old_delay; if (!iocg->abs_vdebt && !iocg->delay) continue; spin_lock(&iocg->waitq.lock); old_debt = iocg->abs_vdebt; old_delay = iocg->delay; nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1); if (iocg->abs_vdebt) iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1; if (iocg->delay) iocg->delay = iocg->delay >> nr_cycles_shift ?: 1; iocg_kick_waitq(iocg, true, now); TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct, old_debt, iocg->abs_vdebt, old_delay, iocg->delay); spin_unlock(&iocg->waitq.lock); } } /* * Check the active iocgs' state to avoid oversleeping and deactive * idle iocgs. * * Since waiters determine the sleep durations based on the vrate * they saw at the time of sleep, if vrate has increased, some * waiters could be sleeping for too long. Wake up tardy waiters * which should have woken up in the last period and expire idle * iocgs. */ static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) { int nr_debtors = 0; struct ioc_gq *iocg, *tiocg; list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && !iocg->delay && !iocg_is_idle(iocg)) continue; spin_lock(&iocg->waitq.lock); /* flush wait and indebt stat deltas */ if (iocg->wait_since) { iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = now->now; } if (iocg->indebt_since) { iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = now->now; } if (iocg->indelay_since) { iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = now->now; } if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, true, now); if (iocg->abs_vdebt || iocg->delay) nr_debtors++; } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ u64 vtime = atomic64_read(&iocg->vtime); s64 excess; /* * @iocg has been inactive for a full duration and will * have a high budget. Account anything above target as * error and throw away. On reactivation, it'll start * with the target budget. */ excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { u32 old_hwi; current_hweight(iocg, NULL, &old_hwi); ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); } TRACE_IOCG_PATH(iocg_idle, iocg, now, atomic64_read(&iocg->active_period), atomic64_read(&ioc->cur_period), vtime); __propagate_weights(iocg, 0, 0, false, now); list_del_init(&iocg->active_list); } spin_unlock(&iocg->waitq.lock); } commit_weights(ioc); return nr_debtors; } static void ioc_timer_fn(struct timer_list *timer) { struct ioc *ioc = container_of(timer, struct ioc, timer); struct ioc_gq *iocg, *tiocg; struct ioc_now now; LIST_HEAD(surpluses); int nr_debtors, nr_shortages = 0, nr_lagging = 0; u64 usage_us_sum = 0; u32 ppm_rthr; u32 ppm_wthr; u32 missed_ppm[2], rq_wait_pct; u64 period_vtime; int prev_busy_level; /* how were the latencies during the period? */ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); /* take care of active iocgs */ spin_lock_irq(&ioc->lock); ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; ioc_now(ioc, &now); period_vtime = now.vnow - ioc->period_at_vtime; if (WARN_ON_ONCE(!period_vtime)) { spin_unlock_irq(&ioc->lock); return; } nr_debtors = ioc_check_iocgs(ioc, &now); /* * Wait and indebt stat are flushed above and the donation calculation * below needs updated usage stat. Let's bring stat up-to-date. */ iocg_flush_stat(&ioc->active_iocgs, &now); /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { u64 vdone, vtime, usage_us; u32 hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent * iocgs from accumulating a large amount of budget. */ vdone = atomic64_read(&iocg->done_vtime); vtime = atomic64_read(&iocg->vtime); current_hweight(iocg, &hw_active, &hw_inuse); /* * Latency QoS detection doesn't account for IOs which are * in-flight for longer than a period. Detect them by * comparing vdone against period start. If lagging behind * IOs from past periods, don't increase vrate. */ if ((ppm_rthr != MILLION || ppm_wthr != MILLION) && !atomic_read(&iocg_to_blkg(iocg)->use_delay) && time_after64(vtime, vdone) && time_after64(vtime, now.vnow - MAX_LAGGING_PERIODS * period_vtime) && time_before64(vdone, now.vnow - period_vtime)) nr_lagging++; /* * Determine absolute usage factoring in in-flight IOs to avoid * high-latency completions appearing as idle. */ usage_us = iocg->usage_delta_us; usage_us_sum += usage_us; /* see whether there's surplus vtime */ WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.low))) { u32 hwa, old_hwi, hwm, new_hwi, usage; u64 usage_dur; if (vdone != vtime) { u64 inflight_us = DIV64_U64_ROUND_UP( cost_to_abs_cost(vtime - vdone, hw_inuse), ioc->vtime_base_rate); usage_us = max(usage_us, inflight_us); } /* convert to hweight based usage ratio */ if (time_after64(iocg->activated_at, ioc->period_at)) usage_dur = max_t(u64, now.now - iocg->activated_at, 1); else usage_dur = max_t(u64, now.now - ioc->period_at, 1); usage = clamp_t(u32, DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, usage_dur), 1, WEIGHT_ONE); /* * Already donating or accumulated enough to start. * Determine the donation amount. */ current_hweight(iocg, &hwa, &old_hwi); hwm = current_hweight_max(iocg); new_hwi = hweight_after_donation(iocg, old_hwi, hwm, usage, &now); /* * Donation calculation assumes hweight_after_donation * to be positive, a condition that a donor w/ hwa < 2 * can't meet. Don't bother with donation if hwa is * below 2. It's not gonna make a meaningful difference * anyway. */ if (new_hwi < hwm && hwa >= 2) { iocg->hweight_donating = hwa; iocg->hweight_after_donation = new_hwi; list_add(&iocg->surplus_list, &surpluses); } else if (!iocg->abs_vdebt) { /* * @iocg doesn't have enough to donate. Reset * its inuse to active. * * Don't reset debtors as their inuse's are * owned by debt handling. This shouldn't affect * donation calculuation in any meaningful way * as @iocg doesn't have a meaningful amount of * share anyway. */ TRACE_IOCG_PATH(inuse_shortage, iocg, &now, iocg->inuse, iocg->active, iocg->hweight_inuse, new_hwi); __propagate_weights(iocg, iocg->active, iocg->active, true, &now); nr_shortages++; } } else { /* genuinely short on vtime */ nr_shortages++; } } if (!list_empty(&surpluses) && nr_shortages) transfer_surpluses(&surpluses, &now); commit_weights(ioc); /* surplus list should be dissolved after use */ list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) list_del_init(&iocg->surplus_list); /* * If q is getting clogged or we're missing too much, we're issuing * too much IO and should lower vtime rate. If we're not missing * and experiencing shortages but not surpluses, we're too stingy * and should increase vtime rate. */ prev_busy_level = ioc->busy_level; if (rq_wait_pct > RQ_WAIT_BUSY_PCT || missed_ppm[READ] > ppm_rthr || missed_ppm[WRITE] > ppm_wthr) { /* clearly missing QoS targets, slow down vrate */ ioc->busy_level = max(ioc->busy_level, 0); ioc->busy_level++; } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 && missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 && missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) { /* QoS targets are being met with >25% margin */ if (nr_shortages) { /* * We're throttling while the device has spare * capacity. If vrate was being slowed down, stop. */ ioc->busy_level = min(ioc->busy_level, 0); /* * If there are IOs spanning multiple periods, wait * them out before pushing the device harder. */ if (!nr_lagging) ioc->busy_level--; } else { /* * Nobody is being throttled and the users aren't * issuing enough IOs to saturate the device. We * simply don't know how close the device is to * saturation. Coast. */ ioc->busy_level = 0; } } else { /* inside the hysterisis margin, we're good */ ioc->busy_level = 0; } ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages, prev_busy_level, missed_ppm); ioc_refresh_params(ioc, false); ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now); /* * This period is done. Move onto the next one. If nothing's * going on with the device, stop the timer. */ atomic64_inc(&ioc->cur_period); if (ioc->running != IOC_STOP) { if (!list_empty(&ioc->active_iocgs)) { ioc_start_period(ioc, &now); } else { ioc->busy_level = 0; ioc->vtime_err = 0; ioc->running = IOC_IDLE; } ioc_refresh_vrate(ioc, &now); } spin_unlock_irq(&ioc->lock); } static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, u64 abs_cost, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct ioc_margins *margins = &ioc->margins; u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; u32 hwi, adj_step; s64 margin; u64 cost, new_inuse; unsigned long flags; current_hweight(iocg, NULL, &hwi); old_hwi = hwi; cost = abs_cost_to_cost(abs_cost, hwi); margin = now->vnow - vtime - cost; /* debt handling owns inuse for debtors */ if (iocg->abs_vdebt) return cost; /* * We only increase inuse during period and do so if the margin has * deteriorated since the previous adjustment. */ if (margin >= iocg->saved_margin || margin >= margins->low || iocg->inuse == iocg->active) return cost; spin_lock_irqsave(&ioc->lock, flags); /* we own inuse only when @iocg is in the normal active state */ if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { spin_unlock_irqrestore(&ioc->lock, flags); return cost; } /* * Bump up inuse till @abs_cost fits in the existing budget. * adj_step must be determined after acquiring ioc->lock - we might * have raced and lost to another thread for activation and could * be reading 0 iocg->active before ioc->lock which will lead to * infinite loop. */ new_inuse = iocg->inuse; adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); do { new_inuse = new_inuse + adj_step; propagate_weights(iocg, iocg->active, new_inuse, true, now); current_hweight(iocg, NULL, &hwi); cost = abs_cost_to_cost(abs_cost, hwi); } while (time_after64(vtime + cost, now->vnow) && iocg->inuse != iocg->active); spin_unlock_irqrestore(&ioc->lock, flags); TRACE_IOCG_PATH(inuse_adjust, iocg, now, old_inuse, iocg->inuse, old_hwi, hwi); return cost; } static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, bool is_merge, u64 *costp) { struct ioc *ioc = iocg->ioc; u64 coef_seqio, coef_randio, coef_page; u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1); u64 seek_pages = 0; u64 cost = 0; /* Can't calculate cost for empty bio */ if (!bio->bi_iter.bi_size) goto out; switch (bio_op(bio)) { case REQ_OP_READ: coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO]; coef_page = ioc->params.lcoefs[LCOEF_RPAGE]; break; case REQ_OP_WRITE: coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO]; coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO]; coef_page = ioc->params.lcoefs[LCOEF_WPAGE]; break; default: goto out; } if (iocg->cursor) { seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor); seek_pages >>= IOC_SECT_TO_PAGE_SHIFT; } if (!is_merge) { if (seek_pages > LCOEF_RANDIO_PAGES) { cost += coef_randio; } else { cost += coef_seqio; } } cost += pages * coef_page; out: *costp = cost; } static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge) { u64 cost; calc_vtime_cost_builtin(bio, iocg, is_merge, &cost); return cost; } static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc, u64 *costp) { unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT; switch (req_op(rq)) { case REQ_OP_READ: *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE]; break; case REQ_OP_WRITE: *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE]; break; default: *costp = 0; } } static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc) { u64 cost; calc_size_vtime_cost_builtin(rq, ioc, &cost); return cost; } static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; struct ioc *ioc = rqos_to_ioc(rqos); struct ioc_gq *iocg = blkg_to_iocg(blkg); struct ioc_now now; struct iocg_wait wait; u64 abs_cost, cost, vtime; bool use_debt, ioc_locked; unsigned long flags; /* bypass IOs if disabled, still initializing, or for root cgroup */ if (!ioc->enabled || !iocg || !iocg->level) return; /* calculate the absolute vtime cost */ abs_cost = calc_vtime_cost(bio, iocg, false); if (!abs_cost) return; if (!iocg_activate(iocg, &now)) return; iocg->cursor = bio_end_sector(bio); vtime = atomic64_read(&iocg->vtime); cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* * If no one's waiting and within budget, issue right away. The * tests are racy but the races aren't systemic - we only miss once * in a while which is fine. */ if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && time_before_eq64(vtime + cost, now.vnow)) { iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* * We're over budget. This can be handled in two ways. IOs which may * cause priority inversions are punted to @ioc->aux_iocg and charged as * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine * whether debt handling is needed and acquire locks accordingly. */ use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current); ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); retry_lock: iocg_lock(iocg, ioc_locked, &flags); /* * @iocg must stay activated for debt and waitq handling. Deactivation * is synchronized against both ioc->lock and waitq.lock and we won't * get deactivated as long as we're waiting or has debt, so we're good * if we're activated here. In the unlikely cases that we aren't, just * issue the IO. */ if (unlikely(list_empty(&iocg->active_list))) { iocg_unlock(iocg, ioc_locked, &flags); iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* * We're over budget. If @bio has to be issued regardless, remember * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay * off the debt before waking more IOs. * * This way, the debt is continuously paid off each period with the * actual budget available to the cgroup. If we just wound vtime, we * would incorrectly use the current hw_inuse for the entire amount * which, for example, can lead to the cgroup staying blocked for a * long time even with substantially raised hw_inuse. * * An iocg with vdebt should stay online so that the timer can keep * deducting its vdebt and [de]activate use_delay mechanism * accordingly. We don't want to race against the timer trying to * clear them and leave @iocg inactive w/ dangling use_delay heavily * penalizing the cgroup and its descendants. */ if (use_debt) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); iocg_unlock(iocg, ioc_locked, &flags); return; } /* guarantee that iocgs w/ waiters have maximum inuse */ if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { if (!ioc_locked) { iocg_unlock(iocg, false, &flags); ioc_locked = true; goto retry_lock; } propagate_weights(iocg, iocg->active, iocg->active, true, &now); } /* * Append self to the waitq and schedule the wakeup timer if we're * the first waiter. The timer duration is calculated based on the * current vrate. vtime and hweight changes can make it too short * or too long. Each wait entry records the absolute cost it's * waiting for to allow re-evaluation using a custom wait entry. * * If too short, the timer simply reschedules itself. If too long, * the period timer will notice and trigger wakeups. * * All waiters are on iocg->waitq and the wait states are * synchronized using waitq.lock. */ init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); wait.wait.private = current; wait.bio = bio; wait.abs_cost = abs_cost; wait.committed = false; /* will be set true by waker */ __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); iocg_kick_waitq(iocg, ioc_locked, &now); iocg_unlock(iocg, ioc_locked, &flags); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); if (wait.committed) break; io_schedule(); } /* waker already committed us, proceed */ finish_wait(&iocg->waitq, &wait.wait); } static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); struct ioc *ioc = rqos_to_ioc(rqos); sector_t bio_end = bio_end_sector(bio); struct ioc_now now; u64 vtime, abs_cost, cost; unsigned long flags; /* bypass if disabled, still initializing, or for root cgroup */ if (!ioc->enabled || !iocg || !iocg->level) return; abs_cost = calc_vtime_cost(bio, iocg, true); if (!abs_cost) return; ioc_now(ioc, &now); vtime = atomic64_read(&iocg->vtime); cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* update cursor if backmerging into the request at the cursor */ if (blk_rq_pos(rq) < bio_end && blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor) iocg->cursor = bio_end; /* * Charge if there's enough vtime budget and the existing request has * cost assigned. */ if (rq->bio && rq->bio->bi_iocost_cost && time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* * Otherwise, account it as debt if @iocg is online, which it should * be for the vast majority of cases. See debt handling in * ioc_rqos_throttle() for details. */ spin_lock_irqsave(&ioc->lock, flags); spin_lock(&iocg->waitq.lock); if (likely(!list_empty(&iocg->active_list))) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { iocg_commit_bio(iocg, bio, abs_cost, cost); } spin_unlock(&iocg->waitq.lock); spin_unlock_irqrestore(&ioc->lock, flags); } static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) { struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); if (iocg && bio->bi_iocost_cost) atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime); } static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) { struct ioc *ioc = rqos_to_ioc(rqos); struct ioc_pcpu_stat *ccs; u64 on_q_ns, rq_wait_ns, size_nsec; int pidx, rw; if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) return; switch (req_op(rq)) { case REQ_OP_READ: pidx = QOS_RLAT; rw = READ; break; case REQ_OP_WRITE: pidx = QOS_WLAT; rw = WRITE; break; default: return; } on_q_ns = blk_time_get_ns() - rq->alloc_time_ns; rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); ccs = get_cpu_ptr(ioc->pcpu_stat); if (on_q_ns <= size_nsec || on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) local_inc(&ccs->missed[rw].nr_met); else local_inc(&ccs->missed[rw].nr_missed); local64_add(rq_wait_ns, &ccs->rq_wait_ns); put_cpu_ptr(ccs); } static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos) { struct ioc *ioc = rqos_to_ioc(rqos); spin_lock_irq(&ioc->lock); ioc_refresh_params(ioc, false); spin_unlock_irq(&ioc->lock); } static void ioc_rqos_exit(struct rq_qos *rqos) { struct ioc *ioc = rqos_to_ioc(rqos); blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost); spin_lock_irq(&ioc->lock); ioc->running = IOC_STOP; spin_unlock_irq(&ioc->lock); timer_shutdown_sync(&ioc->timer); free_percpu(ioc->pcpu_stat); kfree(ioc); } static const struct rq_qos_ops ioc_rqos_ops = { .throttle = ioc_rqos_throttle, .merge = ioc_rqos_merge, .done_bio = ioc_rqos_done_bio, .done = ioc_rqos_done, .queue_depth_changed = ioc_rqos_queue_depth_changed, .exit = ioc_rqos_exit, }; static int blk_iocost_init(struct gendisk *disk) { struct ioc *ioc; int i, cpu, ret; ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); if (!ioc) return -ENOMEM; ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat); if (!ioc->pcpu_stat) { kfree(ioc); return -ENOMEM; } for_each_possible_cpu(cpu) { struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { local_set(&ccs->missed[i].nr_met, 0); local_set(&ccs->missed[i].nr_missed, 0); } local64_set(&ccs->rq_wait_ns, 0); } spin_lock_init(&ioc->lock); timer_setup(&ioc->timer, ioc_timer_fn, 0); INIT_LIST_HEAD(&ioc->active_iocgs); ioc->running = IOC_IDLE; ioc->vtime_base_rate = VTIME_PER_USEC; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); ioc->period_at = ktime_to_us(blk_time_get()); atomic64_set(&ioc->cur_period, 0); atomic_set(&ioc->hweight_gen, 0); spin_lock_irq(&ioc->lock); ioc->autop_idx = AUTOP_INVALID; ioc_refresh_params_disk(ioc, true, disk); spin_unlock_irq(&ioc->lock); /* * rqos must be added before activation to allow ioc_pd_init() to * lookup the ioc from q. This means that the rqos methods may get * called before policy activation completion, can't assume that the * target bio has an iocg associated and need to test for NULL iocg. */ ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops); if (ret) goto err_free_ioc; ret = blkcg_activate_policy(disk, &blkcg_policy_iocost); if (ret) goto err_del_qos; return 0; err_del_qos: rq_qos_del(&ioc->rqos); err_free_ioc: free_percpu(ioc->pcpu_stat); kfree(ioc); return ret; } static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) { struct ioc_cgrp *iocc; iocc = kzalloc(sizeof(struct ioc_cgrp), gfp); if (!iocc) return NULL; iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; return &iocc->cpd; } static void ioc_cpd_free(struct blkcg_policy_data *cpd) { kfree(container_of(cpd, struct ioc_cgrp, cpd)); } static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { int levels = blkcg->css.cgroup->level + 1; struct ioc_gq *iocg; iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, disk->node_id); if (!iocg) return NULL; iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); if (!iocg->pcpu_stat) { kfree(iocg); return NULL; } return &iocg->pd; } static void ioc_pd_init(struct blkg_policy_data *pd) { struct ioc_gq *iocg = pd_to_iocg(pd); struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd); struct ioc *ioc = q_to_ioc(blkg->q); struct ioc_now now; struct blkcg_gq *tblkg; unsigned long flags; ioc_now(ioc, &now); iocg->ioc = ioc; atomic64_set(&iocg->vtime, now.vnow); atomic64_set(&iocg->done_vtime, now.vnow); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); INIT_LIST_HEAD(&iocg->walk_list); INIT_LIST_HEAD(&iocg->surplus_list); iocg->hweight_active = WEIGHT_ONE; iocg->hweight_inuse = WEIGHT_ONE; init_waitqueue_head(&iocg->waitq); hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); iocg->waitq_timer.function = iocg_waitq_timer_fn; iocg->level = blkg->blkcg->css.cgroup->level; for (tblkg = blkg; tblkg; tblkg = tblkg->parent) { struct ioc_gq *tiocg = blkg_to_iocg(tblkg); iocg->ancestors[tiocg->level] = tiocg; } spin_lock_irqsave(&ioc->lock, flags); weight_updated(iocg, &now); spin_unlock_irqrestore(&ioc->lock, flags); } static void ioc_pd_free(struct blkg_policy_data *pd) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; unsigned long flags; if (ioc) { spin_lock_irqsave(&ioc->lock, flags); if (!list_empty(&iocg->active_list)) { struct ioc_now now; ioc_now(ioc, &now); propagate_weights(iocg, 0, 0, false, &now); list_del_init(&iocg->active_list); } WARN_ON_ONCE(!list_empty(&iocg->walk_list)); WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); spin_unlock_irqrestore(&ioc->lock, flags); hrtimer_cancel(&iocg->waitq_timer); } free_percpu(iocg->pcpu_stat); kfree(iocg); } static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; if (!ioc->enabled) return; if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( ioc->vtime_base_rate * 10000, VTIME_PER_USEC); seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); } seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us); if (blkcg_debug_stats) seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", iocg->last_stat.wait_us, iocg->last_stat.indebt_us, iocg->last_stat.indelay_us); } static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { const char *dname = blkg_dev_name(pd->blkg); struct ioc_gq *iocg = pd_to_iocg(pd); if (dname && iocg->cfg_weight) seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); return 0; } static int ioc_weight_show(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; } static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); struct blkg_conf_ctx ctx; struct ioc_now now; struct ioc_gq *iocg; u32 v; int ret; if (!strchr(buf, ':')) { struct blkcg_gq *blkg; if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v)) return -EINVAL; if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) return -EINVAL; spin_lock_irq(&blkcg->lock); iocc->dfl_weight = v * WEIGHT_ONE; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct ioc_gq *iocg = blkg_to_iocg(blkg); if (iocg) { spin_lock(&iocg->ioc->lock); ioc_now(iocg->ioc, &now); weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); } } spin_unlock_irq(&blkcg->lock); return nbytes; } blkg_conf_init(&ctx, buf); ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx); if (ret) goto err; iocg = blkg_to_iocg(ctx.blkg); if (!strncmp(ctx.body, "default", 7)) { v = 0; } else { if (!sscanf(ctx.body, "%u", &v)) goto einval; if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) goto einval; } spin_lock(&iocg->ioc->lock); iocg->cfg_weight = v * WEIGHT_ONE; ioc_now(iocg->ioc, &now); weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); blkg_conf_exit(&ctx); return nbytes; einval: ret = -EINVAL; err: blkg_conf_exit(&ctx); return ret; } static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { const char *dname = blkg_dev_name(pd->blkg); struct ioc *ioc = pd_to_iocg(pd)->ioc; if (!dname) return 0; spin_lock(&ioc->lock); seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n", dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", ioc->params.qos[QOS_RPPM] / 10000, ioc->params.qos[QOS_RPPM] % 10000 / 100, ioc->params.qos[QOS_RLAT], ioc->params.qos[QOS_WPPM] / 10000, ioc->params.qos[QOS_WPPM] % 10000 / 100, ioc->params.qos[QOS_WLAT], ioc->params.qos[QOS_MIN] / 10000, ioc->params.qos[QOS_MIN] % 10000 / 100, ioc->params.qos[QOS_MAX] / 10000, ioc->params.qos[QOS_MAX] % 10000 / 100); spin_unlock(&ioc->lock); return 0; } static int ioc_qos_show(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; } static const match_table_t qos_ctrl_tokens = { { QOS_ENABLE, "enable=%u" }, { QOS_CTRL, "ctrl=%s" }, { NR_QOS_CTRL_PARAMS, NULL }, }; static const match_table_t qos_tokens = { { QOS_RPPM, "rpct=%s" }, { QOS_RLAT, "rlat=%u" }, { QOS_WPPM, "wpct=%s" }, { QOS_WLAT, "wlat=%u" }, { QOS_MIN, "min=%s" }, { QOS_MAX, "max=%s" }, { NR_QOS_PARAMS, NULL }, }; static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { struct blkg_conf_ctx ctx; struct gendisk *disk; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; char *body, *p; int ret; blkg_conf_init(&ctx, input); ret = blkg_conf_open_bdev(&ctx); if (ret) goto err; body = ctx.body; disk = ctx.bdev->bd_disk; if (!queue_is_mq(disk->queue)) { ret = -EOPNOTSUPP; goto err; } ioc = q_to_ioc(disk->queue); if (!ioc) { ret = blk_iocost_init(disk); if (ret) goto err; ioc = q_to_ioc(disk->queue); } blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); spin_lock_irq(&ioc->lock); memcpy(qos, ioc->params.qos, sizeof(qos)); enable = ioc->enabled; user = ioc->user_qos_params; while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; s64 v; if (!*p) continue; switch (match_token(p, qos_ctrl_tokens, args)) { case QOS_ENABLE: if (match_u64(&args[0], &v)) goto einval; enable = v; continue; case QOS_CTRL: match_strlcpy(buf, &args[0], sizeof(buf)); if (!strcmp(buf, "auto")) user = false; else if (!strcmp(buf, "user")) user = true; else goto einval; continue; } tok = match_token(p, qos_tokens, args); switch (tok) { case QOS_RPPM: case QOS_WPPM: if (match_strlcpy(buf, &args[0], sizeof(buf)) >= sizeof(buf)) goto einval; if (cgroup_parse_float(buf, 2, &v)) goto einval; if (v < 0 || v > 10000) goto einval; qos[tok] = v * 100; break; case QOS_RLAT: case QOS_WLAT: if (match_u64(&args[0], &v)) goto einval; qos[tok] = v; break; case QOS_MIN: case QOS_MAX: if (match_strlcpy(buf, &args[0], sizeof(buf)) >= sizeof(buf)) goto einval; if (cgroup_parse_float(buf, 2, &v)) goto einval; if (v < 0) goto einval; qos[tok] = clamp_t(s64, v * 100, VRATE_MIN_PPM, VRATE_MAX_PPM); break; default: goto einval; } user = true; } if (qos[QOS_MIN] > qos[QOS_MAX]) goto einval; if (enable && !ioc->enabled) { blk_stat_enable_accounting(disk->queue); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; } else if (!enable && ioc->enabled) { blk_stat_disable_accounting(disk->queue); blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; } if (user) { memcpy(ioc->params.qos, qos, sizeof(qos)); ioc->user_qos_params = true; } else { ioc->user_qos_params = false; } ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); if (enable) wbt_disable_default(disk); else wbt_enable_default(disk); blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue); blkg_conf_exit(&ctx); return nbytes; einval: spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue); ret = -EINVAL; err: blkg_conf_exit(&ctx); return ret; } static u64 ioc_cost_model_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { const char *dname = blkg_dev_name(pd->blkg); struct ioc *ioc = pd_to_iocg(pd)->ioc; u64 *u = ioc->params.i_lcoefs; if (!dname) return 0; spin_lock(&ioc->lock); seq_printf(sf, "%s ctrl=%s model=linear " "rbps=%llu rseqiops=%llu rrandiops=%llu " "wbps=%llu wseqiops=%llu wrandiops=%llu\n", dname, ioc->user_cost_model ? "user" : "auto", u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]); spin_unlock(&ioc->lock); return 0; } static int ioc_cost_model_show(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; } static const match_table_t cost_ctrl_tokens = { { COST_CTRL, "ctrl=%s" }, { COST_MODEL, "model=%s" }, { NR_COST_CTRL_PARAMS, NULL }, }; static const match_table_t i_lcoef_tokens = { { I_LCOEF_RBPS, "rbps=%u" }, { I_LCOEF_RSEQIOPS, "rseqiops=%u" }, { I_LCOEF_RRANDIOPS, "rrandiops=%u" }, { I_LCOEF_WBPS, "wbps=%u" }, { I_LCOEF_WSEQIOPS, "wseqiops=%u" }, { I_LCOEF_WRANDIOPS, "wrandiops=%u" }, { NR_I_LCOEFS, NULL }, }; static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { struct blkg_conf_ctx ctx; struct request_queue *q; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; char *body, *p; int ret; blkg_conf_init(&ctx, input); ret = blkg_conf_open_bdev(&ctx); if (ret) goto err; body = ctx.body; q = bdev_get_queue(ctx.bdev); if (!queue_is_mq(q)) { ret = -EOPNOTSUPP; goto err; } ioc = q_to_ioc(q); if (!ioc) { ret = blk_iocost_init(ctx.bdev->bd_disk); if (ret) goto err; ioc = q_to_ioc(q); } blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); spin_lock_irq(&ioc->lock); memcpy(u, ioc->params.i_lcoefs, sizeof(u)); user = ioc->user_cost_model; while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; u64 v; if (!*p) continue; switch (match_token(p, cost_ctrl_tokens, args)) { case COST_CTRL: match_strlcpy(buf, &args[0], sizeof(buf)); if (!strcmp(buf, "auto")) user = false; else if (!strcmp(buf, "user")) user = true; else goto einval; continue; case COST_MODEL: match_strlcpy(buf, &args[0], sizeof(buf)); if (strcmp(buf, "linear")) goto einval; continue; } tok = match_token(p, i_lcoef_tokens, args); if (tok == NR_I_LCOEFS) goto einval; if (match_u64(&args[0], &v)) goto einval; u[tok] = v; user = true; } if (user) { memcpy(ioc->params.i_lcoefs, u, sizeof(u)); ioc->user_cost_model = true; } else { ioc->user_cost_model = false; } ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); blkg_conf_exit(&ctx); return nbytes; einval: spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); ret = -EINVAL; err: blkg_conf_exit(&ctx); return ret; } static struct cftype ioc_files[] = { { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = ioc_weight_show, .write = ioc_weight_write, }, { .name = "cost.qos", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = ioc_qos_show, .write = ioc_qos_write, }, { .name = "cost.model", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = ioc_cost_model_show, .write = ioc_cost_model_write, }, {} }; static struct blkcg_policy blkcg_policy_iocost = { .dfl_cftypes = ioc_files, .cpd_alloc_fn = ioc_cpd_alloc, .cpd_free_fn = ioc_cpd_free, .pd_alloc_fn = ioc_pd_alloc, .pd_init_fn = ioc_pd_init, .pd_free_fn = ioc_pd_free, .pd_stat_fn = ioc_pd_stat, }; static int __init ioc_init(void) { return blkcg_policy_register(&blkcg_policy_iocost); } static void __exit ioc_exit(void) { blkcg_policy_unregister(&blkcg_policy_iocost); } module_init(ioc_init); module_exit(ioc_exit);
158 90 36 132 252 143 168 686 808 808 809 686 690 690 4 4 1046 582 350 4 253 235 137 89 71 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the IP module. * * Version: @(#)ip.h 1.0.2 05/07/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * * Changes: * Mike McLagan : Routing by source */ #ifndef _IP_H #define _IP_H #include <linux/types.h> #include <linux/ip.h> #include <linux/in.h> #include <linux/skbuff.h> #include <linux/jhash.h> #include <linux/sockptr.h> #include <linux/static_key.h> #include <net/inet_sock.h> #include <net/route.h> #include <net/snmp.h> #include <net/flow.h> #include <net/flow_dissector.h> #include <net/netns/hash.h> #include <net/lwtunnel.h> #include <net/inet_dscp.h> #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ #define IPV4_MIN_MTU 68 /* RFC 791 */ extern unsigned int sysctl_fib_sync_mem; extern unsigned int sysctl_fib_sync_mem_min; extern unsigned int sysctl_fib_sync_mem_max; struct sock; struct inet_skb_parm { int iif; struct ip_options opt; /* Compiled IP options */ u16 flags; #define IPSKB_FORWARDED BIT(0) #define IPSKB_XFRM_TUNNEL_SIZE BIT(1) #define IPSKB_XFRM_TRANSFORMED BIT(2) #define IPSKB_FRAG_COMPLETE BIT(3) #define IPSKB_REROUTED BIT(4) #define IPSKB_DOREDIRECT BIT(5) #define IPSKB_FRAG_PMTU BIT(6) #define IPSKB_L3SLAVE BIT(7) #define IPSKB_NOPOLICY BIT(8) #define IPSKB_MULTIPATH BIT(9) u16 frag_max_size; }; static inline bool ipv4_l3mdev_skb(u16 flags) { return !!(flags & IPSKB_L3SLAVE); } static inline unsigned int ip_hdrlen(const struct sk_buff *skb) { return ip_hdr(skb)->ihl * 4; } struct ipcm_cookie { struct sockcm_cookie sockc; __be32 addr; int oif; struct ip_options_rcu *opt; __u8 protocol; __u8 ttl; __s16 tos; __u16 gso_size; }; static inline void ipcm_init(struct ipcm_cookie *ipcm) { *ipcm = (struct ipcm_cookie) { .tos = -1 }; } static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, const struct inet_sock *inet) { ipcm_init(ipcm); ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); ipcm->sockc.priority = READ_ONCE(inet->sk.sk_priority); ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags); ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; ipcm->protocol = inet->inet_num; } #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) #define PKTINFO_SKB_CB(skb) ((struct in_pktinfo *)((skb)->cb)) /* return enslaved device index if relevant */ static inline int inet_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) return IPCB(skb)->iif; #endif return 0; } /* Special input handler for packets caught by router alert option. They are selected only by protocol field, and then processed likely local ones; but only if someone wants them! Otherwise, router not running rsvpd will kill RSVP. It is user level problem, what it will make with them. I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), but receiver should be enough clever f.e. to forward mtrace requests, sent to multicast group to reach destination designated router. */ struct ip_ra_chain { struct ip_ra_chain __rcu *next; struct sock *sk; union { void (*destructor)(struct sock *); struct sock *saved_sk; }; struct rcu_head rcu; }; /* IP flags. */ #define IP_CE 0x8000 /* Flag: "Congestion" */ #define IP_DF 0x4000 /* Flag: "Don't Fragment" */ #define IP_MF 0x2000 /* Flag: "More Fragments" */ #define IP_OFFSET 0x1FFF /* "Fragment Offset" part */ #define IP_FRAG_TIME (30 * HZ) /* fragment lifetime */ struct msghdr; struct net_device; struct packet_type; struct rtable; struct sockaddr; int igmp_mc_init(void); /* * Functions provided by ip.c */ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt, u8 tos); int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev); int ip_local_deliver(struct sk_buff *skb); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto); int ip_mr_input(struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); struct ip_fraglist_iter { struct sk_buff *frag; struct iphdr *iph; int offset; unsigned int hlen; }; void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, unsigned int hlen, struct ip_fraglist_iter *iter); void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter); static inline struct sk_buff *ip_fraglist_next(struct ip_fraglist_iter *iter) { struct sk_buff *skb = iter->frag; iter->frag = skb->next; skb_mark_not_on_list(skb); return skb; } struct ip_frag_state { bool DF; unsigned int hlen; unsigned int ll_rs; unsigned int mtu; unsigned int left; int offset; int ptr; __be16 not_last_frag; }; void ip_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int ll_rs, unsigned int mtu, bool DF, struct ip_frag_state *state); struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state); void ip_send_check(struct iphdr *ip); int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, __u8 tos); void ip_init(void); int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int len, int protolen, struct ipcm_cookie *ipc, struct rtable **rt, unsigned int flags); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb); struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork); int ip_send_skb(struct net *net, struct sk_buff *skb); int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4); void ip_flush_pending_frames(struct sock *sk); struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, struct inet_cork *cork, unsigned int flags); int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4) { return __ip_make_skb(sk, fl4, &sk->sk_write_queue, &inet_sk(sk)->cork.base); } /* Get the route scope that should be used when sending a packet. */ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet, const struct ipcm_cookie *ipc, const struct msghdr *msg) { if (sock_flag(&inet->sk, SOCK_LOCALROUTE) || msg->msg_flags & MSG_DONTROUTE || (ipc->opt && ipc->opt->opt.is_strictroute)) return RT_SCOPE_LINK; return RT_SCOPE_UNIVERSE; } static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet) { u8 dsfield = ipc->tos != -1 ? ipc->tos : READ_ONCE(inet->tos); return dsfield & INET_DSCP_MASK; } /* datagram.c */ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); void ip4_datagram_release_cb(struct sock *sk); struct ip_reply_arg { struct kvec iov[1]; int flags; __wsum csum; int csumoffset; /* u16 offset of csum in iov[0].iov_base */ /* -1 if not needed */ int bound_dev_if; u8 tos; kuid_t uid; }; #define IP_REPLY_ARG_NOSRCCHECK 1 static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) { return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, unsigned int len, u64 transmit_time, u32 txhash); #define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field) #define __IP_INC_STATS(net, field) __SNMP_INC_STATS64((net)->mib.ip_statistics, field) #define IP_ADD_STATS(net, field, val) SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val) #define __IP_ADD_STATS(net, field, val) __SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val) #define IP_UPD_PO_STATS(net, field, val) SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val) #define __IP_UPD_PO_STATS(net, field, val) __SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val) #define NET_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.net_statistics, field) #define __NET_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.net_statistics, field) #define NET_ADD_STATS(net, field, adnd) SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd) #define __NET_ADD_STATS(net, field, adnd) __SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd) static inline u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt) { return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt); } unsigned long snmp_fold_field(void __percpu *mib, int offt); #if BITS_PER_LONG==32 u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, size_t syncp_offset); u64 snmp_fold_field64(void __percpu *mib, int offt, size_t sync_off); #else static inline u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, size_t syncp_offset) { return snmp_get_cpu_field(mib, cpu, offct); } static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_off) { return snmp_fold_field(mib, offt); } #endif #define snmp_get_cpu_field64_batch(buff64, stats_list, mib_statistic, offset) \ { \ int i, c; \ for_each_possible_cpu(c) { \ for (i = 0; stats_list[i].name; i++) \ buff64[i] += snmp_get_cpu_field64( \ mib_statistic, \ c, stats_list[i].entry, \ offset); \ } \ } #define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \ { \ int i, c; \ for_each_possible_cpu(c) { \ for (i = 0; stats_list[i].name; i++) \ buff[i] += snmp_get_cpu_field( \ mib_statistic, \ c, stats_list[i].entry); \ } \ } static inline void inet_get_local_port_range(const struct net *net, int *low, int *high) { u32 range = READ_ONCE(net->ipv4.ip_local_ports.range); *low = range & 0xffff; *high = range >> 16; } bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); #ifdef CONFIG_SYSCTL static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) { if (!net->ipv4.sysctl_local_reserved_ports) return false; return test_bit(port, net->ipv4.sysctl_local_reserved_ports); } static inline bool sysctl_dev_name_is_allowed(const char *name) { return strcmp(name, "default") != 0 && strcmp(name, "all") != 0; } static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port) { return port < READ_ONCE(net->ipv4.sysctl_ip_prot_sock); } #else static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) { return false; } static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port) { return port < PROT_SOCK; } #endif __be32 inet_current_timestamp(void); /* From inetpeer.c */ extern int inet_peer_threshold; extern int inet_peer_minttl; extern int inet_peer_maxttl; void ipfrag_init(void); void ip_static_sysctl_init(void); #define IP4_REPLY_MARK(net, mark) \ (READ_ONCE((net)->ipv4.sysctl_fwmark_reflect) ? (mark) : 0) static inline bool ip_is_fragment(const struct iphdr *iph) { return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0; } #ifdef CONFIG_INET #include <net/dst.h> /* The function in 2.2 was invalid, producing wrong result for * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */ static inline int ip_decrease_ttl(struct iphdr *iph) { u32 check = (__force u32)iph->check; check += (__force u32)htons(0x0100); iph->check = (__force __sum16)(check + (check>=0xFFFF)); return --iph->ttl; } static inline dscp_t ip4h_dscp(const struct iphdr *ip4h) { return inet_dsfield_to_dscp(ip4h->tos); } static inline int ip_mtu_locked(const struct dst_entry *dst) { const struct rtable *rt = dst_rtable(dst); return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU); } static inline int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst) { u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc); return pmtudisc == IP_PMTUDISC_DO || (pmtudisc == IP_PMTUDISC_WANT && !ip_mtu_locked(dst)); } static inline bool ip_sk_accept_pmtu(const struct sock *sk) { u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc); return pmtudisc != IP_PMTUDISC_INTERFACE && pmtudisc != IP_PMTUDISC_OMIT; } static inline bool ip_sk_use_pmtu(const struct sock *sk) { return READ_ONCE(inet_sk(sk)->pmtudisc) < IP_PMTUDISC_PROBE; } static inline bool ip_sk_ignore_df(const struct sock *sk) { u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc); return pmtudisc < IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_OMIT; } static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { const struct rtable *rt = dst_rtable(dst); struct net *net = dev_net(dst->dev); unsigned int mtu; if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { mtu = rt->rt_pmtu; if (mtu && time_before(jiffies, rt->dst.expires)) goto out; } /* 'forwarding = true' case should always honour route mtu */ mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) goto out; mtu = READ_ONCE(dst->dev->mtu); if (unlikely(ip_mtu_locked(dst))) { if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } out: mtu = min_t(unsigned int, mtu, IP_MAX_MTU); return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } static inline unsigned int ip_skb_dst_mtu(struct sock *sk, const struct sk_buff *skb) { unsigned int mtu; if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) { bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); } mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu); } struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len, struct netlink_ext_ack *extack); static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics) { if (fib_metrics != &dst_default_metrics && refcount_dec_and_test(&fib_metrics->refcnt)) kfree(fib_metrics); } /* ipv4 and ipv6 both use refcounted metrics if it is not the default */ static inline void ip_dst_init_metrics(struct dst_entry *dst, struct dst_metrics *fib_metrics) { dst_init_metrics(dst, fib_metrics->metrics, true); if (fib_metrics != &dst_default_metrics) { dst->_metrics |= DST_METRICS_REFCOUNTED; refcount_inc(&fib_metrics->refcnt); } } static inline void ip_dst_metrics_put(struct dst_entry *dst) { struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) kfree(p); } void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, struct sock *sk, int segs) { struct iphdr *iph = ip_hdr(skb); /* We had many attacks based on IPID, use the private * generator as much as we can. */ if (sk && inet_sk(sk)->inet_daddr) { int val; /* avoid atomic operations for TCP, * as we hold socket lock at this point. */ if (sk_is_tcp(sk)) { sock_owned_by_me(sk); val = atomic_read(&inet_sk(sk)->inet_id); atomic_set(&inet_sk(sk)->inet_id, val + segs); } else { val = atomic_add_return(segs, &inet_sk(sk)->inet_id); } iph->id = htons(val); return; } if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { iph->id = 0; } else { /* Unfortunately we need the big hammer to get a suitable IPID */ __ip_select_ident(net, iph, segs); } } static inline void ip_select_ident(struct net *net, struct sk_buff *skb, struct sock *sk) { ip_select_ident_segs(net, skb, sk, 1); } static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto) { return csum_tcpudp_nofold(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len, proto, 0); } /* copy IPv4 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v4addrs.src = iph->saddr; * flow->v4addrs.dst = iph->daddr; */ static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow, const struct iphdr *iph) { BUILD_BUG_ON(offsetof(typeof(flow->addrs), v4addrs.dst) != offsetof(typeof(flow->addrs), v4addrs.src) + sizeof(flow->addrs.v4addrs.src)); memcpy(&flow->addrs.v4addrs, &iph->addrs, sizeof(flow->addrs.v4addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } /* * Map a multicast IP onto multicast MAC for type ethernet. */ static inline void ip_eth_mc_map(__be32 naddr, char *buf) { __u32 addr=ntohl(naddr); buf[0]=0x01; buf[1]=0x00; buf[2]=0x5e; buf[5]=addr&0xFF; addr>>=8; buf[4]=addr&0xFF; addr>>=8; buf[3]=addr&0x7F; } /* * Map a multicast IP onto multicast MAC for type IP-over-InfiniBand. * Leave P_Key as 0 to be filled in by driver. */ static inline void ip_ib_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf) { __u32 addr; unsigned char scope = broadcast[5] & 0xF; buf[0] = 0; /* Reserved */ buf[1] = 0xff; /* Multicast QPN */ buf[2] = 0xff; buf[3] = 0xff; addr = ntohl(naddr); buf[4] = 0xff; buf[5] = 0x10 | scope; /* scope from broadcast address */ buf[6] = 0x40; /* IPv4 signature */ buf[7] = 0x1b; buf[8] = broadcast[8]; /* P_Key */ buf[9] = broadcast[9]; buf[10] = 0; buf[11] = 0; buf[12] = 0; buf[13] = 0; buf[14] = 0; buf[15] = 0; buf[19] = addr & 0xff; addr >>= 8; buf[18] = addr & 0xff; addr >>= 8; buf[17] = addr & 0xff; addr >>= 8; buf[16] = addr & 0x0f; } static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf) { if ((broadcast[0] | broadcast[1] | broadcast[2] | broadcast[3]) != 0) memcpy(buf, broadcast, 4); else memcpy(buf, &naddr, sizeof(naddr)); } #if IS_ENABLED(CONFIG_IPV6) #include <linux/ipv6.h> #endif static __inline__ void inet_reset_saddr(struct sock *sk) { inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); memset(&np->saddr, 0, sizeof(np->saddr)); memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr)); } #endif } #endif static inline unsigned int ipv4_addr_hash(__be32 ip) { return (__force unsigned int) ip; } static inline u32 __ipv4_addr_hash(const __be32 ip, const u32 initval) { return jhash_1word((__force u32)ip, initval); } static inline u32 ipv4_portaddr_hash(const struct net *net, __be32 saddr, unsigned int port) { return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; } bool ip_call_ra_chain(struct sk_buff *skb); /* * Functions provided by ip_fragment.c */ enum ip_defrag_users { IP_DEFRAG_LOCAL_DELIVER, IP_DEFRAG_CALL_RA_CHAIN, IP_DEFRAG_CONNTRACK_IN, __IP_DEFRAG_CONNTRACK_IN_END = IP_DEFRAG_CONNTRACK_IN + USHRT_MAX, IP_DEFRAG_CONNTRACK_OUT, __IP_DEFRAG_CONNTRACK_OUT_END = IP_DEFRAG_CONNTRACK_OUT + USHRT_MAX, IP_DEFRAG_CONNTRACK_BRIDGE_IN, __IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, IP_DEFRAG_VS_IN, IP_DEFRAG_VS_OUT, IP_DEFRAG_VS_FWD, IP_DEFRAG_AF_PACKET, IP_DEFRAG_MACVLAN, }; /* Return true if the value of 'user' is between 'lower_bond' * and 'upper_bond' inclusively. */ static inline bool ip_defrag_user_in_between(u32 user, enum ip_defrag_users lower_bond, enum ip_defrag_users upper_bond) { return user >= lower_bond && user <= upper_bond; } int ip_defrag(struct net *net, struct sk_buff *skb, u32 user); #ifdef CONFIG_INET struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user); #else static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) { return skb; } #endif /* * Functions provided by ip_forward.c */ int ip_forward(struct sk_buff *skb); /* * Functions provided by ip_options.c */ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt); int __ip_options_echo(struct net *net, struct ip_options *dopt, struct sk_buff *skb, const struct ip_options *sopt); static inline int ip_options_echo(struct net *net, struct ip_options *dopt, struct sk_buff *skb) { return __ip_options_echo(net, dopt, skb, &IPCB(skb)->opt); } void ip_options_fragment(struct sk_buff *skb); int __ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb, __be32 *info); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb); int ip_options_get(struct net *net, struct ip_options_rcu **optp, sockptr_t data, int optlen); void ip_options_undo(struct ip_options *opt); void ip_forward_options(struct sk_buff *skb); int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev); /* * Functions provided by ip_sockglue.c */ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb, bool drop_dst); void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, int tlen, int offset); int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6); DECLARE_STATIC_KEY_FALSE(ip4_min_ttl); int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int do_ip_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)); int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, u32 info); static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) { ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0); } bool icmp_global_allow(struct net *net); void icmp_global_consume(struct net *net); #ifdef CONFIG_PROC_FS int ip_misc_proc_init(void); #endif int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family, struct netlink_ext_ack *extack); static inline bool inetdev_valid_mtu(unsigned int mtu) { return likely(mtu >= IPV4_MIN_MTU); } void ip_sock_set_freebind(struct sock *sk); int ip_sock_set_mtu_discover(struct sock *sk, int val); void ip_sock_set_pktinfo(struct sock *sk); void ip_sock_set_recverr(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); void __ip_sock_set_tos(struct sock *sk, int val); #endif /* _IP_H */
2414 2413 2412 2408 2410 2407 2412 2326 765 2400 2406 14 122 2408 755 757 756 755 2402 2414 757 756 5 5 2411 2410 5 2408 2407 2414 8 2410 2414 2409 2402 2413 8 13 13 13 13 12 12 13 2403 2405 2402 2406 8 8 2335 2337 2334 2333 8 8 2333 2 13 14 14 14 2 13 13 13 13 13 13 2334 2337 2411 2406 122 2407 2413 2403 2414 2405 2411 2404 5 2414 2410 2406 2398 2411 2414 103 102 2322 2330 2401 2345 2407 2414 2342 2335 2333 2400 2408 2401 2414 2405 2401 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/irqflags.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/bug.h> #include "printk_ringbuffer.h" #include "internal.h" /** * DOC: printk_ringbuffer overview * * Data Structure * -------------- * The printk_ringbuffer is made up of 3 internal ringbuffers: * * desc_ring * A ring of descriptors and their meta data (such as sequence number, * timestamp, loglevel, etc.) as well as internal state information about * the record and logical positions specifying where in the other * ringbuffer the text strings are located. * * text_data_ring * A ring of data blocks. A data block consists of an unsigned long * integer (ID) that maps to a desc_ring index followed by the text * string of the record. * * The internal state information of a descriptor is the key element to allow * readers and writers to locklessly synchronize access to the data. * * Implementation * -------------- * * Descriptor Ring * ~~~~~~~~~~~~~~~ * The descriptor ring is an array of descriptors. A descriptor contains * essential meta data to track the data of a printk record using * blk_lpos structs pointing to associated text data blocks (see * "Data Rings" below). Each descriptor is assigned an ID that maps * directly to index values of the descriptor array and has a state. The ID * and the state are bitwise combined into a single descriptor field named * @state_var, allowing ID and state to be synchronously and atomically * updated. * * Descriptors have four states: * * reserved * A writer is modifying the record. * * committed * The record and all its data are written. A writer can reopen the * descriptor (transitioning it back to reserved), but in the committed * state the data is consistent. * * finalized * The record and all its data are complete and available for reading. A * writer cannot reopen the descriptor. * * reusable * The record exists, but its text and/or meta data may no longer be * available. * * Querying the @state_var of a record requires providing the ID of the * descriptor to query. This can yield a possible fifth (pseudo) state: * * miss * The descriptor being queried has an unexpected ID. * * The descriptor ring has a @tail_id that contains the ID of the oldest * descriptor and @head_id that contains the ID of the newest descriptor. * * When a new descriptor should be created (and the ring is full), the tail * descriptor is invalidated by first transitioning to the reusable state and * then invalidating all tail data blocks up to and including the data blocks * associated with the tail descriptor (for the text ring). Then * @tail_id is advanced, followed by advancing @head_id. And finally the * @state_var of the new descriptor is initialized to the new ID and reserved * state. * * The @tail_id can only be advanced if the new @tail_id would be in the * committed or reusable queried state. This makes it possible that a valid * sequence number of the tail is always available. * * Descriptor Finalization * ~~~~~~~~~~~~~~~~~~~~~~~ * When a writer calls the commit function prb_commit(), record data is * fully stored and is consistent within the ringbuffer. However, a writer can * reopen that record, claiming exclusive access (as with prb_reserve()), and * modify that record. When finished, the writer must again commit the record. * * In order for a record to be made available to readers (and also become * recyclable for writers), it must be finalized. A finalized record cannot be * reopened and can never become "unfinalized". Record finalization can occur * in three different scenarios: * * 1) A writer can simultaneously commit and finalize its record by calling * prb_final_commit() instead of prb_commit(). * * 2) When a new record is reserved and the previous record has been * committed via prb_commit(), that previous record is automatically * finalized. * * 3) When a record is committed via prb_commit() and a newer record * already exists, the record being committed is automatically finalized. * * Data Ring * ~~~~~~~~~ * The text data ring is a byte array composed of data blocks. Data blocks are * referenced by blk_lpos structs that point to the logical position of the * beginning of a data block and the beginning of the next adjacent data * block. Logical positions are mapped directly to index values of the byte * array ringbuffer. * * Each data block consists of an ID followed by the writer data. The ID is * the identifier of a descriptor that is associated with the data block. A * given data block is considered valid if all of the following conditions * are met: * * 1) The descriptor associated with the data block is in the committed * or finalized queried state. * * 2) The blk_lpos struct within the descriptor associated with the data * block references back to the same data block. * * 3) The data block is within the head/tail logical position range. * * If the writer data of a data block would extend beyond the end of the * byte array, only the ID of the data block is stored at the logical * position and the full data block (ID and writer data) is stored at the * beginning of the byte array. The referencing blk_lpos will point to the * ID before the wrap and the next data block will be at the logical * position adjacent the full data block after the wrap. * * Data rings have a @tail_lpos that points to the beginning of the oldest * data block and a @head_lpos that points to the logical position of the * next (not yet existing) data block. * * When a new data block should be created (and the ring is full), tail data * blocks will first be invalidated by putting their associated descriptors * into the reusable state and then pushing the @tail_lpos forward beyond * them. Then the @head_lpos is pushed forward and is associated with a new * descriptor. If a data block is not valid, the @tail_lpos cannot be * advanced beyond it. * * Info Array * ~~~~~~~~~~ * The general meta data of printk records are stored in printk_info structs, * stored in an array with the same number of elements as the descriptor ring. * Each info corresponds to the descriptor of the same index in the * descriptor ring. Info validity is confirmed by evaluating the corresponding * descriptor before and after loading the info. * * Usage * ----- * Here are some simple examples demonstrating writers and readers. For the * examples a global ringbuffer (test_rb) is available (which is not the * actual ringbuffer used by printk):: * * DEFINE_PRINTKRB(test_rb, 15, 5); * * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of * 1 MiB (2 ^ (15 + 5)) for text data. * * Sample writer code:: * * const char *textstr = "message text"; * struct prb_reserved_entry e; * struct printk_record r; * * // specify how much to allocate * prb_rec_init_wr(&r, strlen(textstr) + 1); * * if (prb_reserve(&e, &test_rb, &r)) { * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); * * r.info->text_len = strlen(textstr); * r.info->ts_nsec = local_clock(); * r.info->caller_id = printk_caller_id(); * * // commit and finalize the record * prb_final_commit(&e); * } * * Note that additional writer functions are available to extend a record * after it has been committed but not yet finalized. This can be done as * long as no new records have been reserved and the caller is the same. * * Sample writer code (record extending):: * * // alternate rest of previous example * * r.info->text_len = strlen(textstr); * r.info->ts_nsec = local_clock(); * r.info->caller_id = printk_caller_id(); * * // commit the record (but do not finalize yet) * prb_commit(&e); * } * * ... * * // specify additional 5 bytes text space to extend * prb_rec_init_wr(&r, 5); * * // try to extend, but only if it does not exceed 32 bytes * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) { * snprintf(&r.text_buf[r.info->text_len], * r.text_buf_size - r.info->text_len, "hello"); * * r.info->text_len += 5; * * // commit and finalize the record * prb_final_commit(&e); * } * * Sample reader code:: * * struct printk_info info; * struct printk_record r; * char text_buf[32]; * u64 seq; * * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf)); * * prb_for_each_record(0, &test_rb, &seq, &r) { * if (info.seq != seq) * pr_warn("lost %llu records\n", info.seq - seq); * * if (info.text_len > r.text_buf_size) { * pr_warn("record %llu text truncated\n", info.seq); * text_buf[r.text_buf_size - 1] = 0; * } * * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec, * &text_buf[0]); * } * * Note that additional less convenient reader functions are available to * allow complex record access. * * ABA Issues * ~~~~~~~~~~ * To help avoid ABA issues, descriptors are referenced by IDs (array index * values combined with tagged bits counting array wraps) and data blocks are * referenced by logical positions (array index values combined with tagged * bits counting array wraps). However, on 32-bit systems the number of * tagged bits is relatively small such that an ABA incident is (at least * theoretically) possible. For example, if 4 million maximally sized (1KiB) * printk messages were to occur in NMI context on a 32-bit system, the * interrupted context would not be able to recognize that the 32-bit integer * completely wrapped and thus represents a different data block than the one * the interrupted context expects. * * To help combat this possibility, additional state checking is performed * (such as using cmpxchg() even though set() would suffice). These extra * checks are commented as such and will hopefully catch any ABA issue that * a 32-bit system might experience. * * Memory Barriers * ~~~~~~~~~~~~~~~ * Multiple memory barriers are used. To simplify proving correctness and * generating litmus tests, lines of code related to memory barriers * (loads, stores, and the associated memory barriers) are labeled:: * * LMM(function:letter) * * Comments reference the labels using only the "function:letter" part. * * The memory barrier pairs and their ordering are: * * desc_reserve:D / desc_reserve:B * push descriptor tail (id), then push descriptor head (id) * * desc_reserve:D / data_push_tail:B * push data tail (lpos), then set new descriptor reserved (state) * * desc_reserve:D / desc_push_tail:C * push descriptor tail (id), then set new descriptor reserved (state) * * desc_reserve:D / prb_first_seq:C * push descriptor tail (id), then set new descriptor reserved (state) * * desc_reserve:F / desc_read:D * set new descriptor id and reserved (state), then allow writer changes * * data_alloc:A (or data_realloc:A) / desc_read:D * set old descriptor reusable (state), then modify new data block area * * data_alloc:A (or data_realloc:A) / data_push_tail:B * push data tail (lpos), then modify new data block area * * _prb_commit:B / desc_read:B * store writer changes, then set new descriptor committed (state) * * desc_reopen_last:A / _prb_commit:B * set descriptor reserved (state), then read descriptor data * * _prb_commit:B / desc_reserve:D * set new descriptor committed (state), then check descriptor head (id) * * data_push_tail:D / data_push_tail:A * set descriptor reusable (state), then push data tail (lpos) * * desc_push_tail:B / desc_reserve:D * set descriptor reusable (state), then push descriptor tail (id) * * desc_update_last_finalized:A / desc_last_finalized_seq:A * store finalized record, then set new highest finalized sequence number */ #define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) #define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1) #define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits) #define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1) /* Determine the data array index from a logical position. */ #define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring)) /* Determine the desc array index from an ID or sequence number. */ #define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring)) /* Determine how many times the data array has wrapped. */ #define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits) /* Determine if a logical position refers to a data-less block. */ #define LPOS_DATALESS(lpos) ((lpos) & 1UL) #define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \ LPOS_DATALESS((blk)->next)) /* Get the logical position at index 0 of the current wrap. */ #define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ ((lpos) & ~DATA_SIZE_MASK(data_ring)) /* Get the ID for the same index of the previous wrap as the given ID. */ #define DESC_ID_PREV_WRAP(desc_ring, id) \ DESC_ID((id) - DESCS_COUNT(desc_ring)) /* * A data block: mapped directly to the beginning of the data block area * specified as a logical position within the data ring. * * @id: the ID of the associated descriptor * @data: the writer data * * Note that the size of a data block is only known by its associated * descriptor. */ struct prb_data_block { unsigned long id; char data[]; }; /* * Return the descriptor associated with @n. @n can be either a * descriptor ID or a sequence number. */ static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n) { return &desc_ring->descs[DESC_INDEX(desc_ring, n)]; } /* * Return the printk_info associated with @n. @n can be either a * descriptor ID or a sequence number. */ static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n) { return &desc_ring->infos[DESC_INDEX(desc_ring, n)]; } static struct prb_data_block *to_block(struct prb_data_ring *data_ring, unsigned long begin_lpos) { return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)]; } /* * Increase the data size to account for data block meta data plus any * padding so that the adjacent data block is aligned on the ID size. */ static unsigned int to_blk_size(unsigned int size) { struct prb_data_block *db = NULL; size += sizeof(*db); size = ALIGN(size, sizeof(db->id)); return size; } /* * Sanity checker for reserve size. The ringbuffer code assumes that a data * block does not exceed the maximum possible size that could fit within the * ringbuffer. This function provides that basic size check so that the * assumption is safe. */ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) { struct prb_data_block *db = NULL; if (size == 0) return true; /* * Ensure the alignment padded size could possibly fit in the data * array. The largest possible data block must still leave room for * at least the ID of the next block. */ size = to_blk_size(size); if (size > DATA_SIZE(data_ring) - sizeof(db->id)) return false; return true; } /* Query the state of a descriptor. */ static enum desc_state get_desc_state(unsigned long id, unsigned long state_val) { if (id != DESC_ID(state_val)) return desc_miss; return DESC_STATE(state_val); } /* * Get a copy of a specified descriptor and return its queried state. If the * descriptor is in an inconsistent state (miss or reserved), the caller can * only expect the descriptor's @state_var field to be valid. * * The sequence number and caller_id can be optionally retrieved. Like all * non-state_var data, they are only valid if the descriptor is in a * consistent state. */ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, unsigned long id, struct prb_desc *desc_out, u64 *seq_out, u32 *caller_id_out) { struct printk_info *info = to_info(desc_ring, id); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; enum desc_state d_state; unsigned long state_val; /* Check the descriptor state. */ state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */ d_state = get_desc_state(id, state_val); if (d_state == desc_miss || d_state == desc_reserved) { /* * The descriptor is in an inconsistent state. Set at least * @state_var so that the caller can see the details of * the inconsistent state. */ goto out; } /* * Guarantee the state is loaded before copying the descriptor * content. This avoids copying obsolete descriptor content that might * not apply to the descriptor state. This pairs with _prb_commit:B. * * Memory barrier involvement: * * If desc_read:A reads from _prb_commit:B, then desc_read:C reads * from _prb_commit:A. * * Relies on: * * WMB from _prb_commit:A to _prb_commit:B * matching * RMB from desc_read:A to desc_read:C */ smp_rmb(); /* LMM(desc_read:B) */ /* * Copy the descriptor data. The data is not valid until the * state has been re-checked. A memcpy() for all of @desc * cannot be used because of the atomic_t @state_var field. */ if (desc_out) { memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ } if (seq_out) *seq_out = info->seq; /* also part of desc_read:C */ if (caller_id_out) *caller_id_out = info->caller_id; /* also part of desc_read:C */ /* * 1. Guarantee the descriptor content is loaded before re-checking * the state. This avoids reading an obsolete descriptor state * that may not apply to the copied content. This pairs with * desc_reserve:F. * * Memory barrier involvement: * * If desc_read:C reads from desc_reserve:G, then desc_read:E * reads from desc_reserve:F. * * Relies on: * * WMB from desc_reserve:F to desc_reserve:G * matching * RMB from desc_read:C to desc_read:E * * 2. Guarantee the record data is loaded before re-checking the * state. This avoids reading an obsolete descriptor state that may * not apply to the copied data. This pairs with data_alloc:A and * data_realloc:A. * * Memory barrier involvement: * * If copy_data:A reads from data_alloc:B, then desc_read:E * reads from desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to data_alloc:B * matching * RMB from desc_read:C to desc_read:E * * Note: desc_make_reusable:A and data_alloc:B can be different * CPUs. However, the data_alloc:B CPU (which performs the * full memory barrier) must have previously seen * desc_make_reusable:A. */ smp_rmb(); /* LMM(desc_read:D) */ /* * The data has been copied. Return the current descriptor state, * which may have changed since the load above. */ state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ d_state = get_desc_state(id, state_val); out: if (desc_out) atomic_long_set(&desc_out->state_var, state_val); return d_state; } /* * Take a specified descriptor out of the finalized state by attempting * the transition from finalized to reusable. Either this context or some * other context will have been successful. */ static void desc_make_reusable(struct prb_desc_ring *desc_ring, unsigned long id) { unsigned long val_finalized = DESC_SV(id, desc_finalized); unsigned long val_reusable = DESC_SV(id, desc_reusable); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; atomic_long_cmpxchg_relaxed(state_var, val_finalized, val_reusable); /* LMM(desc_make_reusable:A) */ } /* * Given the text data ring, put the associated descriptor of each * data block from @lpos_begin until @lpos_end into the reusable state. * * If there is any problem making the associated descriptor reusable, either * the descriptor has not yet been finalized or another writer context has * already pushed the tail lpos past the problematic data block. Regardless, * on error the caller can re-load the tail lpos to determine the situation. */ static bool data_make_reusable(struct printk_ringbuffer *rb, unsigned long lpos_begin, unsigned long lpos_end, unsigned long *lpos_out) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_desc_ring *desc_ring = &rb->desc_ring; struct prb_data_block *blk; enum desc_state d_state; struct prb_desc desc; struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos; unsigned long id; /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { blk = to_block(data_ring, lpos_begin); /* * Load the block ID from the data block. This is a data race * against a writer that may have newly reserved this data * area. If the loaded value matches a valid descriptor ID, * the blk_lpos of that descriptor will be checked to make * sure it points back to this data block. If the check fails, * the data area has been recycled by another writer. */ id = blk->id; /* LMM(data_make_reusable:A) */ d_state = desc_read(desc_ring, id, &desc, NULL, NULL); /* LMM(data_make_reusable:B) */ switch (d_state) { case desc_miss: case desc_reserved: case desc_committed: return false; case desc_finalized: /* * This data block is invalid if the descriptor * does not point back to it. */ if (blk_lpos->begin != lpos_begin) return false; desc_make_reusable(desc_ring, id); break; case desc_reusable: /* * This data block is invalid if the descriptor * does not point back to it. */ if (blk_lpos->begin != lpos_begin) return false; break; } /* Advance @lpos_begin to the next data block. */ lpos_begin = blk_lpos->next; } *lpos_out = lpos_begin; return true; } /* * Advance the data ring tail to at least @lpos. This function puts * descriptors into the reusable state if the tail is pushed beyond * their associated data block. */ static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos) { struct prb_data_ring *data_ring = &rb->text_data_ring; unsigned long tail_lpos_new; unsigned long tail_lpos; unsigned long next_lpos; /* If @lpos is from a data-less block, there is nothing to do. */ if (LPOS_DATALESS(lpos)) return true; /* * Any descriptor states that have transitioned to reusable due to the * data tail being pushed to this loaded value will be visible to this * CPU. This pairs with data_push_tail:D. * * Memory barrier involvement: * * If data_push_tail:A reads from data_push_tail:D, then this CPU can * see desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to data_push_tail:D * matches * READFROM from data_push_tail:D to data_push_tail:A * thus * READFROM from desc_make_reusable:A to this CPU */ tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */ /* * Loop until the tail lpos is at or beyond @lpos. This condition * may already be satisfied, resulting in no full memory barrier * from data_push_tail:D being performed. However, since this CPU * sees the new tail lpos, any descriptor states that transitioned to * the reusable state must already be visible. */ while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) { /* * Make all descriptors reusable that are associated with * data blocks before @lpos. */ if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) { /* * 1. Guarantee the block ID loaded in * data_make_reusable() is performed before * reloading the tail lpos. The failed * data_make_reusable() may be due to a newly * recycled data area causing the tail lpos to * have been previously pushed. This pairs with * data_alloc:A and data_realloc:A. * * Memory barrier involvement: * * If data_make_reusable:A reads from data_alloc:B, * then data_push_tail:C reads from * data_push_tail:D. * * Relies on: * * MB from data_push_tail:D to data_alloc:B * matching * RMB from data_make_reusable:A to * data_push_tail:C * * Note: data_push_tail:D and data_alloc:B can be * different CPUs. However, the data_alloc:B * CPU (which performs the full memory * barrier) must have previously seen * data_push_tail:D. * * 2. Guarantee the descriptor state loaded in * data_make_reusable() is performed before * reloading the tail lpos. The failed * data_make_reusable() may be due to a newly * recycled descriptor causing the tail lpos to * have been previously pushed. This pairs with * desc_reserve:D. * * Memory barrier involvement: * * If data_make_reusable:B reads from * desc_reserve:F, then data_push_tail:C reads * from data_push_tail:D. * * Relies on: * * MB from data_push_tail:D to desc_reserve:F * matching * RMB from data_make_reusable:B to * data_push_tail:C * * Note: data_push_tail:D and desc_reserve:F can * be different CPUs. However, the * desc_reserve:F CPU (which performs the * full memory barrier) must have previously * seen data_push_tail:D. */ smp_rmb(); /* LMM(data_push_tail:B) */ tail_lpos_new = atomic_long_read(&data_ring->tail_lpos ); /* LMM(data_push_tail:C) */ if (tail_lpos_new == tail_lpos) return false; /* Another CPU pushed the tail. Try again. */ tail_lpos = tail_lpos_new; continue; } /* * Guarantee any descriptor states that have transitioned to * reusable are stored before pushing the tail lpos. A full * memory barrier is needed since other CPUs may have made * the descriptor states reusable. This pairs with * data_push_tail:A. */ if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos, next_lpos)) { /* LMM(data_push_tail:D) */ break; } } return true; } /* * Advance the desc ring tail. This function advances the tail by one * descriptor, thus invalidating the oldest descriptor. Before advancing * the tail, the tail descriptor is made reusable and all data blocks up to * and including the descriptor's data block are invalidated (i.e. the data * ring tail is pushed past the data block of the descriptor being made * reusable). */ static bool desc_push_tail(struct printk_ringbuffer *rb, unsigned long tail_id) { struct prb_desc_ring *desc_ring = &rb->desc_ring; enum desc_state d_state; struct prb_desc desc; d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL); switch (d_state) { case desc_miss: /* * If the ID is exactly 1 wrap behind the expected, it is * in the process of being reserved by another writer and * must be considered reserved. */ if (DESC_ID(atomic_long_read(&desc.state_var)) == DESC_ID_PREV_WRAP(desc_ring, tail_id)) { return false; } /* * The ID has changed. Another writer must have pushed the * tail and recycled the descriptor already. Success is * returned because the caller is only interested in the * specified tail being pushed, which it was. */ return true; case desc_reserved: case desc_committed: return false; case desc_finalized: desc_make_reusable(desc_ring, tail_id); break; case desc_reusable: break; } /* * Data blocks must be invalidated before their associated * descriptor can be made available for recycling. Invalidating * them later is not possible because there is no way to trust * data blocks once their associated descriptor is gone. */ if (!data_push_tail(rb, desc.text_blk_lpos.next)) return false; /* * Check the next descriptor after @tail_id before pushing the tail * to it because the tail must always be in a finalized or reusable * state. The implementation of prb_first_seq() relies on this. * * A successful read implies that the next descriptor is less than or * equal to @head_id so there is no risk of pushing the tail past the * head. */ d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc, NULL, NULL); /* LMM(desc_push_tail:A) */ if (d_state == desc_finalized || d_state == desc_reusable) { /* * Guarantee any descriptor states that have transitioned to * reusable are stored before pushing the tail ID. This allows * verifying the recycled descriptor state. A full memory * barrier is needed since other CPUs may have made the * descriptor states reusable. This pairs with desc_reserve:D. */ atomic_long_cmpxchg(&desc_ring->tail_id, tail_id, DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */ } else { /* * Guarantee the last state load from desc_read() is before * reloading @tail_id in order to see a new tail ID in the * case that the descriptor has been recycled. This pairs * with desc_reserve:D. * * Memory barrier involvement: * * If desc_push_tail:A reads from desc_reserve:F, then * desc_push_tail:D reads from desc_push_tail:B. * * Relies on: * * MB from desc_push_tail:B to desc_reserve:F * matching * RMB from desc_push_tail:A to desc_push_tail:D * * Note: desc_push_tail:B and desc_reserve:F can be different * CPUs. However, the desc_reserve:F CPU (which performs * the full memory barrier) must have previously seen * desc_push_tail:B. */ smp_rmb(); /* LMM(desc_push_tail:C) */ /* * Re-check the tail ID. The descriptor following @tail_id is * not in an allowed tail state. But if the tail has since * been moved by another CPU, then it does not matter. */ if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */ return false; } return true; } /* Reserve a new descriptor, invalidating the oldest if necessary. */ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long prev_state_val; unsigned long id_prev_wrap; struct prb_desc *desc; unsigned long head_id; unsigned long id; head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ do { id = DESC_ID(head_id + 1); id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); /* * Guarantee the head ID is read before reading the tail ID. * Since the tail ID is updated before the head ID, this * guarantees that @id_prev_wrap is never ahead of the tail * ID. This pairs with desc_reserve:D. * * Memory barrier involvement: * * If desc_reserve:A reads from desc_reserve:D, then * desc_reserve:C reads from desc_push_tail:B. * * Relies on: * * MB from desc_push_tail:B to desc_reserve:D * matching * RMB from desc_reserve:A to desc_reserve:C * * Note: desc_push_tail:B and desc_reserve:D can be different * CPUs. However, the desc_reserve:D CPU (which performs * the full memory barrier) must have previously seen * desc_push_tail:B. */ smp_rmb(); /* LMM(desc_reserve:B) */ if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id )) { /* LMM(desc_reserve:C) */ /* * Make space for the new descriptor by * advancing the tail. */ if (!desc_push_tail(rb, id_prev_wrap)) return false; } /* * 1. Guarantee the tail ID is read before validating the * recycled descriptor state. A read memory barrier is * sufficient for this. This pairs with desc_push_tail:B. * * Memory barrier involvement: * * If desc_reserve:C reads from desc_push_tail:B, then * desc_reserve:E reads from desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to desc_push_tail:B * matching * RMB from desc_reserve:C to desc_reserve:E * * Note: desc_make_reusable:A and desc_push_tail:B can be * different CPUs. However, the desc_push_tail:B CPU * (which performs the full memory barrier) must have * previously seen desc_make_reusable:A. * * 2. Guarantee the tail ID is stored before storing the head * ID. This pairs with desc_reserve:B. * * 3. Guarantee any data ring tail changes are stored before * recycling the descriptor. Data ring tail changes can * happen via desc_push_tail()->data_push_tail(). A full * memory barrier is needed since another CPU may have * pushed the data ring tails. This pairs with * data_push_tail:B. * * 4. Guarantee a new tail ID is stored before recycling the * descriptor. A full memory barrier is needed since * another CPU may have pushed the tail ID. This pairs * with desc_push_tail:C and this also pairs with * prb_first_seq:C. * * 5. Guarantee the head ID is stored before trying to * finalize the previous descriptor. This pairs with * _prb_commit:B. */ } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id, id)); /* LMM(desc_reserve:D) */ desc = to_desc(desc_ring, id); /* * If the descriptor has been recycled, verify the old state val. * See "ABA Issues" about why this verification is performed. */ prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */ if (prev_state_val && get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) { WARN_ON_ONCE(1); return false; } /* * Assign the descriptor a new ID and set its state to reserved. * See "ABA Issues" about why cmpxchg() instead of set() is used. * * Guarantee the new descriptor ID and state is stored before making * any other changes. A write memory barrier is sufficient for this. * This pairs with desc_read:D. */ if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val, DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */ WARN_ON_ONCE(1); return false; } /* Now data in @desc can be modified: LMM(desc_reserve:G) */ *id_out = id; return true; } /* Determine the end of a data block. */ static unsigned long get_next_lpos(struct prb_data_ring *data_ring, unsigned long lpos, unsigned int size) { unsigned long begin_lpos; unsigned long next_lpos; begin_lpos = lpos; next_lpos = lpos + size; /* First check if the data block does not wrap. */ if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos)) return next_lpos; /* Wrapping data blocks store their data at the beginning. */ return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size); } /* * Allocate a new data block, invalidating the oldest data block(s) * if necessary. This function also associates the data block with * a specified descriptor. */ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, struct prb_data_blk_lpos *blk_lpos, unsigned long id) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_data_block *blk; unsigned long begin_lpos; unsigned long next_lpos; if (size == 0) { /* * Data blocks are not created for empty lines. Instead, the * reader will recognize these special lpos values and handle * it appropriately. */ blk_lpos->begin = EMPTY_LINE_LPOS; blk_lpos->next = EMPTY_LINE_LPOS; return NULL; } size = to_blk_size(size); begin_lpos = atomic_long_read(&data_ring->head_lpos); do { next_lpos = get_next_lpos(data_ring, begin_lpos, size); if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { /* Failed to allocate, specify a data-less block. */ blk_lpos->begin = FAILED_LPOS; blk_lpos->next = FAILED_LPOS; return NULL; } /* * 1. Guarantee any descriptor states that have transitioned * to reusable are stored before modifying the newly * allocated data area. A full memory barrier is needed * since other CPUs may have made the descriptor states * reusable. See data_push_tail:A about why the reusable * states are visible. This pairs with desc_read:D. * * 2. Guarantee any updated tail lpos is stored before * modifying the newly allocated data area. Another CPU may * be in data_make_reusable() and is reading a block ID * from this area. data_make_reusable() can handle reading * a garbage block ID value, but then it must be able to * load a new tail lpos. A full memory barrier is needed * since other CPUs may have updated the tail lpos. This * pairs with data_push_tail:B. */ } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos, next_lpos)); /* LMM(data_alloc:A) */ blk = to_block(data_ring, begin_lpos); blk->id = id; /* LMM(data_alloc:B) */ if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) { /* Wrapping data blocks store their data at the beginning. */ blk = to_block(data_ring, 0); /* * Store the ID on the wrapped block for consistency. * The printk_ringbuffer does not actually use it. */ blk->id = id; } blk_lpos->begin = begin_lpos; blk_lpos->next = next_lpos; return &blk->data[0]; } /* * Try to resize an existing data block associated with the descriptor * specified by @id. If the resized data block should become wrapped, it * copies the old data to the new data block. If @size yields a data block * with the same or less size, the data block is left as is. * * Fail if this is not the last allocated data block or if there is not * enough space or it is not possible make enough space. * * Return a pointer to the beginning of the entire data buffer or NULL on * failure. */ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size, struct prb_data_blk_lpos *blk_lpos, unsigned long id) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_data_block *blk; unsigned long head_lpos; unsigned long next_lpos; bool wrapped; /* Reallocation only works if @blk_lpos is the newest data block. */ head_lpos = atomic_long_read(&data_ring->head_lpos); if (head_lpos != blk_lpos->next) return NULL; /* Keep track if @blk_lpos was a wrapping data block. */ wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)); size = to_blk_size(size); next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size); /* If the data block does not increase, there is nothing to do. */ if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { if (wrapped) blk = to_block(data_ring, 0); else blk = to_block(data_ring, blk_lpos->begin); return &blk->data[0]; } if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) return NULL; /* The memory barrier involvement is the same as data_alloc:A. */ if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos, next_lpos)) { /* LMM(data_realloc:A) */ return NULL; } blk = to_block(data_ring, blk_lpos->begin); if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) { struct prb_data_block *old_blk = blk; /* Wrapping data blocks store their data at the beginning. */ blk = to_block(data_ring, 0); /* * Store the ID on the wrapped block for consistency. * The printk_ringbuffer does not actually use it. */ blk->id = id; if (!wrapped) { /* * Since the allocated space is now in the newly * created wrapping data block, copy the content * from the old data block. */ memcpy(&blk->data[0], &old_blk->data[0], (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id)); } } blk_lpos->next = next_lpos; return &blk->data[0]; } /* Return the number of bytes used by a data block. */ static unsigned int space_used(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos) { /* Data-less blocks take no space. */ if (BLK_DATALESS(blk_lpos)) return 0; if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { /* Data block does not wrap. */ return (DATA_INDEX(data_ring, blk_lpos->next) - DATA_INDEX(data_ring, blk_lpos->begin)); } /* * For wrapping data blocks, the trailing (wasted) space is * also counted. */ return (DATA_INDEX(data_ring, blk_lpos->next) + DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin)); } /* * Given @blk_lpos, return a pointer to the writer data from the data block * and calculate the size of the data part. A NULL pointer is returned if * @blk_lpos specifies values that could never be legal. * * This function (used by readers) performs strict validation on the lpos * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is * triggered if an internal error is detected. */ static const char *get_data(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos, unsigned int *data_size) { struct prb_data_block *db; /* Data-less data block description. */ if (BLK_DATALESS(blk_lpos)) { /* * Records that are just empty lines are also valid, even * though they do not have a data block. For such records * explicitly return empty string data to signify success. */ if (blk_lpos->begin == EMPTY_LINE_LPOS && blk_lpos->next == EMPTY_LINE_LPOS) { *data_size = 0; return ""; } /* Data lost, invalid, or otherwise unavailable. */ return NULL; } /* Regular data block: @begin less than @next and in same wrap. */ if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && blk_lpos->begin < blk_lpos->next) { db = to_block(data_ring, blk_lpos->begin); *data_size = blk_lpos->next - blk_lpos->begin; /* Wrapping data block: @begin is one wrap behind @next. */ } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == DATA_WRAPS(data_ring, blk_lpos->next)) { db = to_block(data_ring, 0); *data_size = DATA_INDEX(data_ring, blk_lpos->next); /* Illegal block description. */ } else { WARN_ON_ONCE(1); return NULL; } /* A valid data block will always be aligned to the ID size. */ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { return NULL; } /* A valid data block will always have at least an ID. */ if (WARN_ON_ONCE(*data_size < sizeof(db->id))) return NULL; /* Subtract block ID space from size to reflect data size. */ *data_size -= sizeof(db->id); return &db->data[0]; } /* * Attempt to transition the newest descriptor from committed back to reserved * so that the record can be modified by a writer again. This is only possible * if the descriptor is not yet finalized and the provided @caller_id matches. */ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, u32 caller_id, unsigned long *id_out) { unsigned long prev_state_val; enum desc_state d_state; struct prb_desc desc; struct prb_desc *d; unsigned long id; u32 cid; id = atomic_long_read(&desc_ring->head_id); /* * To reduce unnecessarily reopening, first check if the descriptor * state and caller ID are correct. */ d_state = desc_read(desc_ring, id, &desc, NULL, &cid); if (d_state != desc_committed || cid != caller_id) return NULL; d = to_desc(desc_ring, id); prev_state_val = DESC_SV(id, desc_committed); /* * Guarantee the reserved state is stored before reading any * record data. A full memory barrier is needed because @state_var * modification is followed by reading. This pairs with _prb_commit:B. * * Memory barrier involvement: * * If desc_reopen_last:A reads from _prb_commit:B, then * prb_reserve_in_last:A reads from _prb_commit:A. * * Relies on: * * WMB from _prb_commit:A to _prb_commit:B * matching * MB If desc_reopen_last:A to prb_reserve_in_last:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */ return NULL; } *id_out = id; return d; } /** * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer * used by the newest record. * * @e: The entry structure to setup. * @rb: The ringbuffer to re-reserve and extend data in. * @r: The record structure to allocate buffers for. * @caller_id: The caller ID of the caller (reserving writer). * @max_size: Fail if the extended size would be greater than this. * * This is the public function available to writers to re-reserve and extend * data. * * The writer specifies the text size to extend (not the new total size) by * setting the @text_buf_size field of @r. To ensure proper initialization * of @r, prb_rec_init_wr() should be used. * * This function will fail if @caller_id does not match the caller ID of the * newest record. In that case the caller must reserve new data using * prb_reserve(). * * Context: Any context. Disables local interrupts on success. * Return: true if text data could be extended, otherwise false. * * On success: * * - @r->text_buf points to the beginning of the entire text buffer. * * - @r->text_buf_size is set to the new total size of the buffer. * * - @r->info is not touched so that @r->info->text_len could be used * to append the text. * * - prb_record_text_space() can be used on @e to query the new * actually used space. * * Important: All @r->info fields will already be set with the current values * for the record. I.e. @r->info->text_len will be less than * @text_buf_size. Writers can use @r->info->text_len to know * where concatenation begins and writers should update * @r->info->text_len after concatenating. */ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r, u32 caller_id, unsigned int max_size) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info; unsigned int data_size; struct prb_desc *d; unsigned long id; local_irq_save(e->irqflags); /* Transition the newest descriptor back to the reserved state. */ d = desc_reopen_last(desc_ring, caller_id, &id); if (!d) { local_irq_restore(e->irqflags); goto fail_reopen; } /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */ info = to_info(desc_ring, id); /* * Set the @e fields here so that prb_commit() can be used if * anything fails from now on. */ e->rb = rb; e->id = id; /* * desc_reopen_last() checked the caller_id, but there was no * exclusive access at that point. The descriptor may have * changed since then. */ if (caller_id != info->caller_id) goto fail; if (BLK_DATALESS(&d->text_blk_lpos)) { if (WARN_ON_ONCE(info->text_len != 0)) { pr_warn_once("wrong text_len value (%hu, expecting 0)\n", info->text_len); info->text_len = 0; } if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; if (r->text_buf_size > max_size) goto fail; r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); } else { if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size)) goto fail; /* * Increase the buffer size to include the original size. If * the meta data (@text_len) is not sane, use the full data * block size. */ if (WARN_ON_ONCE(info->text_len > data_size)) { pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n", info->text_len, data_size); info->text_len = data_size; } r->text_buf_size += info->text_len; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; if (r->text_buf_size > max_size) goto fail; r->text_buf = data_realloc(rb, r->text_buf_size, &d->text_blk_lpos, id); } if (r->text_buf_size && !r->text_buf) goto fail; r->info = info; e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); return true; fail: prb_commit(e); /* prb_commit() re-enabled interrupts. */ fail_reopen: /* Make it clear to the caller that the re-reserve failed. */ memset(r, 0, sizeof(*r)); return false; } /* * @last_finalized_seq value guarantees that all records up to and including * this sequence number are finalized and can be read. The only exception are * too old records which have already been overwritten. * * It is also guaranteed that @last_finalized_seq only increases. * * Be aware that finalized records following non-finalized records are not * reported because they are not yet available to the reader. For example, * a new record stored via printk() will not be available to a printer if * it follows a record that has not been finalized yet. However, once that * non-finalized record becomes finalized, @last_finalized_seq will be * appropriately updated and the full set of finalized records will be * available to the printer. And since each printk() caller will either * directly print or trigger deferred printing of all available unprinted * records, all printk() messages will get printed. */ static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long ulseq; /* * Guarantee the sequence number is loaded before loading the * associated record in order to guarantee that the record can be * seen by this CPU. This pairs with desc_update_last_finalized:A. */ ulseq = atomic_long_read_acquire(&desc_ring->last_finalized_seq ); /* LMM(desc_last_finalized_seq:A) */ return __ulseq_to_u64seq(rb, ulseq); } static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, struct printk_record *r, unsigned int *line_count); /* * Check if there are records directly following @last_finalized_seq that are * finalized. If so, update @last_finalized_seq to the latest of these * records. It is not allowed to skip over records that are not yet finalized. */ static void desc_update_last_finalized(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; u64 old_seq = desc_last_finalized_seq(rb); unsigned long oldval; unsigned long newval; u64 finalized_seq; u64 try_seq; try_again: finalized_seq = old_seq; try_seq = finalized_seq + 1; /* Try to find later finalized records. */ while (_prb_read_valid(rb, &try_seq, NULL, NULL)) { finalized_seq = try_seq; try_seq++; } /* No update needed if no later finalized record was found. */ if (finalized_seq == old_seq) return; oldval = __u64seq_to_ulseq(old_seq); newval = __u64seq_to_ulseq(finalized_seq); /* * Set the sequence number of a later finalized record that has been * seen. * * Guarantee the record data is visible to other CPUs before storing * its sequence number. This pairs with desc_last_finalized_seq:A. * * Memory barrier involvement: * * If desc_last_finalized_seq:A reads from * desc_update_last_finalized:A, then desc_read:A reads from * _prb_commit:B. * * Relies on: * * RELEASE from _prb_commit:B to desc_update_last_finalized:A * matching * ACQUIRE from desc_last_finalized_seq:A to desc_read:A * * Note: _prb_commit:B and desc_update_last_finalized:A can be * different CPUs. However, the desc_update_last_finalized:A * CPU (which performs the release) must have previously seen * _prb_commit:B. */ if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq, &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */ old_seq = __ulseq_to_u64seq(rb, oldval); goto try_again; } } /* * Attempt to finalize a specified descriptor. If this fails, the descriptor * is either already final or it will finalize itself when the writer commits. */ static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long prev_state_val = DESC_SV(id, desc_committed); struct prb_desc *d = to_desc(desc_ring, id); if (atomic_long_try_cmpxchg_relaxed(&d->state_var, &prev_state_val, DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */ desc_update_last_finalized(rb); } } /** * prb_reserve() - Reserve space in the ringbuffer. * * @e: The entry structure to setup. * @rb: The ringbuffer to reserve data in. * @r: The record structure to allocate buffers for. * * This is the public function available to writers to reserve data. * * The writer specifies the text size to reserve by setting the * @text_buf_size field of @r. To ensure proper initialization of @r, * prb_rec_init_wr() should be used. * * Context: Any context. Disables local interrupts on success. * Return: true if at least text data could be allocated, otherwise false. * * On success, the fields @info and @text_buf of @r will be set by this * function and should be filled in by the writer before committing. Also * on success, prb_record_text_space() can be used on @e to query the actual * space used for the text data block. * * Important: @info->text_len needs to be set correctly by the writer in * order for data to be readable and/or extended. Its value * is initialized to 0. */ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info; struct prb_desc *d; unsigned long id; u64 seq; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; /* * Descriptors in the reserved state act as blockers to all further * reservations once the desc_ring has fully wrapped. Disable * interrupts during the reserve/commit window in order to minimize * the likelihood of this happening. */ local_irq_save(e->irqflags); if (!desc_reserve(rb, &id)) { /* Descriptor reservation failures are tracked. */ atomic_long_inc(&rb->fail); local_irq_restore(e->irqflags); goto fail; } d = to_desc(desc_ring, id); info = to_info(desc_ring, id); /* * All @info fields (except @seq) are cleared and must be filled in * by the writer. Save @seq before clearing because it is used to * determine the new sequence number. */ seq = info->seq; memset(info, 0, sizeof(*info)); /* * Set the @e fields here so that prb_commit() can be used if * text data allocation fails. */ e->rb = rb; e->id = id; /* * Initialize the sequence number if it has "never been set". * Otherwise just increment it by a full wrap. * * @seq is considered "never been set" if it has a value of 0, * _except_ for @infos[0], which was specially setup by the ringbuffer * initializer and therefore is always considered as set. * * See the "Bootstrap" comment block in printk_ringbuffer.h for * details about how the initializer bootstraps the descriptors. */ if (seq == 0 && DESC_INDEX(desc_ring, id) != 0) info->seq = DESC_INDEX(desc_ring, id); else info->seq = seq + DESCS_COUNT(desc_ring); /* * New data is about to be reserved. Once that happens, previous * descriptors are no longer able to be extended. Finalize the * previous descriptor now so that it can be made available to * readers. (For seq==0 there is no previous descriptor.) */ if (info->seq > 0) desc_make_final(rb, DESC_ID(id - 1)); r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); /* If text data allocation fails, a data-less record is committed. */ if (r->text_buf_size && !r->text_buf) { prb_commit(e); /* prb_commit() re-enabled interrupts. */ goto fail; } r->info = info; /* Record full text space used by record. */ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); return true; fail: /* Make it clear to the caller that the reserve failed. */ memset(r, 0, sizeof(*r)); return false; } /* Commit the data (possibly finalizing it) and restore interrupts. */ static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; struct prb_desc *d = to_desc(desc_ring, e->id); unsigned long prev_state_val = DESC_SV(e->id, desc_reserved); /* Now the writer has finished all writing: LMM(_prb_commit:A) */ /* * Set the descriptor as committed. See "ABA Issues" about why * cmpxchg() instead of set() is used. * * 1 Guarantee all record data is stored before the descriptor state * is stored as committed. A write memory barrier is sufficient * for this. This pairs with desc_read:B and desc_reopen_last:A. * * 2. Guarantee the descriptor state is stored as committed before * re-checking the head ID in order to possibly finalize this * descriptor. This pairs with desc_reserve:D. * * Memory barrier involvement: * * If prb_commit:A reads from desc_reserve:D, then * desc_make_final:A reads from _prb_commit:B. * * Relies on: * * MB _prb_commit:B to prb_commit:A * matching * MB desc_reserve:D to desc_make_final:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */ WARN_ON_ONCE(1); } /* Restore interrupts, the reserve/commit window is finished. */ local_irq_restore(e->irqflags); } /** * prb_commit() - Commit (previously reserved) data to the ringbuffer. * * @e: The entry containing the reserved data information. * * This is the public function available to writers to commit data. * * Note that the data is not yet available to readers until it is finalized. * Finalizing happens automatically when space for the next record is * reserved. * * See prb_final_commit() for a version of this function that finalizes * immediately. * * Context: Any context. Enables local interrupts. */ void prb_commit(struct prb_reserved_entry *e) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; unsigned long head_id; _prb_commit(e, desc_committed); /* * If this descriptor is no longer the head (i.e. a new record has * been allocated), extending the data for this record is no longer * allowed and therefore it must be finalized. */ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ if (head_id != e->id) desc_make_final(e->rb, e->id); } /** * prb_final_commit() - Commit and finalize (previously reserved) data to * the ringbuffer. * * @e: The entry containing the reserved data information. * * This is the public function available to writers to commit+finalize data. * * By finalizing, the data is made immediately available to readers. * * This function should only be used if there are no intentions of extending * this data using prb_reserve_in_last(). * * Context: Any context. Enables local interrupts. */ void prb_final_commit(struct prb_reserved_entry *e) { _prb_commit(e, desc_finalized); desc_update_last_finalized(e->rb); } /* * Count the number of lines in provided text. All text has at least 1 line * (even if @text_size is 0). Each '\n' processed is counted as an additional * line. */ static unsigned int count_lines(const char *text, unsigned int text_size) { unsigned int next_size = text_size; unsigned int line_count = 1; const char *next = text; while (next_size) { next = memchr(next, '\n', next_size); if (!next) break; line_count++; next++; next_size = text_size - (next - text); } return line_count; } /* * Given @blk_lpos, copy an expected @len of data into the provided buffer. * If @line_count is provided, count the number of lines in the data. * * This function (used by readers) performs strict validation on the data * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is * triggered if an internal error is detected. */ static bool copy_data(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf, unsigned int buf_size, unsigned int *line_count) { unsigned int data_size; const char *data; /* Caller might not want any data. */ if ((!buf || !buf_size) && !line_count) return true; data = get_data(data_ring, blk_lpos, &data_size); if (!data) return false; /* * Actual cannot be less than expected. It can be more than expected * because of the trailing alignment padding. * * Note that invalid @len values can occur because the caller loads * the value during an allowed data race. */ if (data_size < (unsigned int)len) return false; /* Caller interested in the line count? */ if (line_count) *line_count = count_lines(data, len); /* Caller interested in the data content? */ if (!buf || !buf_size) return true; data_size = min_t(unsigned int, buf_size, len); memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ return true; } /* * This is an extended version of desc_read(). It gets a copy of a specified * descriptor. However, it also verifies that the record is finalized and has * the sequence number @seq. On success, 0 is returned. * * Error return values: * -EINVAL: A finalized record with sequence number @seq does not exist. * -ENOENT: A finalized record with sequence number @seq exists, but its data * is not available. This is a valid record, so readers should * continue with the next record. */ static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, unsigned long id, u64 seq, struct prb_desc *desc_out) { struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos; enum desc_state d_state; u64 s; d_state = desc_read(desc_ring, id, desc_out, &s, NULL); /* * An unexpected @id (desc_miss) or @seq mismatch means the record * does not exist. A descriptor in the reserved or committed state * means the record does not yet exist for the reader. */ if (d_state == desc_miss || d_state == desc_reserved || d_state == desc_committed || s != seq) { return -EINVAL; } /* * A descriptor in the reusable state may no longer have its data * available; report it as existing but with lost data. Or the record * may actually be a record with lost data. */ if (d_state == desc_reusable || (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) { return -ENOENT; } return 0; } /* * Copy the ringbuffer data from the record with @seq to the provided * @r buffer. On success, 0 is returned. * * See desc_read_finalized_seq() for error return values. */ static int prb_read(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r, unsigned int *line_count) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info = to_info(desc_ring, seq); struct prb_desc *rdesc = to_desc(desc_ring, seq); atomic_long_t *state_var = &rdesc->state_var; struct prb_desc desc; unsigned long id; int err; /* Extract the ID, used to specify the descriptor to read. */ id = DESC_ID(atomic_long_read(state_var)); /* Get a local copy of the correct descriptor (if available). */ err = desc_read_finalized_seq(desc_ring, id, seq, &desc); /* * If @r is NULL, the caller is only interested in the availability * of the record. */ if (err || !r) return err; /* If requested, copy meta data. */ if (r->info) memcpy(r->info, info, sizeof(*(r->info))); /* Copy text data. If it fails, this is a data-less record. */ if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len, r->text_buf, r->text_buf_size, line_count)) { return -ENOENT; } /* Ensure the record is still finalized and has the same @seq. */ return desc_read_finalized_seq(desc_ring, id, seq, &desc); } /* Get the sequence number of the tail descriptor. */ u64 prb_first_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; enum desc_state d_state; struct prb_desc desc; unsigned long id; u64 seq; for (;;) { id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */ d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */ /* * This loop will not be infinite because the tail is * _always_ in the finalized or reusable state. */ if (d_state == desc_finalized || d_state == desc_reusable) break; /* * Guarantee the last state load from desc_read() is before * reloading @tail_id in order to see a new tail in the case * that the descriptor has been recycled. This pairs with * desc_reserve:D. * * Memory barrier involvement: * * If prb_first_seq:B reads from desc_reserve:F, then * prb_first_seq:A reads from desc_push_tail:B. * * Relies on: * * MB from desc_push_tail:B to desc_reserve:F * matching * RMB prb_first_seq:B to prb_first_seq:A */ smp_rmb(); /* LMM(prb_first_seq:C) */ } return seq; } /** * prb_next_reserve_seq() - Get the sequence number after the most recently * reserved record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what sequence * number will be assigned to the next reserved record. * * Note that depending on the situation, this value can be equal to or * higher than the sequence number returned by prb_next_seq(). * * Context: Any context. * Return: The sequence number that will be assigned to the next record * reserved. */ u64 prb_next_reserve_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long last_finalized_id; atomic_long_t *state_var; u64 last_finalized_seq; unsigned long head_id; struct prb_desc desc; unsigned long diff; struct prb_desc *d; int err; /* * It may not be possible to read a sequence number for @head_id. * So the ID of @last_finailzed_seq is used to calculate what the * sequence number of @head_id will be. */ try_again: last_finalized_seq = desc_last_finalized_seq(rb); /* * @head_id is loaded after @last_finalized_seq to ensure that * it points to the record with @last_finalized_seq or newer. * * Memory barrier involvement: * * If desc_last_finalized_seq:A reads from * desc_update_last_finalized:A, then * prb_next_reserve_seq:A reads from desc_reserve:D. * * Relies on: * * RELEASE from desc_reserve:D to desc_update_last_finalized:A * matching * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A * * Note: desc_reserve:D and desc_update_last_finalized:A can be * different CPUs. However, the desc_update_last_finalized:A CPU * (which performs the release) must have previously seen * desc_read:C, which implies desc_reserve:D can be seen. */ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */ d = to_desc(desc_ring, last_finalized_seq); state_var = &d->state_var; /* Extract the ID, used to specify the descriptor to read. */ last_finalized_id = DESC_ID(atomic_long_read(state_var)); /* Ensure @last_finalized_id is correct. */ err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc); if (err == -EINVAL) { if (last_finalized_seq == 0) { /* * No record has been finalized or even reserved yet. * * The @head_id is initialized such that the first * increment will yield the first record (seq=0). * Handle it separately to avoid a negative @diff * below. */ if (head_id == DESC0_ID(desc_ring->count_bits)) return 0; /* * One or more descriptors are already reserved. Use * the descriptor ID of the first one (@seq=0) for * the @diff below. */ last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1; } else { /* Record must have been overwritten. Try again. */ goto try_again; } } /* Diff of known descriptor IDs to compute related sequence numbers. */ diff = head_id - last_finalized_id; /* * @head_id points to the most recently reserved record, but this * function returns the sequence number that will be assigned to the * next (not yet reserved) record. Thus +1 is needed. */ return (last_finalized_seq + diff + 1); } /* * Non-blocking read of a record. * * On success @seq is updated to the record that was read and (if provided) * @r and @line_count will contain the read/calculated data. * * On failure @seq is updated to a record that is not yet available to the * reader, but it will be the next record available to the reader. * * Note: When the current CPU is in panic, this function will skip over any * non-existent/non-finalized records in order to allow the panic CPU * to print any and all records that have been finalized. */ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, struct printk_record *r, unsigned int *line_count) { u64 tail_seq; int err; while ((err = prb_read(rb, *seq, r, line_count))) { tail_seq = prb_first_seq(rb); if (*seq < tail_seq) { /* * Behind the tail. Catch up and try again. This * can happen for -ENOENT and -EINVAL cases. */ *seq = tail_seq; } else if (err == -ENOENT) { /* Record exists, but the data was lost. Skip. */ (*seq)++; } else { /* * Non-existent/non-finalized record. Must stop. * * For panic situations it cannot be expected that * non-finalized records will become finalized. But * there may be other finalized records beyond that * need to be printed for a panic situation. If this * is the panic CPU, skip this * non-existent/non-finalized record unless it is * at or beyond the head, in which case it is not * possible to continue. * * Note that new messages printed on panic CPU are * finalized when we are here. The only exception * might be the last message without trailing newline. * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ if (this_cpu_in_panic() && ((*seq + 1) < prb_next_reserve_seq(rb))) (*seq)++; else return false; } } return true; } /** * prb_read_valid() - Non-blocking read of a requested record or (if gone) * the next available record. * * @rb: The ringbuffer to read from. * @seq: The sequence number of the record to read. * @r: A record data buffer to store the read record to. * * This is the public function available to readers to read a record. * * The reader provides the @info and @text_buf buffers of @r to be * filled in. Any of the buffer pointers can be set to NULL if the reader * is not interested in that data. To ensure proper initialization of @r, * prb_rec_init_rd() should be used. * * Context: Any context. * Return: true if a record was read, otherwise false. * * On success, the reader must check r->info.seq to see which record was * actually read. This allows the reader to detect dropped records. * * Failure means @seq refers to a record not yet available to the reader. */ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r) { return _prb_read_valid(rb, &seq, r, NULL); } /** * prb_read_valid_info() - Non-blocking read of meta data for a requested * record or (if gone) the next available record. * * @rb: The ringbuffer to read from. * @seq: The sequence number of the record to read. * @info: A buffer to store the read record meta data to. * @line_count: A buffer to store the number of lines in the record text. * * This is the public function available to readers to read only the * meta data of a record. * * The reader provides the @info, @line_count buffers to be filled in. * Either of the buffer pointers can be set to NULL if the reader is not * interested in that data. * * Context: Any context. * Return: true if a record's meta data was read, otherwise false. * * On success, the reader must check info->seq to see which record meta data * was actually read. This allows the reader to detect dropped records. * * Failure means @seq refers to a record not yet available to the reader. */ bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, struct printk_info *info, unsigned int *line_count) { struct printk_record r; prb_rec_init_rd(&r, info, NULL, 0); return _prb_read_valid(rb, &seq, &r, line_count); } /** * prb_first_valid_seq() - Get the sequence number of the oldest available * record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what the * first/oldest valid sequence number is. * * This provides readers a starting point to begin iterating the ringbuffer. * * Context: Any context. * Return: The sequence number of the first/oldest record or, if the * ringbuffer is empty, 0 is returned. */ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) { u64 seq = 0; if (!_prb_read_valid(rb, &seq, NULL, NULL)) return 0; return seq; } /** * prb_next_seq() - Get the sequence number after the last available record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what the next * newest sequence number available to readers will be. * * This provides readers a sequence number to jump to if all currently * available records should be skipped. It is guaranteed that all records * previous to the returned value have been finalized and are (or were) * available to the reader. * * Context: Any context. * Return: The sequence number of the next newest (not yet available) record * for readers. */ u64 prb_next_seq(struct printk_ringbuffer *rb) { u64 seq; seq = desc_last_finalized_seq(rb); /* * Begin searching after the last finalized record. * * On 0, the search must begin at 0 because of hack#2 * of the bootstrapping phase it is not known if a * record at index 0 exists. */ if (seq != 0) seq++; /* * The information about the last finalized @seq might be inaccurate. * Search forward to find the current one. */ while (_prb_read_valid(rb, &seq, NULL, NULL)) seq++; return seq; } /** * prb_init() - Initialize a ringbuffer to use provided external buffers. * * @rb: The ringbuffer to initialize. * @text_buf: The data buffer for text data. * @textbits: The size of @text_buf as a power-of-2 value. * @descs: The descriptor buffer for ringbuffer records. * @descbits: The count of @descs items as a power-of-2 value. * @infos: The printk_info buffer for ringbuffer records. * * This is the public function available to writers to setup a ringbuffer * during runtime using provided buffers. * * This must match the initialization of DEFINE_PRINTKRB(). * * Context: Any context. */ void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int textbits, struct prb_desc *descs, unsigned int descbits, struct printk_info *infos) { memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0])); memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0])); rb->desc_ring.count_bits = descbits; rb->desc_ring.descs = descs; rb->desc_ring.infos = infos; atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.last_finalized_seq, 0); rb->text_data_ring.size_bits = textbits; rb->text_data_ring.data = text_buf; atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits)); atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits)); atomic_long_set(&rb->fail, 0); atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; infos[0].seq = -(u64)_DESCS_COUNT(descbits); infos[_DESCS_COUNT(descbits) - 1].seq = 0; } /** * prb_record_text_space() - Query the full actual used ringbuffer space for * the text data of a reserved entry. * * @e: The successfully reserved entry to query. * * This is the public function available to writers to see how much actual * space is used in the ringbuffer to store the text data of the specified * entry. * * This function is only valid if @e has been successfully reserved using * prb_reserve(). * * Context: Any context. * Return: The size in bytes used by the text data of the associated record. */ unsigned int prb_record_text_space(struct prb_reserved_entry *e) { return e->text_space; }
31 31 98 285 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions implement sctp stream message interleaving, mostly * including I-DATA and I-FORWARD-TSN chunks process. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <net/busy_poll.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/ulpevent.h> #include <linux/sctp.h> static struct sctp_chunk *sctp_make_idatafrag_empty( const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_idatahdr dp; memset(&dp, 0, sizeof(dp)); dp.stream = htons(sinfo->sinfo_stream); if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; retval = sctp_make_idata(asoc, flags, sizeof(dp) + len, gfp); if (!retval) return NULL; retval->subh.idata_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); return retval; } static void sctp_chunk_assign_mid(struct sctp_chunk *chunk) { struct sctp_stream *stream; struct sctp_chunk *lchunk; __u32 cfsn = 0; __u16 sid; if (chunk->has_mid) return; sid = sctp_chunk_stream_no(chunk); stream = &chunk->asoc->stream; list_for_each_entry(lchunk, &chunk->msg->chunks, frag_list) { struct sctp_idatahdr *hdr; __u32 mid; lchunk->has_mid = 1; hdr = lchunk->subh.idata_hdr; if (lchunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) hdr->ppid = lchunk->sinfo.sinfo_ppid; else hdr->fsn = htonl(cfsn++); if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? sctp_mid_uo_next(stream, out, sid) : sctp_mid_uo_peek(stream, out, sid); } else { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? sctp_mid_next(stream, out, sid) : sctp_mid_peek(stream, out, sid); } hdr->mid = htonl(mid); } } static bool sctp_validate_data(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u16 sid, ssn; if (chunk->chunk_hdr->type != SCTP_CID_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); ssn = ntohs(chunk->subh.data_hdr->ssn); return !SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)); } static bool sctp_validate_idata(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u32 mid; __u16 sid; if (chunk->chunk_hdr->type != SCTP_CID_I_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); mid = ntohl(chunk->subh.idata_hdr->mid); return !MID_lt(mid, sctp_mid_peek(stream, in, sid)); } static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->reasm); if (!pos) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) { loc = pos; break; } if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) { loc = pos; break; } } if (!loc) __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); else __skb_queue_before(&ulpq->reasm, loc, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream || cevent->mid != sin->mid) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode = 0; } } return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (cevent->mid == sin->mid) { pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, pd_first, pd_last); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode && event->mid == sin->mid && event->fsn == sin->fsn) retval = sctp_intl_retrieve_partial(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled(ulpq, event); return retval; } static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->lobby); if (!pos) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } cevent = (struct sctp_ulpevent *)pos->cb; if (event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } if (event->stream > cevent->stream) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->lobby, pos) { cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > event->stream) { loc = pos; break; } if (cevent->stream == event->stream && MID_lt(event->mid, cevent->mid)) { loc = pos; break; } } if (!loc) __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); else __skb_queue_before(&ulpq->lobby, loc, sctp_event2skb(event)); } static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head *event_list; struct sctp_stream *stream; struct sk_buff *pos, *tmp; __u16 sid = event->stream; stream = &ulpq->asoc->stream; event_list = (struct sk_buff_head *)sctp_event2skb(event)->prev; sctp_skb_for_each(pos, &ulpq->lobby, tmp) { struct sctp_ulpevent *cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > sid) break; if (cevent->stream < sid) continue; if (cevent->mid != sctp_mid_peek(stream, in, sid)) break; sctp_mid_next(stream, in, sid); __skb_unlink(pos, &ulpq->lobby); __skb_queue_tail(event_list, pos); } } static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_stream *stream; __u16 sid; stream = &ulpq->asoc->stream; sid = event->stream; if (event->mid != sctp_mid_peek(stream, in, sid)) { sctp_intl_store_ordered(ulpq, event); return NULL; } sctp_mid_next(stream, in, sid); sctp_intl_retrieve_ordered(ulpq, event); return event; } static int sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); struct sctp_ulpevent *event; struct sk_buff *skb; skb = __skb_peek(skb_list); event = sctp_skb2event(skb); if (sk->sk_shutdown & RCV_SHUTDOWN && (sk->sk_shutdown & SEND_SHUTDOWN || !sctp_ulpevent_is_notification(event))) goto out_free; if (!sctp_ulpevent_is_notification(event)) { sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; skb_queue_splice_tail_init(skb_list, &sk->sk_receive_queue); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; out_free: sctp_queue_purge_ulpevents(skb_list); return 0; } static void sctp_intl_store_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos; pos = skb_peek_tail(&ulpq->reasm_uo); if (!pos) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } skb_queue_walk(&ulpq->reasm_uo, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) break; if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) break; } __skb_queue_before(&ulpq->reasm_uo, pos, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, sin->mid_uo)) continue; if (MID_lt(sin->mid_uo, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode_uo = 0; } } return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (!sin->pd_mode_uo) { sin->mid_uo = cevent->mid; pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, pd_first, pd_last); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm_uo(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode_uo && event->mid == sin->mid_uo && event->fsn == sin->fsn_uo) retval = sctp_intl_retrieve_partial_uo(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled_uo(ulpq, event); return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode_uo) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; sin->mid_uo = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid_uo && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } return retval; } static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; int event_eor = 0; event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); if (!event) return -ENOMEM; event->mid = ntohl(chunk->subh.idata_hdr->mid); if (event->msg_flags & SCTP_DATA_FIRST_FRAG) event->ppid = chunk->subh.idata_hdr->ppid; else event->fsn = ntohl(chunk->subh.idata_hdr->fsn); if (!(event->msg_flags & SCTP_DATA_UNORDERED)) { event = sctp_intl_reasm(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); if (event->msg_flags & MSG_EOR) event = sctp_intl_order(ulpq, event); } } else { event = sctp_intl_reasm_uo(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); } } if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0; sctp_enqueue_event(ulpq, &temp); } return event_eor; } static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; if (cevent->mid == csin->mid) { first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; } break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } return retval; } static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; if (!skb_queue_empty(&ulpq->reasm)) { do { event = sctp_intl_retrieve_first(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } if (!skb_queue_empty(&ulpq->reasm_uo)) { do { event = sctp_intl_retrieve_first_uo(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } } static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc = ulpq->asoc; __u32 freed = 0; __u16 needed; needed = ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_idata_chunk); if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm_uo, needed); } if (freed >= needed && sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0) sctp_intl_start_pd(ulpq, gfp); } static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u16 flags, gfp_t gfp) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_ulpevent *ev = NULL; if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, SCTP_PARTIAL_DELIVERY_EVENT)) return; ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, sid, mid, flags, gfp); if (ev) { struct sctp_sock *sp = sctp_sk(sk); __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } } } static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) { struct sctp_stream *stream = &ulpq->asoc->stream; struct sctp_ulpevent *cevent, *event = NULL; struct sk_buff_head *lobby = &ulpq->lobby; struct sk_buff *pos, *tmp; struct sk_buff_head temp; __u16 csid; __u32 cmid; skb_queue_head_init(&temp); sctp_skb_for_each(pos, lobby, tmp) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid > sid) break; if (csid < sid) continue; if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid))) break; __skb_unlink(pos, lobby); if (!event) event = sctp_skb2event(pos); __skb_queue_tail(&temp, pos); } if (!event && pos != (struct sk_buff *)lobby) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) { sctp_mid_next(stream, in, csid); __skb_unlink(pos, lobby); __skb_queue_tail(&temp, pos); event = sctp_skb2event(pos); } } if (event) { sctp_intl_retrieve_ordered(ulpq, event); sctp_enqueue_event(ulpq, &temp); } } static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_stream *stream = &ulpq->asoc->stream; __u16 sid; for (sid = 0; sid < stream->incnt; sid++) { struct sctp_stream_in *sin = SCTP_SI(stream, sid); __u32 mid; if (sin->pd_mode_uo) { sin->pd_mode_uo = 0; mid = sin->mid_uo; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, gfp); } if (sin->pd_mode) { sin->pd_mode = 0; mid = sin->mid; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp); sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } } /* intl abort pd happens only when all data needs to be cleaned */ sctp_ulpq_flush(ulpq); } static inline int sctp_get_skip_pos(struct sctp_ifwdtsn_skip *skiplist, int nskips, __be16 stream, __u8 flags) { int i; for (i = 0; i < nskips; i++) if (skiplist[i].stream == stream && skiplist[i].flags == flags) return i; return i; } #define SCTP_FTSN_U_BIT 0x1 static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) { struct sctp_ifwdtsn_skip ftsn_skip_arr[10]; struct sctp_association *asoc = q->asoc; struct sctp_chunk *ftsn_chunk = NULL; struct list_head *lchunk, *temp; int nskips = 0, skip_pos; struct sctp_chunk *chunk; __u32 tsn; if (!asoc->peer.prsctp_capable) return; if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) asoc->adv_peer_ack_point = ctsn; list_for_each_safe(lchunk, temp, &q->abandoned) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(chunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(lchunk); sctp_chunk_free(chunk); } else if (TSN_lte(tsn, asoc->adv_peer_ack_point + 1)) { __be16 sid = chunk->subh.idata_hdr->stream; __be32 mid = chunk->subh.idata_hdr->mid; __u8 flags = 0; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) flags |= SCTP_FTSN_U_BIT; asoc->adv_peer_ack_point = tsn; skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips, sid, flags); ftsn_skip_arr[skip_pos].stream = sid; ftsn_skip_arr[skip_pos].reserved = 0; ftsn_skip_arr[skip_pos].flags = flags; ftsn_skip_arr[skip_pos].mid = mid; if (skip_pos == nskips) nskips++; if (nskips == 10) break; } else { break; } } if (asoc->adv_peer_ack_point > ctsn) ftsn_chunk = sctp_make_ifwdtsn(asoc, asoc->adv_peer_ack_point, nskips, &ftsn_skip_arr[0]); if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } } #define _sctp_walk_ifwdtsn(pos, chunk, end) \ for (pos = (void *)(chunk->subh.ifwdtsn_hdr + 1); \ (void *)pos <= (void *)(chunk->subh.ifwdtsn_hdr + 1) + (end) - \ sizeof(struct sctp_ifwdtsn_skip); pos++) #define sctp_walk_ifwdtsn(pos, ch) \ _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \ sizeof(struct sctp_ifwdtsn_chunk)) static bool sctp_validate_fwdtsn(struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_fwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } static bool sctp_validate_iftsn(struct sctp_chunk *chunk) { struct sctp_ifwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_I_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_ifwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } static void sctp_report_fwdtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulattive TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_ulpq_reasm_flushtsn(ulpq, ftsn); /* Abort any in progress partial delivery. */ sctp_ulpq_abort_pd(ulpq, GFP_ATOMIC); } static void sctp_intl_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { struct sk_buff *pos, *tmp; skb_queue_walk_safe(&ulpq->reasm, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm); sctp_ulpevent_free(event); } } skb_queue_walk_safe(&ulpq->reasm_uo, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm_uo); sctp_ulpevent_free(event); } } } static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulattive TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_intl_reasm_flushtsn(ulpq, ftsn); /* abort only when it's for all data */ if (ftsn == sctp_tsnmap_get_max_tsn_seen(&ulpq->asoc->peer.tsn_map)) sctp_intl_abort_pd(ulpq, GFP_ATOMIC); } static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; /* Walk through all the skipped SSNs */ sctp_walk_fwdtsn(skip, chunk) sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); } static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u8 flags) { struct sctp_stream_in *sin = sctp_stream_in(&ulpq->asoc->stream, sid); struct sctp_stream *stream = &ulpq->asoc->stream; if (flags & SCTP_FTSN_U_BIT) { if (sin->pd_mode_uo && MID_lt(sin->mid_uo, mid)) { sin->pd_mode_uo = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, GFP_ATOMIC); } return; } if (MID_lt(mid, sctp_mid_peek(stream, in, sid))) return; if (sin->pd_mode) { sin->pd_mode = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x0, GFP_ATOMIC); } sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_ifwdtsn_skip *skip; /* Walk through all the skipped MIDs and abort stream pd if possible */ sctp_walk_ifwdtsn(skip, chunk) sctp_intl_skip(ulpq, ntohs(skip->stream), ntohl(skip->mid), skip->flags); } static int do_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_ulpq_tail_event(ulpq, &temp); } static struct sctp_stream_interleave sctp_stream_interleave_0 = { .data_chunk_len = sizeof(struct sctp_data_chunk), .ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk), /* DATA process functions */ .make_datafrag = sctp_make_datafrag_empty, .assign_number = sctp_chunk_assign_ssn, .validate_data = sctp_validate_data, .ulpevent_data = sctp_ulpq_tail_data, .enqueue_event = do_ulpq_tail_event, .renege_events = sctp_ulpq_renege, .start_pd = sctp_ulpq_partial_delivery, .abort_pd = sctp_ulpq_abort_pd, /* FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_fwdtsn, .validate_ftsn = sctp_validate_fwdtsn, .report_ftsn = sctp_report_fwdtsn, .handle_ftsn = sctp_handle_fwdtsn, }; static int do_sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_enqueue_event(ulpq, &temp); } static struct sctp_stream_interleave sctp_stream_interleave_1 = { .data_chunk_len = sizeof(struct sctp_idata_chunk), .ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk), /* I-DATA process functions */ .make_datafrag = sctp_make_idatafrag_empty, .assign_number = sctp_chunk_assign_mid, .validate_data = sctp_validate_idata, .ulpevent_data = sctp_ulpevent_idata, .enqueue_event = do_sctp_enqueue_event, .renege_events = sctp_renege_events, .start_pd = sctp_intl_start_pd, .abort_pd = sctp_intl_abort_pd, /* I-FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_iftsn, .validate_ftsn = sctp_validate_iftsn, .report_ftsn = sctp_report_iftsn, .handle_ftsn = sctp_handle_iftsn, }; void sctp_stream_interleave_init(struct sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); stream->si = asoc->peer.intl_capable ? &sctp_stream_interleave_1 : &sctp_stream_interleave_0; }
2319 2326 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 // SPDX-License-Identifier: GPL-2.0 /* * Detect hard and soft lockups on a system * * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. * * Note: Most of this code is borrowed heavily from the original softlockup * detector, so thanks to Ingo for the initial implementation. * Some chunks also taken from the old x86-specific nmi watchdog code, thanks * to those contributors as well. */ #define pr_fmt(fmt) "watchdog: " fmt #include <linux/cpu.h> #include <linux/init.h> #include <linux/irq.h> #include <linux/irqdesc.h> #include <linux/kernel_stat.h> #include <linux/kvm_para.h> #include <linux/math64.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/nmi.h> #include <linux/stop_machine.h> #include <linux/sysctl.h> #include <linux/tick.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/isolation.h> #include <asm/irq_regs.h> static DEFINE_MUTEX(watchdog_mutex); #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HARDLOCKUP_DETECTOR_SPARC64) # define WATCHDOG_HARDLOCKUP_DEFAULT 1 #else # define WATCHDOG_HARDLOCKUP_DEFAULT 0 #endif #define NUM_SAMPLE_PERIODS 5 unsigned long __read_mostly watchdog_enabled; int __read_mostly watchdog_user_enabled = 1; static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT; static int __read_mostly watchdog_softlockup_user_enabled = 1; int __read_mostly watchdog_thresh = 10; static int __read_mostly watchdog_hardlockup_available; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #ifdef CONFIG_HARDLOCKUP_DETECTOR # ifdef CONFIG_SMP int __read_mostly sysctl_hardlockup_all_cpu_backtrace; # endif /* CONFIG_SMP */ /* * Should we panic when a soft-lockup or hard-lockup occurs: */ unsigned int __read_mostly hardlockup_panic = IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these * cases this function can be called to disable hard lockup detection. This * function should only be executed once by the boot processor before the * kernel command line parameters are parsed, because otherwise it is not * possible to override this in hardlockup_panic_setup(). */ void __init hardlockup_detector_disable(void) { watchdog_hardlockup_user_enabled = 0; } static int __init hardlockup_panic_setup(char *str) { next: if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) watchdog_hardlockup_user_enabled = 0; else if (!strncmp(str, "1", 1)) watchdog_hardlockup_user_enabled = 1; else if (!strncmp(str, "r", 1)) hardlockup_config_perf_event(str + 1); while (*(str++)) { if (*str == ',') { str++; goto next; } } return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); #endif /* CONFIG_HARDLOCKUP_DETECTOR */ #if defined(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER) static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts); static DEFINE_PER_CPU(int, hrtimer_interrupts_saved); static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned); static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched); static unsigned long hard_lockup_nmi_warn; notrace void arch_touch_nmi_watchdog(void) { /* * Using __raw here because some code paths have * preemption enabled. If preemption is enabled * then interrupts should be enabled too, in which * case we shouldn't have to worry about the watchdog * going off. */ raw_cpu_write(watchdog_hardlockup_touched, true); } EXPORT_SYMBOL(arch_touch_nmi_watchdog); void watchdog_hardlockup_touch_cpu(unsigned int cpu) { per_cpu(watchdog_hardlockup_touched, cpu) = true; } static bool is_hardlockup(unsigned int cpu) { int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu)); if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) return true; /* * NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE * for hrtimer_interrupts_saved. hrtimer_interrupts_saved is * written/read by a single CPU. */ per_cpu(hrtimer_interrupts_saved, cpu) = hrint; return false; } static void watchdog_hardlockup_kick(void) { int new_interrupts; new_interrupts = atomic_inc_return(this_cpu_ptr(&hrtimer_interrupts)); watchdog_buddy_check_hardlockup(new_interrupts); } void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) { if (per_cpu(watchdog_hardlockup_touched, cpu)) { per_cpu(watchdog_hardlockup_touched, cpu) = false; return; } /* * Check for a hardlockup by making sure the CPU's timer * interrupt is incrementing. The timer interrupt should have * fired multiple times before we overflow'd. If it hasn't * then this is a good indication the cpu is stuck */ if (is_hardlockup(cpu)) { unsigned int this_cpu = smp_processor_id(); unsigned long flags; /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) return; /* * Prevent multiple hard-lockup reports if one cpu is already * engaged in dumping all cpu back traces. */ if (sysctl_hardlockup_all_cpu_backtrace) { if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn)) return; } /* * NOTE: we call printk_cpu_sync_get_irqsave() after printing * the lockup message. While it would be nice to serialize * that printout, we really want to make sure that if some * other CPU somehow locked up while holding the lock associated * with printk_cpu_sync_get_irqsave() that we can still at least * get the message about the lockup out. */ pr_emerg("CPU%u: Watchdog detected hard LOCKUP on cpu %u\n", this_cpu, cpu); printk_cpu_sync_get_irqsave(flags); print_modules(); print_irqtrace_events(current); if (cpu == this_cpu) { if (regs) show_regs(regs); else dump_stack(); printk_cpu_sync_put_irqrestore(flags); } else { printk_cpu_sync_put_irqrestore(flags); trigger_single_cpu_backtrace(cpu); } if (sysctl_hardlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(cpu); if (!hardlockup_panic) clear_bit_unlock(0, &hard_lockup_nmi_warn); } if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); per_cpu(watchdog_hardlockup_warned, cpu) = true; } else { per_cpu(watchdog_hardlockup_warned, cpu) = false; } } #else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */ static inline void watchdog_hardlockup_kick(void) { } #endif /* !CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */ /* * These functions can be overridden based on the configured hardlockdup detector. * * watchdog_hardlockup_enable/disable can be implemented to start and stop when * softlockup watchdog start and stop. The detector must select the * SOFTLOCKUP_DETECTOR Kconfig. */ void __weak watchdog_hardlockup_enable(unsigned int cpu) { } void __weak watchdog_hardlockup_disable(unsigned int cpu) { } /* * Watchdog-detector specific API. * * Return 0 when hardlockup watchdog is available, negative value otherwise. * Note that the negative value means that a delayed probe might * succeed later. */ int __weak __init watchdog_hardlockup_probe(void) { return -ENODEV; } /** * watchdog_hardlockup_stop - Stop the watchdog for reconfiguration * * The reconfiguration steps are: * watchdog_hardlockup_stop(); * update_variables(); * watchdog_hardlockup_start(); */ void __weak watchdog_hardlockup_stop(void) { } /** * watchdog_hardlockup_start - Start the watchdog after reconfiguration * * Counterpart to watchdog_hardlockup_stop(). * * The following variables have been updated in update_variables() and * contain the currently valid configuration: * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask */ void __weak watchdog_hardlockup_start(void) { } /** * lockup_detector_update_enable - Update the sysctl enable bit * * Caller needs to make sure that the hard watchdogs are off, so this * can't race with watchdog_hardlockup_disable(). */ static void lockup_detector_update_enable(void) { watchdog_enabled = 0; if (!watchdog_user_enabled) return; if (watchdog_hardlockup_available && watchdog_hardlockup_user_enabled) watchdog_enabled |= WATCHDOG_HARDLOCKUP_ENABLED; if (watchdog_softlockup_user_enabled) watchdog_enabled |= WATCHDOG_SOFTOCKUP_ENABLED; } #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* * Delay the soflockup report when running a known slow code. * It does _not_ affect the timestamp of the last successdul reschedule. */ #define SOFTLOCKUP_DELAY_REPORT ULONG_MAX #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; #endif static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC); static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; /* Timestamp taken after the last successful reschedule. */ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); /* Timestamp of the last softlockup report. */ static DEFINE_PER_CPU(unsigned long, watchdog_report_ts); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static unsigned long soft_lockup_nmi_warn; static int __init softlockup_panic_setup(char *str) { softlockup_panic = simple_strtoul(str, NULL, 0); return 1; } __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { watchdog_user_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); static int __init nosoftlockup_setup(char *str) { watchdog_softlockup_user_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); static int __init watchdog_thresh_setup(char *str) { get_option(&str, &watchdog_thresh); return 1; } __setup("watchdog_thresh=", watchdog_thresh_setup); static void __lockup_detector_cleanup(void); #ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM enum stats_per_group { STATS_SYSTEM, STATS_SOFTIRQ, STATS_HARDIRQ, STATS_IDLE, NUM_STATS_PER_GROUP, }; static const enum cpu_usage_stat tracked_stats[NUM_STATS_PER_GROUP] = { CPUTIME_SYSTEM, CPUTIME_SOFTIRQ, CPUTIME_IRQ, CPUTIME_IDLE, }; static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]); static DEFINE_PER_CPU(u8, cpustat_util[NUM_SAMPLE_PERIODS][NUM_STATS_PER_GROUP]); static DEFINE_PER_CPU(u8, cpustat_tail); /* * We don't need nanosecond resolution. A granularity of 16ms is * sufficient for our precision, allowing us to use u16 to store * cpustats, which will roll over roughly every ~1000 seconds. * 2^24 ~= 16 * 10^6 */ static u16 get_16bit_precision(u64 data_ns) { return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */ } static void update_cpustat(void) { int i; u8 util; u16 old_stat, new_stat; struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; u8 tail = __this_cpu_read(cpustat_tail); u16 sample_period_16 = get_16bit_precision(sample_period); kcpustat_cpu_fetch(&kcpustat, smp_processor_id()); for (i = 0; i < NUM_STATS_PER_GROUP; i++) { old_stat = __this_cpu_read(cpustat_old[i]); new_stat = get_16bit_precision(cpustat[tracked_stats[i]]); util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16); __this_cpu_write(cpustat_util[tail][i], util); __this_cpu_write(cpustat_old[i], new_stat); } __this_cpu_write(cpustat_tail, (tail + 1) % NUM_SAMPLE_PERIODS); } static void print_cpustat(void) { int i, group; u8 tail = __this_cpu_read(cpustat_tail); u64 sample_period_second = sample_period; do_div(sample_period_second, NSEC_PER_SEC); /* * Outputting the "watchdog" prefix on every line is redundant and not * concise, and the original alarm information is sufficient for * positioning in logs, hence here printk() is used instead of pr_crit(). */ printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n", smp_processor_id(), sample_period_second); for (i = 0; i < NUM_SAMPLE_PERIODS; i++) { group = (tail + i) % NUM_SAMPLE_PERIODS; printk(KERN_CRIT "\t#%d: %3u%% system,\t%3u%% softirq,\t" "%3u%% hardirq,\t%3u%% idle\n", i + 1, __this_cpu_read(cpustat_util[group][STATS_SYSTEM]), __this_cpu_read(cpustat_util[group][STATS_SOFTIRQ]), __this_cpu_read(cpustat_util[group][STATS_HARDIRQ]), __this_cpu_read(cpustat_util[group][STATS_IDLE])); } } #define HARDIRQ_PERCENT_THRESH 50 #define NUM_HARDIRQ_REPORT 5 struct irq_counts { int irq; u32 counts; }; static DEFINE_PER_CPU(bool, snapshot_taken); /* Tabulate the most frequent interrupts. */ static void tabulate_irq_count(struct irq_counts *irq_counts, int irq, u32 counts, int rank) { int i; struct irq_counts new_count = {irq, counts}; for (i = 0; i < rank; i++) { if (counts > irq_counts[i].counts) swap(new_count, irq_counts[i]); } } /* * If the hardirq time exceeds HARDIRQ_PERCENT_THRESH% of the sample_period, * then the cause of softlockup might be interrupt storm. In this case, it * would be useful to start interrupt counting. */ static bool need_counting_irqs(void) { u8 util; int tail = __this_cpu_read(cpustat_tail); tail = (tail + NUM_HARDIRQ_REPORT - 1) % NUM_HARDIRQ_REPORT; util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]); return util > HARDIRQ_PERCENT_THRESH; } static void start_counting_irqs(void) { if (!__this_cpu_read(snapshot_taken)) { kstat_snapshot_irqs(); __this_cpu_write(snapshot_taken, true); } } static void stop_counting_irqs(void) { __this_cpu_write(snapshot_taken, false); } static void print_irq_counts(void) { unsigned int i, count; struct irq_counts irq_counts_sorted[NUM_HARDIRQ_REPORT] = { {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0} }; if (__this_cpu_read(snapshot_taken)) { for_each_active_irq(i) { count = kstat_get_irq_since_snapshot(i); tabulate_irq_count(irq_counts_sorted, i, count, NUM_HARDIRQ_REPORT); } /* * Outputting the "watchdog" prefix on every line is redundant and not * concise, and the original alarm information is sufficient for * positioning in logs, hence here printk() is used instead of pr_crit(). */ printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n", smp_processor_id(), HARDIRQ_PERCENT_THRESH); for (i = 0; i < NUM_HARDIRQ_REPORT; i++) { if (irq_counts_sorted[i].irq == -1) break; printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n", i + 1, irq_counts_sorted[i].counts, irq_counts_sorted[i].irq); } /* * If the hardirq time is less than HARDIRQ_PERCENT_THRESH% in the last * sample_period, then we suspect the interrupt storm might be subsiding. */ if (!need_counting_irqs()) stop_counting_irqs(); } } static void report_cpu_status(void) { print_cpustat(); print_irq_counts(); } #else static inline void update_cpustat(void) { } static inline void report_cpu_status(void) { } static inline bool need_counting_irqs(void) { return false; } static inline void start_counting_irqs(void) { } static inline void stop_counting_irqs(void) { } #endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- * lockups can have false positives under extreme conditions. So we generally * want a higher threshold for soft lockups than for hard lockups. So we couple * the thresholds with a factor: we make the soft threshold twice the amount of * time the hard threshold is. */ static int get_softlockup_thresh(void) { return watchdog_thresh * 2; } /* * Returns seconds, approximately. We don't need nanosecond * resolution, and we don't need to waste time with a big divide when * 2^30ns == 1.074s. */ static unsigned long get_timestamp(void) { return running_clock() >> 30LL; /* 2^30 ~= 10^9 */ } static void set_sample_period(void) { /* * convert watchdog_thresh from seconds to ns * the divide by 5 is to give hrtimer several chances (two * or three with the current relation between the soft * and hard thresholds) to increment before the * hardlockup detector generates a warning */ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / NUM_SAMPLE_PERIODS); watchdog_update_hrtimer_threshold(sample_period); } static void update_report_ts(void) { __this_cpu_write(watchdog_report_ts, get_timestamp()); } /* Commands for resetting the watchdog */ static void update_touch_ts(void) { __this_cpu_write(watchdog_touch_ts, get_timestamp()); update_report_ts(); } /** * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls * * Call when the scheduler may have stalled for legitimate reasons * preventing the watchdog task from executing - e.g. the scheduler * entering idle state. This should only be used for scheduler events. * Use touch_softlockup_watchdog() for everything else. */ notrace void touch_softlockup_watchdog_sched(void) { /* * Preemption can be enabled. It doesn't matter which CPU's watchdog * report period gets restarted here, so use the raw_ operation. */ raw_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT); } notrace void touch_softlockup_watchdog(void) { touch_softlockup_watchdog_sched(); wq_watchdog_touch(raw_smp_processor_id()); } EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) { int cpu; /* * watchdog_mutex cannpt be taken here, as this might be called * from (soft)interrupt context, so the access to * watchdog_allowed_cpumask might race with a concurrent update. * * The watchdog time stamp can race against a concurrent real * update as well, the only side effect might be a cycle delay for * the softlockup check. */ for_each_cpu(cpu, &watchdog_allowed_mask) { per_cpu(watchdog_report_ts, cpu) = SOFTLOCKUP_DELAY_REPORT; wq_watchdog_touch(cpu); } } void touch_softlockup_watchdog_sync(void) { __this_cpu_write(softlockup_touch_sync, true); __this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT); } static int is_softlockup(unsigned long touch_ts, unsigned long period_ts, unsigned long now) { if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) { /* * If period_ts has not been updated during a sample_period, then * in the subsequent few sample_periods, period_ts might also not * be updated, which could indicate a potential softlockup. In * this case, if we suspect the cause of the potential softlockup * might be interrupt storm, then we need to count the interrupts * to find which interrupt is storming. */ if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS) && need_counting_irqs()) start_counting_irqs(); /* * A poorly behaving BPF scheduler can live-lock the system into * soft lockups. Tell sched_ext to try ejecting the BPF * scheduler when close to a soft lockup. */ if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4)) scx_softlockup(now - touch_ts); /* Warn about unreasonable delays. */ if (time_after(now, period_ts + get_softlockup_thresh())) return now - touch_ts; } return 0; } /* watchdog detector functions */ static DEFINE_PER_CPU(struct completion, softlockup_completion); static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); /* * The watchdog feed function - touches the timestamp. * * It only runs once every sample_period seconds (4 seconds by * default) to reset the softlockup timestamp. If this gets delayed * for more than 2*watchdog_thresh seconds then the debug-printout * triggers in watchdog_timer_fn(). */ static int softlockup_fn(void *data) { update_touch_ts(); stop_counting_irqs(); complete(this_cpu_ptr(&softlockup_completion)); return 0; } /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; unsigned long flags; if (!watchdog_enabled) return HRTIMER_NORESTART; watchdog_hardlockup_kick(); /* kick the softlockup detector */ if (completion_done(this_cpu_ptr(&softlockup_completion))) { reinit_completion(this_cpu_ptr(&softlockup_completion)); stop_one_cpu_nowait(smp_processor_id(), softlockup_fn, NULL, this_cpu_ptr(&softlockup_stop_work)); } /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); /* * Read the current timestamp first. It might become invalid anytime * when a virtual machine is stopped by the host or when the watchog * is touched from NMI. */ now = get_timestamp(); /* * If a virtual machine is stopped by the host it can look to * the watchdog like a soft lockup. This function touches the watchdog. */ kvm_check_and_clear_guest_paused(); /* * The stored timestamp is comparable with @now only when not touched. * It might get touched anytime from NMI. Make sure that is_softlockup() * uses the same (valid) value. */ period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts)); update_cpustat(); /* Reset the interval when touched by known problematic code. */ if (period_ts == SOFTLOCKUP_DELAY_REPORT) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { /* * If the time stamp was touched atomically * make sure the scheduler tick is up to date. */ __this_cpu_write(softlockup_touch_sync, false); sched_clock_tick(); } update_report_ts(); return HRTIMER_RESTART; } /* Check for a softlockup. */ touch_ts = __this_cpu_read(watchdog_touch_ts); duration = is_softlockup(touch_ts, period_ts, now); if (unlikely(duration)) { /* * Prevent multiple soft-lockup reports if one cpu is already * engaged in dumping all cpu back traces. */ if (softlockup_all_cpu_backtrace) { if (test_and_set_bit_lock(0, &soft_lockup_nmi_warn)) return HRTIMER_RESTART; } /* Start period for the next softlockup warning. */ update_report_ts(); printk_cpu_sync_get_irqsave(flags); pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); report_cpu_status(); print_modules(); print_irqtrace_events(current); if (regs) show_regs(regs); else dump_stack(); printk_cpu_sync_put_irqrestore(flags); if (softlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(smp_processor_id()); if (!softlockup_panic) clear_bit_unlock(0, &soft_lockup_nmi_warn); } add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); } return HRTIMER_RESTART; } static void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); struct completion *done = this_cpu_ptr(&softlockup_completion); WARN_ON_ONCE(cpu != smp_processor_id()); init_completion(done); complete(done); /* * Start the timer first to prevent the hardlockup watchdog triggering * before the timer has a chance to fire. */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer->function = watchdog_timer_fn; hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED_HARD); /* Initialize timestamp */ update_touch_ts(); /* Enable the hardlockup detector */ if (watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED) watchdog_hardlockup_enable(cpu); } static void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); WARN_ON_ONCE(cpu != smp_processor_id()); /* * Disable the hardlockup detector first. That prevents that a large * delay between disabling the timer and disabling the hardlockup * detector causes a false positive. */ watchdog_hardlockup_disable(cpu); hrtimer_cancel(hrtimer); wait_for_completion(this_cpu_ptr(&softlockup_completion)); } static int softlockup_stop_fn(void *data) { watchdog_disable(smp_processor_id()); return 0; } static void softlockup_stop_all(void) { int cpu; if (!softlockup_initialized) return; for_each_cpu(cpu, &watchdog_allowed_mask) smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false); cpumask_clear(&watchdog_allowed_mask); } static int softlockup_start_fn(void *data) { watchdog_enable(smp_processor_id()); return 0; } static void softlockup_start_all(void) { int cpu; cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); for_each_cpu(cpu, &watchdog_allowed_mask) smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false); } int lockup_detector_online_cpu(unsigned int cpu) { if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) watchdog_enable(cpu); return 0; } int lockup_detector_offline_cpu(unsigned int cpu) { if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) watchdog_disable(cpu); return 0; } static void __lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_hardlockup_stop(); softlockup_stop_all(); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) softlockup_start_all(); watchdog_hardlockup_start(); cpus_read_unlock(); /* * Must be called outside the cpus locked section to prevent * recursive locking in the perf code. */ __lockup_detector_cleanup(); } void lockup_detector_reconfigure(void) { mutex_lock(&watchdog_mutex); __lockup_detector_reconfigure(); mutex_unlock(&watchdog_mutex); } /* * Create the watchdog infrastructure and configure the detector(s). */ static __init void lockup_detector_setup(void) { /* * If sysctl is off and watchdog got disabled on the command line, * nothing to do here. */ lockup_detector_update_enable(); if (!IS_ENABLED(CONFIG_SYSCTL) && !(watchdog_enabled && watchdog_thresh)) return; mutex_lock(&watchdog_mutex); __lockup_detector_reconfigure(); softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_hardlockup_stop(); lockup_detector_update_enable(); watchdog_hardlockup_start(); cpus_read_unlock(); } void lockup_detector_reconfigure(void) { __lockup_detector_reconfigure(); } static inline void lockup_detector_setup(void) { __lockup_detector_reconfigure(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_cleanup(void) { lockdep_assert_held(&watchdog_mutex); hardlockup_detector_perf_cleanup(); } /** * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes * * Caller must not hold the cpu hotplug rwsem. */ void lockup_detector_cleanup(void) { mutex_lock(&watchdog_mutex); __lockup_detector_cleanup(); mutex_unlock(&watchdog_mutex); } /** * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) * * Special interface for parisc. It prevents lockup detector warnings from * the default pm_poweroff() function which busy loops forever. */ void lockup_detector_soft_poweroff(void) { watchdog_enabled = 0; } #ifdef CONFIG_SYSCTL /* Propagate any changes to the watchdog infrastructure */ static void proc_watchdog_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); __lockup_detector_reconfigure(); } /* * common function for watchdog, nmi_watchdog and soft_watchdog parameter * * caller | table->data points to | 'which' * -------------------|----------------------------------|------------------------------- * proc_watchdog | watchdog_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED | * | | WATCHDOG_SOFTOCKUP_ENABLED * -------------------|----------------------------------|------------------------------- * proc_nmi_watchdog | watchdog_hardlockup_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED * -------------------|----------------------------------|------------------------------- * proc_soft_watchdog | watchdog_softlockup_user_enabled | WATCHDOG_SOFTOCKUP_ENABLED */ static int proc_watchdog_common(int which, const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int err, old, *param = table->data; mutex_lock(&watchdog_mutex); old = *param; if (!write) { /* * On read synchronize the userspace interface. This is a * racy snapshot. */ *param = (watchdog_enabled & which) != 0; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); *param = old; } else { err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!err && old != READ_ONCE(*param)) proc_watchdog_update(); } mutex_unlock(&watchdog_mutex); return err; } /* * /proc/sys/kernel/watchdog */ static int proc_watchdog(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED | WATCHDOG_SOFTOCKUP_ENABLED, table, write, buffer, lenp, ppos); } /* * /proc/sys/kernel/nmi_watchdog */ static int proc_nmi_watchdog(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!watchdog_hardlockup_available && write) return -ENOTSUPP; return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED, table, write, buffer, lenp, ppos); } #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* * /proc/sys/kernel/soft_watchdog */ static int proc_soft_watchdog(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED, table, write, buffer, lenp, ppos); } #endif /* * /proc/sys/kernel/watchdog_thresh */ static int proc_watchdog_thresh(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int err, old; mutex_lock(&watchdog_mutex); old = READ_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!err && write && old != READ_ONCE(watchdog_thresh)) proc_watchdog_update(); mutex_unlock(&watchdog_mutex); return err; } /* * The cpumask is the mask of possible cpus that the watchdog can run * on, not the mask of cpus it is actually running on. This allows the * user to specify a mask that will include cpus that have not yet * been brought online, if desired. */ static int proc_watchdog_cpumask(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int err; mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) proc_watchdog_update(); mutex_unlock(&watchdog_mutex); return err; } static const int sixty = 60; static struct ctl_table watchdog_sysctls[] = { { .procname = "watchdog", .data = &watchdog_user_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "watchdog_thresh", .data = &watchdog_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog_thresh, .extra1 = SYSCTL_ZERO, .extra2 = (void *)&sixty, }, { .procname = "watchdog_cpumask", .data = &watchdog_cpumask_bits, .maxlen = NR_CPUS, .mode = 0644, .proc_handler = proc_watchdog_cpumask, }, #ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", .data = &watchdog_softlockup_user_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_soft_watchdog, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "softlockup_panic", .data = &softlockup_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_SMP { .procname = "softlockup_all_cpu_backtrace", .data = &sysctl_softlockup_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ #endif #ifdef CONFIG_HARDLOCKUP_DETECTOR { .procname = "hardlockup_panic", .data = &hardlockup_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", .data = &sysctl_hardlockup_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ #endif }; static struct ctl_table watchdog_hardlockup_sysctl[] = { { .procname = "nmi_watchdog", .data = &watchdog_hardlockup_user_enabled, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_nmi_watchdog, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static void __init watchdog_sysctl_init(void) { register_sysctl_init("kernel", watchdog_sysctls); if (watchdog_hardlockup_available) watchdog_hardlockup_sysctl[0].mode = 0644; register_sysctl_init("kernel", watchdog_hardlockup_sysctl); } #else #define watchdog_sysctl_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static void __init lockup_detector_delay_init(struct work_struct *work); static bool allow_lockup_detector_init_retry __initdata; static struct work_struct detector_work __initdata = __WORK_INITIALIZER(detector_work, lockup_detector_delay_init); static void __init lockup_detector_delay_init(struct work_struct *work) { int ret; ret = watchdog_hardlockup_probe(); if (ret) { if (ret == -ENODEV) pr_info("NMI not fully supported\n"); else pr_info("Delayed init of the lockup detector failed: %d\n", ret); pr_info("Hard watchdog permanently disabled\n"); return; } allow_lockup_detector_init_retry = false; watchdog_hardlockup_available = true; lockup_detector_setup(); } /* * lockup_detector_retry_init - retry init lockup detector if possible. * * Retry hardlockup detector init. It is useful when it requires some * functionality that has to be initialized later on a particular * platform. */ void __init lockup_detector_retry_init(void) { /* Must be called before late init calls */ if (!allow_lockup_detector_init_retry) return; schedule_work(&detector_work); } /* * Ensure that optional delayed hardlockup init is proceed before * the init code and memory is freed. */ static int __init lockup_detector_check(void) { /* Prevent any later retry. */ allow_lockup_detector_init_retry = false; /* Make sure no work is pending. */ flush_work(&detector_work); watchdog_sysctl_init(); return 0; } late_initcall_sync(lockup_detector_check); void __init lockup_detector_init(void) { if (tick_nohz_full_enabled()) pr_info("Disabling watchdog on nohz_full cores by default\n"); cpumask_copy(&watchdog_cpumask, housekeeping_cpumask(HK_TYPE_TIMER)); if (!watchdog_hardlockup_probe()) watchdog_hardlockup_available = true; else allow_lockup_detector_init_retry = true; lockup_detector_setup(); }
2414 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM printk #if !defined(_TRACE_PRINTK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PRINTK_H #include <linux/tracepoint.h> TRACE_EVENT(console, TP_PROTO(const char *text, size_t len), TP_ARGS(text, len), TP_STRUCT__entry( __dynamic_array(char, msg, len + 1) ), TP_fast_assign( /* * Each trace entry is printed in a new line. * If the msg finishes with '\n', cut it off * to avoid blank lines in the trace. */ if ((len > 0) && (text[len-1] == '\n')) len -= 1; memcpy(__get_str(msg), text, len); __get_str(msg)[len] = 0; ), TP_printk("%s", __get_str(msg)) ); #endif /* _TRACE_PRINTK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
4 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DCCP_H #define _LINUX_DCCP_H #include <linux/in.h> #include <linux/interrupt.h> #include <linux/ktime.h> #include <linux/list.h> #include <linux/uio.h> #include <linux/workqueue.h> #include <net/inet_connection_sock.h> #include <net/inet_sock.h> #include <net/inet_timewait_sock.h> #include <net/tcp_states.h> #include <uapi/linux/dccp.h> enum dccp_state { DCCP_OPEN = TCP_ESTABLISHED, DCCP_REQUESTING = TCP_SYN_SENT, DCCP_LISTEN = TCP_LISTEN, DCCP_RESPOND = TCP_SYN_RECV, /* * States involved in closing a DCCP connection: * 1) ACTIVE_CLOSEREQ is entered by a server sending a CloseReq. * * 2) CLOSING can have three different meanings (RFC 4340, 8.3): * a. Client has performed active-close, has sent a Close to the server * from state OPEN or PARTOPEN, and is waiting for the final Reset * (in this case, SOCK_DONE == 1). * b. Client is asked to perform passive-close, by receiving a CloseReq * in (PART)OPEN state. It sends a Close and waits for final Reset * (in this case, SOCK_DONE == 0). * c. Server performs an active-close as in (a), keeps TIMEWAIT state. * * 3) The following intermediate states are employed to give passively * closing nodes a chance to process their unread data: * - PASSIVE_CLOSE (from OPEN => CLOSED) and * - PASSIVE_CLOSEREQ (from (PART)OPEN to CLOSING; case (b) above). */ DCCP_ACTIVE_CLOSEREQ = TCP_FIN_WAIT1, DCCP_PASSIVE_CLOSE = TCP_CLOSE_WAIT, /* any node receiving a Close */ DCCP_CLOSING = TCP_CLOSING, DCCP_TIME_WAIT = TCP_TIME_WAIT, DCCP_CLOSED = TCP_CLOSE, DCCP_NEW_SYN_RECV = TCP_NEW_SYN_RECV, DCCP_PARTOPEN = TCP_MAX_STATES, DCCP_PASSIVE_CLOSEREQ, /* clients receiving CloseReq */ DCCP_MAX_STATES }; enum { DCCPF_OPEN = TCPF_ESTABLISHED, DCCPF_REQUESTING = TCPF_SYN_SENT, DCCPF_LISTEN = TCPF_LISTEN, DCCPF_RESPOND = TCPF_SYN_RECV, DCCPF_ACTIVE_CLOSEREQ = TCPF_FIN_WAIT1, DCCPF_CLOSING = TCPF_CLOSING, DCCPF_TIME_WAIT = TCPF_TIME_WAIT, DCCPF_CLOSED = TCPF_CLOSE, DCCPF_NEW_SYN_RECV = TCPF_NEW_SYN_RECV, DCCPF_PARTOPEN = (1 << DCCP_PARTOPEN), }; static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb) { return (struct dccp_hdr *)skb_transport_header(skb); } static inline struct dccp_hdr *dccp_zeroed_hdr(struct sk_buff *skb, int headlen) { skb_push(skb, headlen); skb_reset_transport_header(skb); return memset(skb_transport_header(skb), 0, headlen); } static inline struct dccp_hdr_ext *dccp_hdrx(const struct dccp_hdr *dh) { return (struct dccp_hdr_ext *)((unsigned char *)dh + sizeof(*dh)); } static inline unsigned int __dccp_basic_hdr_len(const struct dccp_hdr *dh) { return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0); } static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); return __dccp_basic_hdr_len(dh); } static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh) { __u64 seq_nr = ntohs(dh->dccph_seq); if (dh->dccph_x != 0) seq_nr = (seq_nr << 32) + ntohl(dccp_hdrx(dh)->dccph_seq_low); else seq_nr += (u32)dh->dccph_seq2 << 16; return seq_nr; } static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb) { return (struct dccp_hdr_request *)(skb_transport_header(skb) + dccp_basic_hdr_len(skb)); } static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb) { return (struct dccp_hdr_ack_bits *)(skb_transport_header(skb) + dccp_basic_hdr_len(skb)); } static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb) { const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb); return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + ntohl(dhack->dccph_ack_nr_low); } static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb) { return (struct dccp_hdr_response *)(skb_transport_header(skb) + dccp_basic_hdr_len(skb)); } static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb) { return (struct dccp_hdr_reset *)(skb_transport_header(skb) + dccp_basic_hdr_len(skb)); } static inline unsigned int __dccp_hdr_len(const struct dccp_hdr *dh) { return __dccp_basic_hdr_len(dh) + dccp_packet_hdr_len(dh->dccph_type); } static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) { return __dccp_hdr_len(dccp_hdr(skb)); } /** * struct dccp_request_sock - represent DCCP-specific connection request * @dreq_inet_rsk: structure inherited from * @dreq_iss: initial sequence number, sent on the first Response (RFC 4340, 7.1) * @dreq_gss: greatest sequence number sent (for retransmitted Responses) * @dreq_isr: initial sequence number received in the first Request * @dreq_gsr: greatest sequence number received (for retransmitted Request(s)) * @dreq_service: service code present on the Request (there is just one) * @dreq_featneg: feature negotiation options for this connection * The following two fields are analogous to the ones in dccp_sock: * @dreq_timestamp_echo: last received timestamp to echo (13.1) * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo */ struct dccp_request_sock { struct inet_request_sock dreq_inet_rsk; __u64 dreq_iss; __u64 dreq_gss; __u64 dreq_isr; __u64 dreq_gsr; __be32 dreq_service; spinlock_t dreq_lock; struct list_head dreq_featneg; __u32 dreq_timestamp_echo; __u32 dreq_timestamp_time; }; static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req) { return (struct dccp_request_sock *)req; } extern struct inet_timewait_death_row dccp_death_row; extern int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, struct sk_buff *skb); struct dccp_options_received { u64 dccpor_ndp:48; u32 dccpor_timestamp; u32 dccpor_timestamp_echo; u32 dccpor_elapsed_time; }; struct ccid; enum dccp_role { DCCP_ROLE_UNDEFINED, DCCP_ROLE_LISTEN, DCCP_ROLE_CLIENT, DCCP_ROLE_SERVER, }; struct dccp_service_list { __u32 dccpsl_nr; __be32 dccpsl_list[]; }; #define DCCP_SERVICE_INVALID_VALUE htonl((__u32)-1) #define DCCP_SERVICE_CODE_IS_ABSENT 0 static inline bool dccp_list_has_service(const struct dccp_service_list *sl, const __be32 service) { if (likely(sl != NULL)) { u32 i = sl->dccpsl_nr; while (i--) if (sl->dccpsl_list[i] == service) return true; } return false; } struct dccp_ackvec; /** * struct dccp_sock - DCCP socket state * * @dccps_swl - sequence number window low * @dccps_swh - sequence number window high * @dccps_awl - acknowledgement number window low * @dccps_awh - acknowledgement number window high * @dccps_iss - initial sequence number sent * @dccps_isr - initial sequence number received * @dccps_osr - first OPEN sequence number received * @dccps_gss - greatest sequence number sent * @dccps_gsr - greatest valid sequence number received * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss * @dccps_service - first (passive sock) or unique (active sock) service code * @dccps_service_list - second .. last service code on passive socket * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo * @dccps_l_ack_ratio - feature-local Ack Ratio * @dccps_r_ack_ratio - feature-remote Ack Ratio * @dccps_l_seq_win - local Sequence Window (influences ack number validity) * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) * @dccps_options_received - parsed set of retrieved options * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy * @dccps_tx_qlen - maximum length of the TX queue * @dccps_role - role of this sock, one of %dccp_role * @dccps_hc_rx_insert_options - receiver wants to add options when acking * @dccps_hc_tx_insert_options - sender wants to add options when sending * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) */ struct dccp_sock { /* inet_connection_sock has to be the first member of dccp_sock */ struct inet_connection_sock dccps_inet_connection; #define dccps_syn_rtt dccps_inet_connection.icsk_ack.lrcvtime __u64 dccps_swl; __u64 dccps_swh; __u64 dccps_awl; __u64 dccps_awh; __u64 dccps_iss; __u64 dccps_isr; __u64 dccps_osr; __u64 dccps_gss; __u64 dccps_gsr; __u64 dccps_gar; __be32 dccps_service; __u32 dccps_mss_cache; struct dccp_service_list *dccps_service_list; __u32 dccps_timestamp_echo; __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; __u64 dccps_l_seq_win:48; __u64 dccps_r_seq_win:48; __u8 dccps_pcslen:4; __u8 dccps_pcrlen:4; __u8 dccps_send_ndp_count:1; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct list_head dccps_featneg; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; struct ccid *dccps_hc_tx_ccid; struct dccp_options_received dccps_options_received; __u8 dccps_qpolicy; __u32 dccps_tx_qlen; enum dccp_role dccps_role:2; __u8 dccps_hc_rx_insert_options:1; __u8 dccps_hc_tx_insert_options:1; __u8 dccps_server_timewait:1; __u8 dccps_sync_scheduled:1; struct tasklet_struct dccps_xmitlet; struct timer_list dccps_xmit_timer; }; #define dccp_sk(ptr) container_of_const(ptr, struct dccp_sock, \ dccps_inet_connection.icsk_inet.sk) static inline const char *dccp_role(const struct sock *sk) { switch (dccp_sk(sk)->dccps_role) { case DCCP_ROLE_UNDEFINED: return "undefined"; case DCCP_ROLE_LISTEN: return "listen"; case DCCP_ROLE_SERVER: return "server"; case DCCP_ROLE_CLIENT: return "client"; } return NULL; } extern void dccp_syn_ack_timeout(const struct request_sock *req); #endif /* _LINUX_DCCP_H */
66 66 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * This file implements the various access functions for the * PROC file system. It is mainly used for debugging and * statistics. * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de> * Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de> * * Fixes: * Alan Cox : UDP sockets show the rxqueue/txqueue * using hint flag for the netinfo. * Pauline Middelink : identd support * Alan Cox : Make /proc safer. * Erik Schoenfelder : /proc/net/snmp * Alan Cox : Handle dead sockets properly. * Gerhard Koerting : Show both timers * Alan Cox : Allow inode to be NULL (kernel socket) * Andi Kleen : Add support for open_requests and * split functions for more readibility. * Andi Kleen : Add support for /proc/net/netstat * Arnaldo C. Melo : Convert to seq_file */ #include <linux/types.h> #include <net/net_namespace.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/mptcp.h> #include <net/proto_memory.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/bottom_half.h> #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> #include <net/sock.h> #include <net/raw.h> #define TCPUDP_MIB_MAX MAX_T(u32, UDP_MIB_MAX, TCP_MIB_MAX) /* * Report socket allocation statistics [mea@utu.fi] */ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; int orphans, sockets; orphans = tcp_orphan_count_sum(); sockets = proto_sockets_allocated_sum_positive(&tcp_prot); socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, refcount_read(&net->ipv4.tcp_death_row.tw_refcount) - 1, sockets, proto_memory_allocated(&tcp_prot)); seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), proto_memory_allocated(&udp_prot)); seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); seq_printf(seq, "FRAG: inuse %u memory %lu\n", atomic_read(&net->ipv4.fqdir->rhashtable.nelems), frag_mem_limit(net->ipv4.fqdir)); return 0; } /* snmp items */ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS), SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS), SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS), SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS), SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS), SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS), SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS), SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS), SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS), SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS), SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS), SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS), SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES), SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS), SNMP_MIB_SENTINEL }; /* Following items are displayed in /proc/net/netstat */ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), SNMP_MIB_ITEM("InMcastPkts", IPSTATS_MIB_INMCASTPKTS), SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS), SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS), SNMP_MIB_ITEM("InOctets", IPSTATS_MIB_INOCTETS), SNMP_MIB_ITEM("OutOctets", IPSTATS_MIB_OUTOCTETS), SNMP_MIB_ITEM("InMcastOctets", IPSTATS_MIB_INMCASTOCTETS), SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), /* Non RFC4293 fields */ SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS), SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS), SNMP_MIB_SENTINEL }; static const struct { const char *name; int index; } icmpmibmap[] = { { "DestUnreachs", ICMP_DEST_UNREACH }, { "TimeExcds", ICMP_TIME_EXCEEDED }, { "ParmProbs", ICMP_PARAMETERPROB }, { "SrcQuenchs", ICMP_SOURCE_QUENCH }, { "Redirects", ICMP_REDIRECT }, { "Echos", ICMP_ECHO }, { "EchoReps", ICMP_ECHOREPLY }, { "Timestamps", ICMP_TIMESTAMP }, { "TimestampReps", ICMP_TIMESTAMPREPLY }, { "AddrMasks", ICMP_ADDRESS }, { "AddrMaskReps", ICMP_ADDRESSREPLY }, { NULL, 0 } }; static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM), SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN), SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX), SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN), SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS), SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS), SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS), SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS), SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB), SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS), SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS), SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS), SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS), SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS), SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT), SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV), SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED), SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS), SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED), SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED), SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED), SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS), SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS), SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER), SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED), SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED), SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS), SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER), SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO), SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES), SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS), SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY), SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT), SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER), SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED), SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES), SNMP_MIB_ITEM("TCPMemoryPressuresChrono", LINUX_MIB_TCPMEMORYPRESSURESCHRONO), SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD), SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD), SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO), SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE), SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP), SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES), SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE), SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP), SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE), SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE), SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), SNMP_MIB_ITEM("TCPFastOpenBlackhole", LINUX_MIB_TCPFASTOPENBLACKHOLE), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV), SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV), SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT), SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT), SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV), SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS), SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ), SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2), SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT), SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE), SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH), SNMP_MIB_ITEM("TCPAORequired", LINUX_MIB_TCPAOREQUIRED), SNMP_MIB_ITEM("TCPAOBad", LINUX_MIB_TCPAOBAD), SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND), SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD), SNMP_MIB_ITEM("TCPAODroppedIcmps", LINUX_MIB_TCPAODROPPEDICMPS), SNMP_MIB_SENTINEL }; static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals, unsigned short *type, int count) { int j; if (count) { seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %sType%u", type[j] & 0x100 ? "Out" : "In", type[j] & 0xff); seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %lu", vals[j]); } } static void icmpmsg_put(struct seq_file *seq) { #define PERLINE 16 int i, count; unsigned short type[PERLINE]; unsigned long vals[PERLINE], val; struct net *net = seq->private; count = 0; for (i = 0; i < ICMPMSG_MIB_MAX; i++) { val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]); if (val) { type[count] = i; vals[count++] = val; } if (count == PERLINE) { icmpmsg_put_line(seq, vals, type, count); count = 0; } } icmpmsg_put_line(seq, vals, type, count); #undef PERLINE } static void icmp_put(struct seq_file *seq) { int i; struct net *net = seq->private; atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); seq_printf(seq, " %lu %lu %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST)); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); } /* * Called from the PROCfs module. This outputs /proc/net/snmp. */ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) { struct net *net = seq->private; u64 buff64[IPSTATS_MIB_MAX]; int i; memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64)); seq_puts(seq, "Ip: Forwarding DefaultTTL"); for (i = 0; snmp4_ipstats_list[i].name; i++) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", IPV4_DEVCONF_ALL_RO(net, FORWARDING) ? 1 : 2, READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); for (i = 0; snmp4_ipstats_list[i].name; i++) seq_printf(seq, " %llu", buff64[i]); return 0; } static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) { unsigned long buff[TCPUDP_MIB_MAX]; struct net *net = seq->private; int i; memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); seq_puts(seq, "\nTcp:"); for (i = 0; snmp4_tcp_list[i].name; i++) seq_printf(seq, " %s", snmp4_tcp_list[i].name); seq_puts(seq, "\nTcp:"); snmp_get_cpu_field_batch(buff, snmp4_tcp_list, net->mib.tcp_statistics); for (i = 0; snmp4_tcp_list[i].name; i++) { /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", buff[i]); else seq_printf(seq, " %lu", buff[i]); } memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); snmp_get_cpu_field_batch(buff, snmp4_udp_list, net->mib.udp_statistics); seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %lu", buff[i]); memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); /* the UDP and UDP-Lite MIBs are the same */ seq_puts(seq, "\nUdpLite:"); snmp_get_cpu_field_batch(buff, snmp4_udp_list, net->mib.udplite_statistics); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %lu", buff[i]); seq_putc(seq, '\n'); return 0; } static int snmp_seq_show(struct seq_file *seq, void *v) { snmp_seq_show_ipstats(seq, v); icmp_put(seq); /* RFC 2011 compatibility */ icmpmsg_put(seq); snmp_seq_show_tcp_udp(seq, v); return 0; } /* * Output /proc/net/netstat */ static int netstat_seq_show(struct seq_file *seq, void *v) { const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1; const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1; struct net *net = seq->private; unsigned long *buff; int i; seq_puts(seq, "TcpExt:"); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %s", snmp4_net_list[i].name); seq_puts(seq, "\nTcpExt:"); buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)), GFP_KERNEL); if (buff) { snmp_get_cpu_field_batch(buff, snmp4_net_list, net->mib.net_statistics); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %lu", buff[i]); } else { for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %lu", snmp_fold_field(net->mib.net_statistics, snmp4_net_list[i].entry)); } seq_puts(seq, "\nIpExt:"); for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %s", snmp4_ipextstats_list[i].name); seq_puts(seq, "\nIpExt:"); if (buff) { u64 *buff64 = (u64 *)buff; memset(buff64, 0, ip_cnt * sizeof(u64)); snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %llu", buff64[i]); } else { for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %llu", snmp_fold_field64(net->mib.ip_statistics, snmp4_ipextstats_list[i].entry, offsetof(struct ipstats_mib, syncp))); } kfree(buff); seq_putc(seq, '\n'); mptcp_seq_show(seq); return 0; } static __net_init int ip_proc_init_net(struct net *net) { if (!proc_create_net_single("sockstat", 0444, net->proc_net, sockstat_seq_show, NULL)) goto out_sockstat; if (!proc_create_net_single("netstat", 0444, net->proc_net, netstat_seq_show, NULL)) goto out_netstat; if (!proc_create_net_single("snmp", 0444, net->proc_net, snmp_seq_show, NULL)) goto out_snmp; return 0; out_snmp: remove_proc_entry("netstat", net->proc_net); out_netstat: remove_proc_entry("sockstat", net->proc_net); out_sockstat: return -ENOMEM; } static __net_exit void ip_proc_exit_net(struct net *net) { remove_proc_entry("snmp", net->proc_net); remove_proc_entry("netstat", net->proc_net); remove_proc_entry("sockstat", net->proc_net); } static __net_initdata struct pernet_operations ip_proc_ops = { .init = ip_proc_init_net, .exit = ip_proc_exit_net, }; int __init ip_misc_proc_init(void) { return register_pernet_subsys(&ip_proc_ops); }
597 600 423 424 422 426 423 12078 12080 12077 12083 599 598 598 452 215 597 598 600 599 598 292 294 293 2 293 292 293 294 293 293 36 36 36 36 236 235 236 236 235 236 12775 12786 21097 12785 18171 148 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 */ #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/sched.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/traps.h> #include <asm/irq_regs.h> #include <uapi/asm/kvm.h> #include <linux/hardirq.h> #include <linux/pkeys.h> #include <linux/vmalloc.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" #define CREATE_TRACE_POINTS #include <asm/trace/fpu.h> #ifdef CONFIG_X86_64 DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); DEFINE_PER_CPU(u64, xfd_state); #endif /* The FPU state configuration data for kernel and user space */ struct fpu_state_config fpu_kernel_cfg __ro_after_init; struct fpu_state_config fpu_user_cfg __ro_after_init; /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ struct fpstate init_fpstate __ro_after_init; /* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* * Track which context is using the FPU on the CPU: */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? */ bool irq_fpu_usable(void) { if (WARN_ON_ONCE(in_nmi())) return false; /* In kernel FPU usage already active? */ if (this_cpu_read(in_kernel_fpu)) return false; /* * When not in NMI or hard interrupt context, FPU can be used in: * * - Task context except from within fpregs_lock()'ed critical * regions. * * - Soft interrupt processing context which cannot happen * while in a fpregs_lock()'ed critical region. */ if (!in_hardirq()) return true; /* * In hard interrupt context it's safe when soft interrupts * are enabled, which means the interrupt did not hit in * a fpregs_lock()'ed critical region. */ return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); /* * Track AVX512 state use because it is known to slow the max clock * speed of the core. */ static void update_avx_timestamp(struct fpu *fpu) { #define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK) fpu->avx512_timestamp = jiffies; } /* * Save the FPU register state in fpu->fpstate->regs. The register state is * preserved. * * Must be called with fpregs_lock() held. * * The legacy FNSAVE instruction clears all FPU state unconditionally, so * register state has to be reloaded. That might be a pointless exercise * when the FPU is going to be used by another task right after that. But * this only affects 20+ years old 32bit systems and avoids conditionals all * over the place. * * FXSAVE and all XSAVE variants preserve the FPU register state. */ void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(fpu->fpstate); update_avx_timestamp(fpu); return; } if (likely(use_fxsr())) { fxsave(&fpu->fpstate->regs.fxsave); return; } /* * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to reload them from the memory state. */ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); frstor(&fpu->fpstate->regs.fsave); } void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) { /* * AMD K7/K8 and later CPUs up to Zen don't save/restore * FDP/FIP/FOP unless an exception is pending. Clear the x87 state * here by setting it to fixed values. "m" is a random variable * that should be in L1. */ if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" "fildl %[addr]" /* set F?P to defined value */ : : [addr] "m" (*fpstate)); } if (use_xsave()) { /* * Dynamically enabled features are enabled in XCR0, but * usage requires also that the corresponding bits in XFD * are cleared. If the bits are set then using a related * instruction will raise #NM. This allows to do the * allocation of the larger FPU buffer lazy from #NM or if * the task has no permission to kill it which would happen * via #UD if the feature is disabled in XCR0. * * XFD state is following the same life time rules as * XSTATE and to restore state correctly XFD has to be * updated before XRSTORS otherwise the component would * stay in or go into init state even if the bits are set * in fpstate::regs::xsave::xfeatures. */ xfd_update_state(fpstate); /* * Restoring state always needs to modify all features * which are in @mask even if the current task cannot use * extended features. * * So fpstate->xfeatures cannot be used here, because then * a feature for which the task has no permission but was * used by the previous task would not go into init state. */ mask = fpu_kernel_cfg.max_features & mask; os_xrstor(fpstate, mask); } else { if (use_fxsr()) fxrstor(&fpstate->regs.fxsave); else frstor(&fpstate->regs.fsave); } } void fpu_reset_from_exception_fixup(void) { restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); } #if IS_ENABLED(CONFIG_KVM) static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); static void fpu_init_guest_permissions(struct fpu_guest *gfpu) { struct fpu_state_perm *fpuperm; u64 perm; if (!IS_ENABLED(CONFIG_X86_64)) return; spin_lock_irq(&current->sighand->siglock); fpuperm = &current->group_leader->thread.fpu.guest_perm; perm = fpuperm->__state_perm; /* First fpstate allocation locks down permissions. */ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); spin_unlock_irq(&current->sighand->siglock); gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; } bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) { struct fpstate *fpstate; unsigned int size; size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); fpstate = vzalloc(size); if (!fpstate) return false; /* Leave xfd to 0 (the reset value defined by spec) */ __fpstate_reset(fpstate, 0); fpstate_init_user(fpstate); fpstate->is_valloc = true; fpstate->is_guest = true; gfpu->fpstate = fpstate; gfpu->xfeatures = fpu_user_cfg.default_features; gfpu->perm = fpu_user_cfg.default_features; /* * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state * to userspace, even when XSAVE is unsupported, so that restoring FPU * state on a different CPU that does support XSAVE can cleanly load * the incoming state using its natural XSAVE. In other words, KVM's * uABI size may be larger than this host's default size. Conversely, * the default size should never be larger than KVM's base uABI size; * all features that can expand the uABI size must be opt-in. */ gfpu->uabi_size = sizeof(struct kvm_xsave); if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) gfpu->uabi_size = fpu_user_cfg.default_size; fpu_init_guest_permissions(gfpu); return true; } EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); void fpu_free_guest_fpstate(struct fpu_guest *gfpu) { struct fpstate *fps = gfpu->fpstate; if (!fps) return; if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) return; gfpu->fpstate = NULL; vfree(fps); } EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); /* * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable * @guest_fpu: Pointer to the guest FPU container * @xfeatures: Features requested by guest CPUID * * Enable all dynamic xfeatures according to guest perm and requested CPUID. * * Return: 0 on success, error code otherwise */ int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) { lockdep_assert_preemption_enabled(); /* Nothing to do if all requested features are already enabled. */ xfeatures &= ~guest_fpu->xfeatures; if (!xfeatures) return 0; return __xfd_enable_feature(xfeatures, guest_fpu); } EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); #ifdef CONFIG_X86_64 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { fpregs_lock(); guest_fpu->fpstate->xfd = xfd; if (guest_fpu->fpstate->in_use) xfd_update_state(guest_fpu->fpstate); fpregs_unlock(); } EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); /** * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state * * Must be invoked from KVM after a VMEXIT before enabling interrupts when * XFD write emulation is disabled. This is required because the guest can * freely modify XFD and the state at VMEXIT is not guaranteed to be the * same as the state on VMENTER. So software state has to be updated before * any operation which depends on it can take place. * * Note: It can be invoked unconditionally even when write emulation is * enabled for the price of a then pointless MSR read. */ void fpu_sync_guest_vmexit_xfd_state(void) { struct fpstate *fps = current->thread.fpu.fpstate; lockdep_assert_irqs_disabled(); if (fpu_state_size_dynamic()) { rdmsrl(MSR_IA32_XFD, fps->xfd); __this_cpu_write(xfd_state, fps->xfd); } } EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); #endif /* CONFIG_X86_64 */ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) { struct fpstate *guest_fps = guest_fpu->fpstate; struct fpu *fpu = &current->thread.fpu; struct fpstate *cur_fps = fpu->fpstate; fpregs_lock(); if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) save_fpregs_to_fpstate(fpu); /* Swap fpstate */ if (enter_guest) { fpu->__task_fpstate = cur_fps; fpu->fpstate = guest_fps; guest_fps->in_use = true; } else { guest_fps->in_use = false; fpu->fpstate = fpu->__task_fpstate; fpu->__task_fpstate = NULL; } cur_fps = fpu->fpstate; if (!cur_fps->is_confidential) { /* Includes XFD update */ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); } else { /* * XSTATE is restored by firmware from encrypted * memory. Make sure XFD state is correct while * running with guest fpstate */ xfd_update_state(cur_fps); } fpregs_mark_activate(); fpregs_unlock(); return 0; } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u64 xfeatures, u32 pkru) { struct fpstate *kstate = gfpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru, XSTATE_COPY_XSAVE); } else { memcpy(&ustate->fxsave, &kstate->regs.fxsave, sizeof(ustate->fxsave)); /* Make it restorable on a XSAVE enabled host */ ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; } } EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru) { struct fpstate *kstate = gfpu->fpstate; const union fpregs_state *ustate = buf; if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) return -EINVAL; if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) return -EINVAL; memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); return 0; } if (ustate->xsave.header.xfeatures & ~xcr0) return -EINVAL; /* * Nullify @vpkru to preserve its current value if PKRU's bit isn't set * in the header. KVM's odd ABI is to leave PKRU untouched in this * case (all other components are eventually re-initialized). */ if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU)) vpkru = NULL; return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); } EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); #endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); WARN_ON_FPU(!irq_fpu_usable()); WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, true); if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(&current->thread.fpu); } __cpu_invalidate_fpregs_state(); /* Put sane initial values into the control registers. */ if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) ldmxcsr(MXCSR_DEFAULT); if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) asm volatile ("fninit"); } EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, false); preempt_enable(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); /* * Sync the FPU register state to current's memory register state when the * current task owns the FPU. The hardware register state is preserved. */ void fpu_sync_fpstate(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); fpregs_lock(); trace_x86_fpu_before_save(fpu); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) save_fpregs_to_fpstate(fpu); trace_x86_fpu_after_save(fpu); fpregs_unlock(); } static inline unsigned int init_fpstate_copy_size(void) { if (!use_xsave()) return fpu_kernel_cfg.default_size; /* XSAVE(S) just needs the legacy and the xstate header part */ return sizeof(init_fpstate.regs.xsave); } static inline void fpstate_init_fxstate(struct fpstate *fpstate) { fpstate->regs.fxsave.cwd = 0x37f; fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; } /* * Legacy x87 fpstate state init: */ static inline void fpstate_init_fstate(struct fpstate *fpstate) { fpstate->regs.fsave.cwd = 0xffff037fu; fpstate->regs.fsave.swd = 0xffff0000u; fpstate->regs.fsave.twd = 0xffffffffu; fpstate->regs.fsave.fos = 0xffff0000u; } /* * Used in two places: * 1) Early boot to setup init_fpstate for non XSAVE systems * 2) fpu_init_fpstate_user() which is invoked from KVM */ void fpstate_init_user(struct fpstate *fpstate) { if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpstate_init_soft(&fpstate->regs.soft); return; } xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); if (cpu_feature_enabled(X86_FEATURE_FXSR)) fpstate_init_fxstate(fpstate); else fpstate_init_fstate(fpstate); } static void __fpstate_reset(struct fpstate *fpstate, u64 xfd) { /* Initialize sizes and feature masks */ fpstate->size = fpu_kernel_cfg.default_size; fpstate->user_size = fpu_user_cfg.default_size; fpstate->xfeatures = fpu_kernel_cfg.default_features; fpstate->user_xfeatures = fpu_user_cfg.default_features; fpstate->xfd = xfd; } void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; __fpstate_reset(fpu->fpstate, init_fpstate.xfd); /* Initialize the permission related info in fpu */ fpu->perm.__state_perm = fpu_kernel_cfg.default_features; fpu->perm.__state_size = fpu_kernel_cfg.default_size; fpu->perm.__user_state_size = fpu_user_cfg.default_size; /* Same defaults for guests */ fpu->guest_perm = fpu->perm; } static inline void fpu_inherit_perms(struct fpu *dst_fpu) { if (fpu_state_size_dynamic()) { struct fpu *src_fpu = &current->group_leader->thread.fpu; spin_lock_irq(&current->sighand->siglock); /* Fork also inherits the permissions of the parent */ dst_fpu->perm = src_fpu->perm; dst_fpu->guest_perm = src_fpu->guest_perm; spin_unlock_irq(&current->sighand->siglock); } } /* A passed ssp of zero will not cause any update */ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) { #ifdef CONFIG_X86_USER_SHADOW_STACK struct cet_user_state *xstate; /* If ssp update is not needed. */ if (!ssp) return 0; xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave, XFEATURE_CET_USER); /* * If there is a non-zero ssp, then 'dst' must be configured with a shadow * stack and the fpu state should be up to date since it was just copied * from the parent in fpu_clone(). So there must be a valid non-init CET * state location in the buffer. */ if (WARN_ON_ONCE(!xstate)) return 1; xstate->user_ssp = (u64)ssp; #endif return 0; } /* Clone current's FPU state on fork */ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, unsigned long ssp) { struct fpu *src_fpu = &current->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; fpstate_reset(dst_fpu); if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; /* * Enforce reload for user space tasks and prevent kernel threads * from trying to save the FPU registers on context switch. */ set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); /* * No FPU state inheritance for kernel threads and IO * worker threads. */ if (minimal) { /* Clear out the minimal state */ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); return 0; } /* * If a new feature is added, ensure all dynamic features are * caller-saved from here! */ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); /* * Save the default portion of the current FPU state into the * clone. Assume all dynamic features to be defined as caller- * saved, which enables skipping both the expansion of fpstate * and the copying of any dynamic state. * * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because * copying is not valid when current uses non-default states. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); save_fpregs_to_fpstate(dst_fpu); fpregs_unlock(); if (!(clone_flags & CLONE_THREAD)) fpu_inherit_perms(dst_fpu); /* * Children never inherit PASID state. * Force it to have its init value: */ if (use_xsave()) dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; /* * Update shadow stack pointer, in case it changed during clone. */ if (update_fpu_shstk(dst, ssp)) return 1; trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); return 0; } /* * Whitelist the FPU register state embedded into task_struct for hardened * usercopy. */ void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); *size = fpu_kernel_cfg.default_size; } /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents * in the fpregs in the eager-FPU case. * * This function can be used in cases where we know that * a state-restore is coming: either an explicit one, * or a reschedule. */ void fpu__drop(struct fpu *fpu) { preempt_disable(); if (fpu == &current->thread.fpu) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); fpregs_deactivate(fpu); } trace_x86_fpu_dropped(fpu); preempt_enable(); } /* * Clear FPU registers by setting them up from the init fpstate. * Caller must do fpregs_[un]lock() around it. */ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) os_xrstor(&init_fpstate, features_mask); else if (use_fxsr()) fxrstor(&init_fpstate.regs.fxsave); else frstor(&init_fpstate.regs.fsave); pkru_write_default(); } /* * Reset current->fpu memory state to the init values. */ static void fpu_reset_fpregs(void) { struct fpu *fpu = &current->thread.fpu; fpregs_lock(); __fpu_invalidate_fpregs_state(fpu); /* * This does not change the actual hardware registers. It just * resets the memory image and sets TIF_NEED_FPU_LOAD so a * subsequent return to usermode will reload the registers from the * task's memory image. * * Do not use fpstate_init() here. Just copy init_fpstate which has * the correct content already except for PKRU. * * PKRU handling does not rely on the xstate when restoring for * user space as PKRU is eagerly written in switch_to() and * flush_thread(). */ memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } /* * Reset current's user FPU states to the init states. current's * supervisor states, if any, are not modified by this function. The * caller guarantees that the XSTATE header in memory is intact. */ void fpu__clear_user_states(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpu_reset_fpregs(); fpregs_unlock(); return; } /* * Ensure that current's supervisor states are loaded into their * corresponding registers. */ if (xfeatures_mask_supervisor() && !fpregs_state_valid(fpu, smp_processor_id())) os_xrstor_supervisor(fpu->fpstate); /* Reset user states in registers. */ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); /* * Now all FPU registers have their desired values. Inform the FPU * state machine that current's FPU registers are in the hardware * registers. The memory image does not need to be updated because * any operation relying on it has to save the registers first when * current's FPU is marked active. */ fpregs_mark_activate(); fpregs_unlock(); } void fpu_flush_thread(void) { fpstate_reset(&current->thread.fpu); fpu_reset_fpregs(); } /* * Load FPU context before returning to userspace. */ void switch_fpu_return(void) { if (!static_cpu_has(X86_FEATURE_FPU)) return; fpregs_restore_userregs(); } EXPORT_SYMBOL_GPL(switch_fpu_return); void fpregs_lock_and_load(void) { /* * fpregs_lock() only disables preemption (mostly). So modifying state * in an interrupt could screw up some in progress fpregs operation. * Warn about it. */ WARN_ON_ONCE(!irq_fpu_usable()); WARN_ON_ONCE(current->flags & PF_KTHREAD); fpregs_lock(); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); } #ifdef CONFIG_X86_DEBUG_FPU /* * If current FPU state according to its tracking (loaded FPU context on this * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is * loaded on return to userland. */ void fpregs_assert_state_consistent(void) { struct fpu *fpu = &current->thread.fpu; if (test_thread_flag(TIF_NEED_FPU_LOAD)) return; WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); } EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); #endif void fpregs_mark_activate(void) { struct fpu *fpu = &current->thread.fpu; fpregs_activate(fpu); fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } /* * x87 math exception handling: */ int fpu__exception_code(struct fpu *fpu, int trap_nr) { int err; if (trap_nr == X86_TRAP_MF) { unsigned short cwd, swd; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the * C1 reg you need in case of a stack fault, 0x040 is the stack * fault bit. We should only be taking one exception at a time, * so if this combination doesn't produce any single exception, * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to * fully reproduce the context of the exception. */ if (boot_cpu_has(X86_FEATURE_FXSR)) { cwd = fpu->fpstate->regs.fxsave.cwd; swd = fpu->fpstate->regs.fxsave.swd; } else { cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; swd = (unsigned short)fpu->fpstate->regs.fsave.swd; } err = swd & ~cwd; } else { /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which * unmasked exception was caught we must mask the exception mask bits * at 0x1f80, and then use these to mask the exception bits at 0x3f. */ unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) mxcsr = fpu->fpstate->regs.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ return FPE_FLTINV; } else if (err & 0x004) { /* Divide by Zero */ return FPE_FLTDIV; } else if (err & 0x008) { /* Overflow */ return FPE_FLTOVF; } else if (err & 0x012) { /* Denormal, Underflow */ return FPE_FLTUND; } else if (err & 0x020) { /* Precision */ return FPE_FLTRES; } /* * If we're using IRQ 13, or supposedly even some trap * X86_TRAP_MF implementations, it's possible * we get a spurious trap, which is not an error. */ return 0; } /* * Initialize register state that may prevent from entering low-power idle. * This function will be invoked from the cpuidle driver only when needed. */ noinstr void fpu_idle_fpregs(void) { /* Note: AMX_TILE being enabled implies XGETBV1 support */ if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { tile_release(); __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } }
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 // SPDX-License-Identifier: GPL-2.0 #include <linux/utsname.h> #include <net/cfg80211.h> #include "core.h" #include "rdev-ops.h" void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct device *pdev = wiphy_dev(wdev->wiphy); if (pdev->driver) strscpy(info->driver, pdev->driver->name, sizeof(info->driver)); else strscpy(info->driver, "N/A", sizeof(info->driver)); strscpy(info->version, init_utsname()->release, sizeof(info->version)); if (wdev->wiphy->fw_version[0]) strscpy(info->fw_version, wdev->wiphy->fw_version, sizeof(info->fw_version)); else strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); strscpy(info->bus_info, dev_name(wiphy_dev(wdev->wiphy)), sizeof(info->bus_info)); } EXPORT_SYMBOL(cfg80211_get_drvinfo);
12 12 12 18986 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net> * * Based on the original implementation which is: * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright 2003 Andi Kleen, SuSE Labs. * * Parts of the original code have been moved to arch/x86/vdso/vma.c * * This file implements vsyscall emulation. vsyscalls are a legacy ABI: * Userspace can request certain kernel services by calling fixed * addresses. This concept is problematic: * * - It interferes with ASLR. * - It's awkward to write code that lives in kernel addresses but is * callable by userspace at fixed addresses. * - The whole concept is impossible for 32-bit compat userspace. * - UML cannot easily virtualize a vsyscall. * * As of mid-2014, I believe that there is no new userspace code that * will use a vsyscall if the vDSO is present. I hope that there will * soon be no new userspace code that will ever use a vsyscall. * * The code in this file emulates vsyscalls when notified of a page * fault to a vsyscall address. */ #include <linux/kernel.h> #include <linux/timer.h> #include <linux/sched/signal.h> #include <linux/mm_types.h> #include <linux/syscalls.h> #include <linux/ratelimit.h> #include <asm/vsyscall.h> #include <asm/unistd.h> #include <asm/fixmap.h> #include <asm/traps.h> #include <asm/paravirt.h> #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init = #ifdef CONFIG_LEGACY_VSYSCALL_NONE NONE; #elif defined(CONFIG_LEGACY_VSYSCALL_XONLY) XONLY; #else #error VSYSCALL config is broken #endif static int __init vsyscall_setup(char *str) { if (str) { if (!strcmp("emulate", str)) vsyscall_mode = EMULATE; else if (!strcmp("xonly", str)) vsyscall_mode = XONLY; else if (!strcmp("none", str)) vsyscall_mode = NONE; else return -EINVAL; return 0; } return -EINVAL; } early_param("vsyscall", vsyscall_setup); static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { if (!show_unhandled_signals) return; printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n", level, current->comm, task_pid_nr(current), message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); } static int addr_to_vsyscall_nr(unsigned long addr) { int nr; if ((addr & ~0xC00UL) != VSYSCALL_ADDR) return -EINVAL; nr = (addr & 0xC00UL) >> 10; if (nr >= 3) return -EINVAL; return nr; } static bool write_ok_or_segv(unsigned long ptr, size_t size) { if (!access_ok((void __user *)ptr, size)) { struct thread_struct *thread = &current->thread; thread->error_code = X86_PF_USER | X86_PF_WRITE; thread->cr2 = ptr; thread->trap_nr = X86_TRAP_PF; force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr); return false; } else { return true; } } bool emulate_vsyscall(unsigned long error_code, struct pt_regs *regs, unsigned long address) { unsigned long caller; int vsyscall_nr, syscall_nr, tmp; long ret; unsigned long orig_dx; /* Write faults or kernel-privilege faults never get fixed up. */ if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) return false; if (!(error_code & X86_PF_INSTR)) { /* Failed vsyscall read */ if (vsyscall_mode == EMULATE) return false; /* * User code tried and failed to read the vsyscall page. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround"); return false; } /* * No point in checking CS -- the only way to get here is a user mode * trap to a high address, which means that we're in 64-bit user code. */ WARN_ON_ONCE(address != regs->ip); if (vsyscall_mode == NONE) { warn_bad_vsyscall(KERN_INFO, regs, "vsyscall attempted with vsyscall=none"); return false; } vsyscall_nr = addr_to_vsyscall_nr(address); trace_emulate_vsyscall(vsyscall_nr); if (vsyscall_nr < 0) { warn_bad_vsyscall(KERN_WARNING, regs, "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); goto sigsegv; } if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { warn_bad_vsyscall(KERN_WARNING, regs, "vsyscall with bad stack (exploit attempt?)"); goto sigsegv; } /* * Check for access_ok violations and find the syscall nr. * * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. For all the * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ switch (vsyscall_nr) { case 0: if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) || !write_ok_or_segv(regs->si, sizeof(struct timezone))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_gettimeofday; break; case 1: if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_time; break; case 2: if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || !write_ok_or_segv(regs->si, sizeof(unsigned))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_getcpu; break; } /* * Handle seccomp. regs->ip must be the original value. * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst. * * We could optimize the seccomp disabled case, but performance * here doesn't matter. */ regs->orig_ax = syscall_nr; regs->ax = -ENOSYS; tmp = secure_computing(); if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); force_exit_sig(SIGSYS); return true; } regs->orig_ax = -1; if (tmp) goto do_ret; /* skip requested */ /* * With a real vsyscall, page faults cause SIGSEGV. */ ret = -EFAULT; switch (vsyscall_nr) { case 0: /* this decodes regs->di and regs->si on its own */ ret = __x64_sys_gettimeofday(regs); break; case 1: /* this decodes regs->di on its own */ ret = __x64_sys_time(regs); break; case 2: /* while we could clobber regs->dx, we didn't in the past... */ orig_dx = regs->dx; regs->dx = 0; /* this decodes regs->di, regs->si and regs->dx on its own */ ret = __x64_sys_getcpu(regs); regs->dx = orig_dx; break; } check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall fault (exploit attempt?)"); goto sigsegv; } regs->ax = ret; do_ret: /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; return true; sigsegv: force_sig(SIGSEGV); return true; } /* * A pseudo VMA to allow ptrace access for the vsyscall page. This only * covers the 64bit vsyscall page now. 32bit has a real VMA now and does * not need special handling anymore: */ static const char *gate_vma_name(struct vm_area_struct *vma) { return "[vsyscall]"; } static const struct vm_operations_struct gate_vma_ops = { .name = gate_vma_name, }; static struct vm_area_struct gate_vma __ro_after_init = { .vm_start = VSYSCALL_ADDR, .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC, .vm_flags = VM_READ | VM_EXEC, .vm_ops = &gate_vma_ops, }; struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_COMPAT if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags)) return NULL; #endif if (vsyscall_mode == NONE) return NULL; return &gate_vma; } int in_gate_area(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = get_gate_vma(mm); if (!vma) return 0; return (addr >= vma->vm_start) && (addr < vma->vm_end); } /* * Use this when you have no reliable mm, typically from interrupt * context. It is less reliable than using a task's mm and may give * false positives. */ int in_gate_area_no_mm(unsigned long addr) { return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; } /* * The VSYSCALL page is the only user-accessible page in the kernel address * range. Normally, the kernel page tables can have _PAGE_USER clear, but * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls * are enabled. * * Some day we may create a "minimal" vsyscall mode in which we emulate * vsyscalls but leave the page not present. If so, we skip calling * this. */ void __init set_vsyscall_pgtable_user_bits(pgd_t *root) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); p4d = p4d_offset(pgd, VSYSCALL_ADDR); #if CONFIG_PGTABLE_LEVELS >= 5 set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER)); #endif pud = pud_offset(p4d, VSYSCALL_ADDR); set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); pmd = pmd_offset(pud, VSYSCALL_ADDR); set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); } void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); /* * For full emulation, the page needs to exist for real. In * execute-only mode, there is no PTE at all backing the vsyscall * page. */ if (vsyscall_mode == EMULATE) { __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, PAGE_KERNEL_VVAR); set_vsyscall_pgtable_user_bits(swapper_pg_dir); } if (vsyscall_mode == XONLY) vm_flags_init(&gate_vma, VM_EXEC); BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); }
2404 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 /* SPDX-License-Identifier: GPL-2.0 */ /* * x86 KFENCE support. * * Copyright (C) 2020, Google LLC. */ #ifndef _ASM_X86_KFENCE_H #define _ASM_X86_KFENCE_H #ifndef MODULE #include <linux/bug.h> #include <linux/kfence.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/tlbflush.h> /* Force 4K pages for __kfence_pool. */ static inline bool arch_kfence_init_pool(void) { unsigned long addr; for (addr = (unsigned long)__kfence_pool; is_kfence_address((void *)addr); addr += PAGE_SIZE) { unsigned int level; if (!lookup_address(addr, &level)) return false; if (level != PG_LEVEL_4K) set_memory_4k(addr, 1); } return true; } /* Protect the given page and flush TLB. */ static inline bool kfence_protect_page(unsigned long addr, bool protect) { unsigned int level; pte_t *pte = lookup_address(addr, &level); if (WARN_ON(!pte || level != PG_LEVEL_4K)) return false; /* * We need to avoid IPIs, as we may get KFENCE allocations or faults * with interrupts disabled. Therefore, the below is best-effort, and * does not flush TLBs on all CPUs. We can tolerate some inaccuracy; * lazy fault handling takes care of faults after the page is PRESENT. */ if (protect) set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); else set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); /* * Flush this CPU's TLB, assuming whoever did the allocation/free is * likely to continue running on this CPU. */ preempt_disable(); flush_tlb_one_kernel(addr); preempt_enable(); return true; } #endif /* !MODULE */ #endif /* _ASM_X86_KFENCE_H */
278 278 35 15 18 12 5 3 7 6 31 27 20 21 14 9 14 176 176 175 176 14 14 14 14 160 160 160 160 3 3 27 13 14 14 14 14 14 14 14 14 14 14 14 14 27 27 27 27 11 11 27 26 11 27 27 27 27 27 27 27 8 27 8 27 27 27 13 27 27 23 7 12 6 14 1 1 2 1 8 10 4 11 4 2 2 14 6 4 4 4 10 6 4 6 14 14 14 14 2 14 5 14 14 14 16 14 14 16 16 16 16 2 2 2 6 3 3 7 7 6 1 1 10 8 2 3 5 3 3 14 42 42 66 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2020, Red Hat, Inc. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/inet.h> #include <linux/kernel.h> #include <net/inet_common.h> #include <net/netns/generic.h> #include <net/mptcp.h> #include "protocol.h" #include "mib.h" #include "mptcp_pm_gen.h" static int pm_nl_pernet_id; struct mptcp_pm_add_entry { struct list_head list; struct mptcp_addr_info addr; u8 retrans_times; struct timer_list add_timer; struct mptcp_sock *sock; }; struct pm_nl_pernet { /* protects pernet updates */ spinlock_t lock; struct list_head local_addr_list; unsigned int addrs; unsigned int stale_loss_cnt; unsigned int add_addr_signal_max; unsigned int add_addr_accept_max; unsigned int local_addr_max; unsigned int subflows_max; unsigned int next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); }; #define MPTCP_PM_ADDR_MAX 8 #define ADD_ADDR_RETRANS_MAX 3 static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net) { return net_generic(net, pm_nl_pernet_id); } static struct pm_nl_pernet * pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk) { return pm_nl_get_pernet(sock_net((struct sock *)msk)); } bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port) { bool addr_equals = false; if (a->family == b->family) { if (a->family == AF_INET) addr_equals = a->addr.s_addr == b->addr.s_addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6); } else if (a->family == AF_INET) { if (ipv6_addr_v4mapped(&b->addr6)) addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3]; } else if (b->family == AF_INET) { if (ipv6_addr_v4mapped(&a->addr6)) addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr; #endif } if (!addr_equals) return false; if (!use_port) return true; return a->port == b->port; } void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr) { addr->family = skc->skc_family; addr->port = htons(skc->skc_num); if (addr->family == AF_INET) addr->addr.s_addr = skc->skc_rcv_saddr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) addr->addr6 = skc->skc_v6_rcv_saddr; #endif } static void remote_address(const struct sock_common *skc, struct mptcp_addr_info *addr) { addr->family = skc->skc_family; addr->port = skc->skc_dport; if (addr->family == AF_INET) addr->addr.s_addr = skc->skc_daddr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) addr->addr6 = skc->skc_v6_daddr; #endif } bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, const struct mptcp_addr_info *saddr) { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; struct sock_common *skc; list_for_each_entry(subflow, list, node) { skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); mptcp_local_address(skc, &cur); if (mptcp_addresses_equal(&cur, saddr, saddr->port)) return true; } return false; } static bool lookup_subflow_by_daddr(const struct list_head *list, const struct mptcp_addr_info *daddr) { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; list_for_each_entry(subflow, list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV))) continue; remote_address((struct sock_common *)ssk, &cur); if (mptcp_addresses_equal(&cur, daddr, daddr->port)) return true; } return false; } static bool select_local_address(const struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, struct mptcp_pm_local *new_local) { struct mptcp_pm_addr_entry *entry; bool found = false; msk_owned_by_me(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) continue; new_local->addr = entry->addr; new_local->flags = entry->flags; new_local->ifindex = entry->ifindex; found = true; break; } rcu_read_unlock(); return found; } static bool select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, struct mptcp_pm_local *new_local) { struct mptcp_pm_addr_entry *entry; bool found = false; rcu_read_lock(); /* do not keep any additional per socket state, just signal * the address list in order. * Note: removal from the local address list during the msk life-cycle * can lead to additional addresses not being announced. */ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) continue; if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) continue; new_local->addr = entry->addr; new_local->flags = entry->flags; new_local->ifindex = entry->ifindex; found = true; break; } rcu_read_unlock(); return found; } unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) { const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); return READ_ONCE(pernet->add_addr_signal_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); return READ_ONCE(pernet->add_addr_accept_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); return READ_ONCE(pernet->subflows_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); return READ_ONCE(pernet->local_addr_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); return false; } return true; } struct mptcp_pm_add_entry * mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; lockdep_assert_held(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { if (mptcp_addresses_equal(&entry->addr, addr, true)) return entry; } return NULL; } bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) { struct mptcp_pm_add_entry *entry; struct mptcp_addr_info saddr; bool ret = false; mptcp_local_address((struct sock_common *)sk, &saddr); spin_lock_bh(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { if (mptcp_addresses_equal(&entry->addr, &saddr, true)) { ret = true; goto out; } } out: spin_unlock_bh(&msk->pm.lock); return ret; } static void mptcp_pm_add_timer(struct timer_list *timer) { struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; pr_debug("msk=%p\n", msk); if (!msk) return; if (inet_sk_state_load(sk) == TCP_CLOSE) return; if (!entry->addr.id) return; if (mptcp_pm_should_add_signal_addr(msk)) { sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); goto out; } spin_lock_bh(&msk->pm.lock); if (!mptcp_pm_should_add_signal_addr(msk)) { pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id); mptcp_pm_announce_addr(msk, &entry->addr, false); mptcp_pm_add_addr_send_ack(msk); entry->retrans_times++; } if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) sk_reset_timer(sk, timer, jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); spin_unlock_bh(&msk->pm.lock); if (entry->retrans_times == ADD_ADDR_RETRANS_MAX) mptcp_pm_subflow_established(msk); out: __sock_put(sk); } struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool check_id) { struct mptcp_pm_add_entry *entry; struct sock *sk = (struct sock *)msk; struct timer_list *add_timer = NULL; spin_lock_bh(&msk->pm.lock); entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (entry && (!check_id || entry->addr.id == addr->id)) { entry->retrans_times = ADD_ADDR_RETRANS_MAX; add_timer = &entry->add_timer; } if (!check_id && entry) list_del(&entry->list); spin_unlock_bh(&msk->pm.lock); /* no lock, because sk_stop_timer_sync() is calling del_timer_sync() */ if (add_timer) sk_stop_timer_sync(sk, add_timer); return entry; } bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; struct net *net = sock_net(sk); lockdep_assert_held(&msk->pm.lock); add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (add_entry) { if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) return false; sk_reset_timer(sk, &add_entry->add_timer, jiffies + mptcp_get_add_addr_timeout(net)); return true; } add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); if (!add_entry) return false; list_add(&add_entry->list, &msk->pm.anno_list); add_entry->addr = *addr; add_entry->sock = msk; add_entry->retrans_times = 0; timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); sk_reset_timer(sk, &add_entry->add_timer, jiffies + mptcp_get_add_addr_timeout(net)); return true; } void mptcp_pm_free_anno_list(struct mptcp_sock *msk) { struct mptcp_pm_add_entry *entry, *tmp; struct sock *sk = (struct sock *)msk; LIST_HEAD(free_list); pr_debug("msk=%p\n", msk); spin_lock_bh(&msk->pm.lock); list_splice_init(&msk->pm.anno_list, &free_list); spin_unlock_bh(&msk->pm.lock); list_for_each_entry_safe(entry, tmp, &free_list, list) { sk_stop_timer_sync(sk, &entry->add_timer); kfree(entry); } } /* Fill all the remote addresses into the array addrs[], * and return the array size. */ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *local, bool fullmesh, struct mptcp_addr_info *addrs) { bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); struct sock *sk = (struct sock *)msk, *ssk; struct mptcp_subflow_context *subflow; struct mptcp_addr_info remote = { 0 }; unsigned int subflows_max; int i = 0; subflows_max = mptcp_pm_get_subflows_max(msk); remote_address((struct sock_common *)sk, &remote); /* Non-fullmesh endpoint, fill in the single entry * corresponding to the primary MPC subflow remote address */ if (!fullmesh) { if (deny_id0) return 0; if (!mptcp_pm_addr_families_match(sk, local, &remote)) return 0; msk->pm.subflows++; addrs[i++] = remote; } else { DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); /* Forbid creation of new subflows matching existing * ones, possibly already created by incoming ADD_ADDR */ bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); mptcp_for_each_subflow(msk, subflow) if (READ_ONCE(subflow->local_id) == local->id) __set_bit(subflow->remote_id, unavail_id); mptcp_for_each_subflow(msk, subflow) { ssk = mptcp_subflow_tcp_sock(subflow); remote_address((struct sock_common *)ssk, &addrs[i]); addrs[i].id = READ_ONCE(subflow->remote_id); if (deny_id0 && !addrs[i].id) continue; if (test_bit(addrs[i].id, unavail_id)) continue; if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) continue; if (msk->pm.subflows < subflows_max) { /* forbid creating multiple address towards * this id */ __set_bit(addrs[i].id, unavail_id); msk->pm.subflows++; i++; } } } return i; } static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, bool prio, bool backup) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow; pr_debug("send ack for %s\n", prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr")); slow = lock_sock_fast(ssk); if (prio) { subflow->send_mp_prio = 1; subflow->request_bkup = backup; } __mptcp_subflow_send_ack(ssk); unlock_sock_fast(ssk, slow); } static void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, bool prio, bool backup) { spin_unlock_bh(&msk->pm.lock); __mptcp_pm_send_ack(msk, subflow, prio, backup); spin_lock_bh(&msk->pm.lock); } static struct mptcp_pm_addr_entry * __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) { struct mptcp_pm_addr_entry *entry; list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, lockdep_is_held(&pernet->lock)) { if (entry->addr.id == id) return entry; } return NULL; } static struct mptcp_pm_addr_entry * __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) { struct mptcp_pm_addr_entry *entry; list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, lockdep_is_held(&pernet->lock)) { if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) return entry; } return NULL; } static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; unsigned int add_addr_signal_max; bool signal_and_subflow = false; unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; unsigned int subflows_max; pernet = pm_nl_get_pernet(sock_net(sk)); add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk); /* do lazy endpoint usage accounting for the MPC subflows */ if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); struct mptcp_pm_addr_entry *entry; struct mptcp_addr_info mpc_addr; bool backup = false; mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); rcu_read_lock(); entry = __lookup_addr(pernet, &mpc_addr); if (entry) { __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); msk->mpc_endpoint_id = entry->addr.id; backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); } rcu_read_unlock(); if (backup) mptcp_pm_send_ack(msk, subflow, true, backup); msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); } pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, msk->pm.add_addr_signaled, add_addr_signal_max, msk->pm.subflows, subflows_max); /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { /* due to racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the * corresponding id will be marked as used. * Instead let the PM machinery reschedule us when the * current address announce will be completed. */ if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) return; if (!select_signal_address(pernet, msk, &local)) goto subflow; /* If the alloc fails, we are on memory pressure, not worth * continuing, and trying to create subflows. */ if (!mptcp_pm_alloc_anno_list(msk, &local.addr)) return; __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled++; /* Special case for ID0: set the correct ID */ if (local.addr.id == msk->mpc_endpoint_id) local.addr.id = 0; mptcp_pm_announce_addr(msk, &local.addr, false); mptcp_pm_nl_addr_send_ack(msk); if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) signal_and_subflow = true; } subflow: /* check if should create a new subflow */ while (msk->pm.local_addr_used < local_addr_max && msk->pm.subflows < subflows_max) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; int i, nr; if (signal_and_subflow) signal_and_subflow = false; else if (!select_local_address(pernet, msk, &local)) break; fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH); __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); /* Special case for ID0: set the correct ID */ if (local.addr.id == msk->mpc_endpoint_id) local.addr.id = 0; else /* local_addr_used is not decr for ID 0 */ msk->pm.local_addr_used++; nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs); if (nr == 0) continue; spin_unlock_bh(&msk->pm.lock); for (i = 0; i < nr; i++) __mptcp_subflow_connect(sk, &local, &addrs[i]); spin_lock_bh(&msk->pm.lock); } mptcp_pm_nl_check_work_pending(msk); } static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) { mptcp_pm_create_subflow_or_signal_addr(msk); } static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) { mptcp_pm_create_subflow_or_signal_addr(msk); } /* Fill all the local addresses into the array addrs[], * and return the array size. */ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote, struct mptcp_pm_local *locals) { struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; struct mptcp_addr_info mpc_addr; struct pm_nl_pernet *pernet; unsigned int subflows_max; int i = 0; pernet = pm_nl_get_pernet_from_msk(msk); subflows_max = mptcp_pm_get_subflows_max(msk); mptcp_local_address((struct sock_common *)msk, &mpc_addr); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) continue; if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) continue; if (msk->pm.subflows < subflows_max) { locals[i].addr = entry->addr; locals[i].flags = entry->flags; locals[i].ifindex = entry->ifindex; /* Special case for ID0: set the correct ID */ if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port)) locals[i].addr.id = 0; msk->pm.subflows++; i++; } } rcu_read_unlock(); /* If the array is empty, fill in the single * 'IPADDRANY' local address */ if (!i) { memset(&locals[i], 0, sizeof(locals[i])); locals[i].addr.family = #if IS_ENABLED(CONFIG_MPTCP_IPV6) remote->family == AF_INET6 && ipv6_addr_v4mapped(&remote->addr6) ? AF_INET : #endif remote->family; if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote)) return 0; msk->pm.subflows++; i++; } return i; } static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) { struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; unsigned int add_addr_accept_max; struct mptcp_addr_info remote; unsigned int subflows_max; bool sf_created = false; int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk); pr_debug("accepted %d:%d remote family %d\n", msk->pm.add_addr_accepted, add_addr_accept_max, msk->pm.remote.family); remote = msk->pm.remote; mptcp_pm_announce_addr(msk, &remote, true); mptcp_pm_nl_addr_send_ack(msk); if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) return; /* pick id 0 port, if none is provided the remote address */ if (!remote.port) remote.port = sk->sk_dport; /* connect to the specified remote address, using whatever * local address the routing configuration will pick. */ nr = fill_local_addresses_vec(msk, &remote, locals); if (nr == 0) return; spin_unlock_bh(&msk->pm.lock); for (i = 0; i < nr; i++) if (__mptcp_subflow_connect(sk, &locals[i], &remote) == 0) sf_created = true; spin_lock_bh(&msk->pm.lock); if (sf_created) { /* add_addr_accepted is not decr for ID 0 */ if (remote.id) msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || msk->pm.subflows >= subflows_max) WRITE_ONCE(msk->pm.accept_addr, false); } } bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *remote) { struct mptcp_addr_info mpc_remote; remote_address((struct sock_common *)msk, &mpc_remote); return mptcp_addresses_equal(&mpc_remote, remote, remote->port); } void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *alt = NULL; msk_owned_by_me(msk); lockdep_assert_held(&msk->pm.lock); if (!mptcp_pm_should_add_signal(msk) && !mptcp_pm_should_rm_signal(msk)) return; mptcp_for_each_subflow(msk, subflow) { if (__mptcp_subflow_active(subflow)) { if (!subflow->stale) { mptcp_pm_send_ack(msk, subflow, false, false); return; } if (!alt) alt = subflow; } } if (alt) mptcp_pm_send_ack(msk, alt, false, false); } int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info *addr, struct mptcp_addr_info *rem, u8 bkup) { struct mptcp_subflow_context *subflow; pr_debug("bkup=%d\n", bkup); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct mptcp_addr_info local, remote; mptcp_local_address((struct sock_common *)ssk, &local); if (!mptcp_addresses_equal(&local, addr, addr->port)) continue; if (rem && rem->family != AF_UNSPEC) { remote_address((struct sock_common *)ssk, &remote); if (!mptcp_addresses_equal(&remote, rem, rem->port)) continue; } __mptcp_pm_send_ack(msk, subflow, true, bkup); return 0; } return -EINVAL; } static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list, enum linux_mptcp_mib_field rm_type) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; u8 i; pr_debug("%s rm_list_nr %d\n", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr); msk_owned_by_me(msk); if (sk->sk_state == TCP_LISTEN) return; if (!rm_list->nr) return; if (list_empty(&msk->conn_list)) return; for (i = 0; i < rm_list->nr; i++) { u8 rm_id = rm_list->ids[i]; bool removed = false; mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); u8 remote_id = READ_ONCE(subflow->remote_id); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; u8 id = subflow_get_local_id(subflow); if ((1 << inet_sk_state_load(ssk)) & (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE)) continue; if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) continue; if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id) continue; pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", i, rm_id, id, remote_id, msk->mpc_endpoint_id); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); removed |= subflow->request_join; /* the following takes care of updating the subflows counter */ mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); if (rm_type == MPTCP_MIB_RMSUBFLOW) __MPTCP_INC_STATS(sock_net(sk), rm_type); } if (rm_type == MPTCP_MIB_RMADDR) __MPTCP_INC_STATS(sock_net(sk), rm_type); if (!removed) continue; if (!mptcp_pm_is_kernel(msk)) continue; if (rm_type == MPTCP_MIB_RMADDR && rm_id && !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { /* Note: if the subflow has been closed before, this * add_addr_accepted counter will not be decremented. */ if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) WRITE_ONCE(msk->pm.accept_addr, true); } } } static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) { mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); } static void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); } void mptcp_pm_nl_work(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; msk_owned_by_me(msk); if (!(pm->status & MPTCP_PM_WORK_MASK)) return; spin_lock_bh(&msk->pm.lock); pr_debug("msk=%p status=%x\n", msk, pm->status); if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); mptcp_pm_nl_add_addr_received(msk); } if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); mptcp_pm_nl_addr_send_ack(msk); } if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); mptcp_pm_nl_rm_addr_received(msk); } if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); mptcp_pm_nl_fully_established(msk); } if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); mptcp_pm_nl_subflow_established(msk); } spin_unlock_bh(&msk->pm.lock); } static bool address_use_port(struct mptcp_pm_addr_entry *entry) { return (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == MPTCP_PM_ADDR_FLAG_SIGNAL; } /* caller must ensure the RCU grace period is already elapsed */ static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) { if (entry->lsk) sock_release(entry->lsk); kfree(entry); } static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry, bool needs_id) { struct mptcp_pm_addr_entry *cur, *del_entry = NULL; unsigned int addr_max; int ret = -EINVAL; spin_lock_bh(&pernet->lock); /* to keep the code simple, don't do IDR-like allocation for address ID, * just bail when we exceed limits */ if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) pernet->next_id = 1; if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { ret = -ERANGE; goto out; } if (test_bit(entry->addr.id, pernet->id_bitmap)) { ret = -EBUSY; goto out; } /* do not insert duplicate address, differentiate on port only * singled addresses */ if (!address_use_port(entry)) entry->addr.port = 0; list_for_each_entry(cur, &pernet->local_addr_list, list) { if (mptcp_addresses_equal(&cur->addr, &entry->addr, cur->addr.port || entry->addr.port)) { /* allow replacing the exiting endpoint only if such * endpoint is an implicit one and the user-space * did not provide an endpoint id */ if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) { ret = -EEXIST; goto out; } if (entry->addr.id) goto out; pernet->addrs--; entry->addr.id = cur->addr.id; list_del_rcu(&cur->list); del_entry = cur; break; } } if (!entry->addr.id && needs_id) { find_next: entry->addr.id = find_next_zero_bit(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, pernet->next_id); if (!entry->addr.id && pernet->next_id != 1) { pernet->next_id = 1; goto find_next; } } if (!entry->addr.id && needs_id) goto out; __set_bit(entry->addr.id, pernet->id_bitmap); if (entry->addr.id > pernet->next_id) pernet->next_id = entry->addr.id; if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { addr_max = pernet->add_addr_signal_max; WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; WRITE_ONCE(pernet->local_addr_max, addr_max + 1); } pernet->addrs++; if (!entry->addr.port) list_add_tail_rcu(&entry->list, &pernet->local_addr_list); else list_add_rcu(&entry->list, &pernet->local_addr_list); ret = entry->addr.id; out: spin_unlock_bh(&pernet->lock); /* just replaced an existing entry, free it */ if (del_entry) { synchronize_rcu(); __mptcp_pm_release_addr_entry(del_entry); } return ret; } static struct lock_class_key mptcp_slock_keys[2]; static struct lock_class_key mptcp_keys[2]; static int mptcp_pm_nl_create_listen_socket(struct sock *sk, struct mptcp_pm_addr_entry *entry) { bool is_ipv6 = sk->sk_family == AF_INET6; int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; struct sock *newsk, *ssk; int backlog = 1024; int err; err = sock_create_kern(sock_net(sk), entry->addr.family, SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk); if (err) return err; newsk = entry->lsk->sk; if (!newsk) return -EINVAL; /* The subflow socket lock is acquired in a nested to the msk one * in several places, even by the TCP stack, and this msk is a kernel * socket: lockdep complains. Instead of propagating the _nested * modifiers in several places, re-init the lock class for the msk * socket to an mptcp specific one. */ sock_lock_init_class_and_name(newsk, is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET", &mptcp_slock_keys[is_ipv6], is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET", &mptcp_keys[is_ipv6]); lock_sock(newsk); ssk = __mptcp_nmpc_sk(mptcp_sk(newsk)); release_sock(newsk); if (IS_ERR(ssk)) return PTR_ERR(ssk); mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (entry->addr.family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif if (ssk->sk_family == AF_INET) err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ssk->sk_family == AF_INET6) err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); #endif if (err) return err; /* We don't use mptcp_set_state() here because it needs to be called * under the msk socket lock. For the moment, that will not bring * anything more than only calling inet_sk_state_store(), because the * old status is known (TCP_CLOSE). */ inet_sk_state_store(newsk, TCP_LISTEN); lock_sock(ssk); WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true); err = __inet_listen_sk(ssk, backlog); if (!err) mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); release_sock(ssk); return err; } int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; int ret; pernet = pm_nl_get_pernet_from_msk(msk); rcu_read_lock(); entry = __lookup_addr(pernet, skc); ret = entry ? entry->addr.id : -1; rcu_read_unlock(); if (ret >= 0) return ret; /* address not found, add to local list */ entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; entry->addr = *skc; entry->addr.id = 0; entry->addr.port = 0; entry->ifindex = 0; entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; entry->lsk = NULL; ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true); if (ret < 0) kfree(entry); return ret; } bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); struct mptcp_pm_addr_entry *entry; bool backup; rcu_read_lock(); entry = __lookup_addr(pernet, skc); backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); rcu_read_unlock(); return backup; } #define MPTCP_PM_CMD_GRP_OFFSET 0 #define MPTCP_PM_EV_GRP_OFFSET 1 static const struct genl_multicast_group mptcp_pm_mcgrps[] = { [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, }, [MPTCP_PM_EV_GRP_OFFSET] = { .name = MPTCP_PM_EV_GRP_NAME, .flags = GENL_MCAST_CAP_NET_ADMIN, }, }; void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; unsigned int active_max_loss_cnt; struct net *net = sock_net(sk); unsigned int stale_loss_cnt; bool slow; stale_loss_cnt = mptcp_stale_loss_cnt(net); if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt) return; /* look for another available subflow not in loss state */ active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1); mptcp_for_each_subflow(msk, iter) { if (iter != subflow && mptcp_subflow_active(iter) && iter->stale_count < active_max_loss_cnt) { /* we have some alternatives, try to mark this subflow as idle ...*/ slow = lock_sock_fast(ssk); if (!tcp_rtx_and_write_queues_empty(ssk)) { subflow->stale = 1; __mptcp_retransmit_pending_data(sk); MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE); } unlock_sock_fast(ssk, slow); /* always try to push the pending data regardless of re-injections: * we can possibly use backup subflows now, and subflow selection * is cheap under the msk socket lock */ __mptcp_push_pending(sk, 0); return; } } } static int mptcp_pm_family_to_addr(int family) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (family == AF_INET6) return MPTCP_PM_ADDR_ATTR_ADDR6; #endif return MPTCP_PM_ADDR_ATTR_ADDR4; } static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], const struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr, bool require_family) { int err, addr_addr; if (!attr) { GENL_SET_ERR_MSG(info, "missing address info"); return -EINVAL; } /* no validation needed - was already done via nested policy */ err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, mptcp_pm_address_nl_policy, info->extack); if (err) return err; if (tb[MPTCP_PM_ADDR_ATTR_ID]) addr->id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) { if (!require_family) return 0; NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing family"); return -EINVAL; } addr->family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); if (addr->family != AF_INET #if IS_ENABLED(CONFIG_MPTCP_IPV6) && addr->family != AF_INET6 #endif ) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "unknown address family"); return -EINVAL; } addr_addr = mptcp_pm_family_to_addr(addr->family); if (!tb[addr_addr]) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing address data"); return -EINVAL; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr->family == AF_INET6) addr->addr6 = nla_get_in6_addr(tb[addr_addr]); else #endif addr->addr.s_addr = nla_get_in_addr(tb[addr_addr]); if (tb[MPTCP_PM_ADDR_ATTR_PORT]) addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); return 0; } int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr) { struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; memset(addr, 0, sizeof(*addr)); return mptcp_pm_parse_pm_addr_attr(tb, attr, info, addr, true); } int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, bool require_family, struct mptcp_pm_addr_entry *entry) { struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; int err; memset(entry, 0, sizeof(*entry)); err = mptcp_pm_parse_pm_addr_attr(tb, attr, info, &entry->addr, require_family); if (err) return err; if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); entry->ifindex = val; } if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); if (tb[MPTCP_PM_ADDR_ATTR_PORT]) entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); return 0; } static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) { return pm_nl_get_pernet(genl_info_net(info)); } static int mptcp_nl_add_subflow_or_signal_addr(struct net *net, struct mptcp_addr_info *addr) { struct mptcp_sock *msk; long s_slot = 0, s_num = 0; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; struct mptcp_addr_info mpc_addr; if (!READ_ONCE(msk->fully_established) || mptcp_pm_is_userspace(msk)) goto next; /* if the endp linked to the init sf is re-added with a != ID */ mptcp_local_address((struct sock_common *)msk, &mpc_addr); lock_sock(sk); spin_lock_bh(&msk->pm.lock); if (mptcp_addresses_equal(addr, &mpc_addr, addr->port)) msk->mpc_endpoint_id = addr->id; mptcp_pm_create_subflow_or_signal_addr(msk); spin_unlock_bh(&msk->pm.lock); release_sock(sk); next: sock_put(sk); cond_resched(); } return 0; } static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr, struct genl_info *info) { struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, mptcp_pm_address_nl_policy, info->extack) && tb[MPTCP_PM_ADDR_ATTR_ID]) return true; return false; } int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; int ret; ret = mptcp_pm_parse_entry(attr, info, true, &addr); if (ret < 0) return ret; if (addr.addr.port && !address_use_port(&addr)) { GENL_SET_ERR_MSG(info, "flags must have signal and not subflow when using port"); return -EINVAL; } if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL && addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { GENL_SET_ERR_MSG(info, "flags mustn't have both signal and fullmesh"); return -EINVAL; } if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint"); return -EINVAL; } entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT); if (!entry) { GENL_SET_ERR_MSG(info, "can't allocate addr"); return -ENOMEM; } *entry = addr; if (entry->addr.port) { ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry); if (ret) { GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret); goto out_free; } } ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, !mptcp_pm_has_addr_attr_id(attr, info)); if (ret < 0) { GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); goto out_free; } mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr); return 0; out_free: __mptcp_pm_release_addr_entry(entry); return ret; } bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; entry = mptcp_pm_del_add_timer(msk, addr, false); if (entry) { kfree(entry); return true; } return false; } static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { return msk->mpc_endpoint_id == addr->id ? 0 : addr->id; } static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool force) { struct mptcp_rm_list list = { .nr = 0 }; bool ret; list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); ret = mptcp_remove_anno_list_by_saddr(msk, addr); if (ret || force) { spin_lock_bh(&msk->pm.lock); if (ret) { __set_bit(addr->id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled--; } mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); } return ret; } static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id) { /* If it was marked as used, and not ID 0, decrement local_addr_used */ if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) && id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0)) msk->pm.local_addr_used--; } static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, const struct mptcp_pm_addr_entry *entry) { const struct mptcp_addr_info *addr = &entry->addr; struct mptcp_rm_list list = { .nr = 1 }; long s_slot = 0, s_num = 0; struct mptcp_sock *msk; pr_debug("remove_id=%d\n", addr->id); while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; bool remove_subflow; if (mptcp_pm_is_userspace(msk)) goto next; if (list_empty(&msk->conn_list)) { mptcp_pm_remove_anno_addr(msk, addr, false); goto next; } lock_sock(sk); remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr); mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); list.ids[0] = mptcp_endp_get_local_id(msk, addr); if (remove_subflow) { spin_lock_bh(&msk->pm.lock); mptcp_pm_nl_rm_subflow_received(msk, &list); spin_unlock_bh(&msk->pm.lock); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { spin_lock_bh(&msk->pm.lock); __mark_subflow_endp_available(msk, list.ids[0]); spin_unlock_bh(&msk->pm.lock); } if (msk->mpc_endpoint_id == entry->addr.id) msk->mpc_endpoint_id = 0; release_sock(sk); next: sock_put(sk); cond_resched(); } return 0; } static int mptcp_nl_remove_id_zero_address(struct net *net, struct mptcp_addr_info *addr) { struct mptcp_rm_list list = { .nr = 0 }; long s_slot = 0, s_num = 0; struct mptcp_sock *msk; list.ids[list.nr++] = 0; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; struct mptcp_addr_info msk_local; if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) goto next; mptcp_local_address((struct sock_common *)msk, &msk_local); if (!mptcp_addresses_equal(&msk_local, addr, addr->port)) goto next; lock_sock(sk); spin_lock_bh(&msk->pm.lock); mptcp_pm_remove_addr(msk, &list); mptcp_pm_nl_rm_subflow_received(msk, &list); __mark_subflow_endp_available(msk, 0); spin_unlock_bh(&msk->pm.lock); release_sock(sk); next: sock_put(sk); cond_resched(); } return 0; } int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; unsigned int addr_max; int ret; ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; /* the zero id address is special: the first address used by the msk * always gets such an id, so different subflows can have different zero * id addresses. Additionally zero id is not accounted for in id_bitmap. * Let's use an 'mptcp_rm_list' instead of the common remove code. */ if (addr.addr.id == 0) return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr); spin_lock_bh(&pernet->lock); entry = __lookup_addr_by_id(pernet, addr.addr.id); if (!entry) { GENL_SET_ERR_MSG(info, "address not found"); spin_unlock_bh(&pernet->lock); return -EINVAL; } if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { addr_max = pernet->add_addr_signal_max; WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; WRITE_ONCE(pernet->local_addr_max, addr_max - 1); } pernet->addrs--; list_del_rcu(&entry->list); __clear_bit(entry->addr.id, pernet->id_bitmap); spin_unlock_bh(&pernet->lock); mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry); synchronize_rcu(); __mptcp_pm_release_addr_entry(entry); return ret; } static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, rm_list, list) { if (slist.nr < MPTCP_RM_IDS_MAX && mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); if (alist.nr < MPTCP_RM_IDS_MAX && mptcp_remove_anno_list_by_saddr(msk, &entry->addr)) alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); } spin_lock_bh(&msk->pm.lock); if (alist.nr) { msk->pm.add_addr_signaled -= alist.nr; mptcp_pm_remove_addr(msk, &alist); } if (slist.nr) mptcp_pm_nl_rm_subflow_received(msk, &slist); /* Reset counters: maybe some subflows have been removed before */ bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); msk->pm.local_addr_used = 0; spin_unlock_bh(&msk->pm.lock); } static void mptcp_nl_flush_addrs_list(struct net *net, struct list_head *rm_list) { long s_slot = 0, s_num = 0; struct mptcp_sock *msk; if (list_empty(rm_list)) return; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { lock_sock(sk); mptcp_pm_flush_addrs_and_subflows(msk, rm_list); release_sock(sk); } sock_put(sk); cond_resched(); } } /* caller must ensure the RCU grace period is already elapsed */ static void __flush_addrs(struct list_head *list) { while (!list_empty(list)) { struct mptcp_pm_addr_entry *cur; cur = list_entry(list->next, struct mptcp_pm_addr_entry, list); list_del_rcu(&cur->list); __mptcp_pm_release_addr_entry(cur); } } static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->add_addr_signal_max, 0); WRITE_ONCE(pernet->add_addr_accept_max, 0); WRITE_ONCE(pernet->local_addr_max, 0); pernet->addrs = 0; } int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); LIST_HEAD(free_list); spin_lock_bh(&pernet->lock); list_splice_init(&pernet->local_addr_list, &free_list); __reset_counters(pernet); pernet->next_id = 1; bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); spin_unlock_bh(&pernet->lock); mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list); synchronize_rcu(); __flush_addrs(&free_list); return 0; } int mptcp_nl_fill_addr(struct sk_buff *skb, struct mptcp_pm_addr_entry *entry) { struct mptcp_addr_info *addr = &entry->addr; struct nlattr *attr; attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR); if (!attr) return -EMSGSIZE; if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family)) goto nla_put_failure; if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_PORT, ntohs(addr->port))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id)) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags)) goto nla_put_failure; if (entry->ifindex && nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex)) goto nla_put_failure; if (addr->family == AF_INET && nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4, addr->addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6 && nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6)) goto nla_put_failure; #endif nla_nest_end(skb, attr); return 0; nla_put_failure: nla_nest_cancel(skb, attr); return -EMSGSIZE; } int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; struct sk_buff *msg; void *reply; int ret; ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, info->genlhdr->cmd); if (!reply) { GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); ret = -EMSGSIZE; goto fail; } rcu_read_lock(); entry = __lookup_addr_by_id(pernet, addr.addr.id); if (!entry) { GENL_SET_ERR_MSG(info, "address not found"); ret = -EINVAL; goto unlock_fail; } ret = mptcp_nl_fill_addr(msg, entry); if (ret) goto unlock_fail; genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); rcu_read_unlock(); return ret; unlock_fail: rcu_read_unlock(); fail: nlmsg_free(msg); return ret; } int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) { return mptcp_pm_get_addr(skb, info); } int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) { struct net *net = sock_net(msg->sk); struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; int id = cb->args[0]; void *hdr; int i; pernet = pm_nl_get_pernet(net); rcu_read_lock(); for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { if (test_bit(i, pernet->id_bitmap)) { entry = __lookup_addr_by_id(pernet, i); if (!entry) break; if (entry->addr.id <= id) continue; hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &mptcp_genl_family, NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); if (!hdr) break; if (mptcp_nl_fill_addr(msg, entry) < 0) { genlmsg_cancel(msg, hdr); break; } id = entry->addr.id; genlmsg_end(msg, hdr); } } rcu_read_unlock(); cb->args[0] = id; return msg->len; } int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { return mptcp_pm_dump_addr(msg, cb); } static int parse_limit(struct genl_info *info, int id, unsigned int *limit) { struct nlattr *attr = info->attrs[id]; if (!attr) return 0; *limit = nla_get_u32(attr); if (*limit > MPTCP_PM_ADDR_MAX) { GENL_SET_ERR_MSG(info, "limit greater than maximum"); return -EINVAL; } return 0; } int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); unsigned int rcv_addrs, subflows; int ret; spin_lock_bh(&pernet->lock); rcv_addrs = pernet->add_addr_accept_max; ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); if (ret) goto unlock; subflows = pernet->subflows_max; ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); if (ret) goto unlock; WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); WRITE_ONCE(pernet->subflows_max, subflows); unlock: spin_unlock_bh(&pernet->lock); return ret; } int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct sk_buff *msg; void *reply; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, MPTCP_PM_CMD_GET_LIMITS); if (!reply) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, READ_ONCE(pernet->add_addr_accept_max))) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, READ_ONCE(pernet->subflows_max))) goto fail; genlmsg_end(msg, reply); return genlmsg_reply(msg, info); fail: GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); nlmsg_free(msg); return -EMSGSIZE; } static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, struct mptcp_addr_info *addr) { struct mptcp_rm_list list = { .nr = 0 }; list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); spin_lock_bh(&msk->pm.lock); mptcp_pm_nl_rm_subflow_received(msk, &list); __mark_subflow_endp_available(msk, list.ids[0]); mptcp_pm_create_subflow_or_signal_addr(msk); spin_unlock_bh(&msk->pm.lock); } static int mptcp_nl_set_flags(struct net *net, struct mptcp_addr_info *addr, u8 bkup, u8 changed) { long s_slot = 0, s_num = 0; struct mptcp_sock *msk; int ret = -EINVAL; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) goto next; lock_sock(sk); if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup); if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH) mptcp_pm_nl_fullmesh(msk, addr); release_sock(sk); next: sock_put(sk); cond_resched(); } return ret; } int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) { struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }; struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | MPTCP_PM_ADDR_FLAG_FULLMESH; struct net *net = sock_net(skb->sk); struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; u8 lookup_by_id = 0; u8 bkup = 0; int ret; pernet = pm_nl_get_pernet(net); ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; if (addr.addr.family == AF_UNSPEC) { lookup_by_id = 1; if (!addr.addr.id) { GENL_SET_ERR_MSG(info, "missing required inputs"); return -EOPNOTSUPP; } } if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; spin_lock_bh(&pernet->lock); entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) : __lookup_addr(pernet, &addr.addr); if (!entry) { spin_unlock_bh(&pernet->lock); GENL_SET_ERR_MSG(info, "address not found"); return -EINVAL; } if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { spin_unlock_bh(&pernet->lock); GENL_SET_ERR_MSG(info, "invalid addr flags"); return -EINVAL; } changed = (addr.flags ^ entry->flags) & mask; entry->flags = (entry->flags & ~mask) | (addr.flags & mask); addr = *entry; spin_unlock_bh(&pernet->lock); mptcp_nl_set_flags(net, &addr.addr, bkup, changed); return 0; } int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info) { return mptcp_pm_set_flags(skb, info); } static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) { genlmsg_multicast_netns(&mptcp_genl_family, net, nlskb, 0, MPTCP_PM_EV_GRP_OFFSET, gfp); } bool mptcp_userspace_pm_active(const struct mptcp_sock *msk) { return genl_has_listeners(&mptcp_genl_family, sock_net((const struct sock *)msk), MPTCP_PM_EV_GRP_OFFSET); } static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) { const struct inet_sock *issk = inet_sk(ssk); const struct mptcp_subflow_context *sf; if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) return -EMSGSIZE; switch (ssk->sk_family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr)) return -EMSGSIZE; if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, issk->inet_daddr)) return -EMSGSIZE; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { const struct ipv6_pinfo *np = inet6_sk(ssk); if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) return -EMSGSIZE; if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr)) return -EMSGSIZE; break; } #endif default: WARN_ON_ONCE(1); return -EMSGSIZE; } if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) return -EMSGSIZE; if (nla_put_be16(skb, MPTCP_ATTR_DPORT, issk->inet_dport)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (WARN_ON_ONCE(!sf)) return -EINVAL; if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, subflow_get_local_id(sf))) return -EMSGSIZE; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, sf->remote_id)) return -EMSGSIZE; return 0; } static int mptcp_event_put_token_and_ssk(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { const struct sock *sk = (const struct sock *)msk; const struct mptcp_subflow_context *sf; u8 sk_err; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) return -EMSGSIZE; if (mptcp_event_add_subflow(skb, ssk)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (WARN_ON_ONCE(!sf)) return -EINVAL; if (nla_put_u8(skb, MPTCP_ATTR_BACKUP, sf->backup)) return -EMSGSIZE; if (ssk->sk_bound_dev_if && nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if)) return -EMSGSIZE; sk_err = READ_ONCE(ssk->sk_err); if (sk_err && sk->sk_state == TCP_ESTABLISHED && nla_put_u8(skb, MPTCP_ATTR_ERROR, sk_err)) return -EMSGSIZE; return 0; } static int mptcp_event_sub_established(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { return mptcp_event_put_token_and_ssk(skb, msk, ssk); } static int mptcp_event_sub_closed(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { const struct mptcp_subflow_context *sf; if (mptcp_event_put_token_and_ssk(skb, msk, ssk)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (!sf->reset_seen) return 0; if (nla_put_u32(skb, MPTCP_ATTR_RESET_REASON, sf->reset_reason)) return -EMSGSIZE; if (nla_put_u32(skb, MPTCP_ATTR_RESET_FLAGS, sf->reset_transient)) return -EMSGSIZE; return 0; } static int mptcp_event_created(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); if (err) return err; if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) return -EMSGSIZE; return mptcp_event_add_subflow(skb, ssk); } void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id) { struct net *net = sock_net((const struct sock *)msk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_REMOVED); if (!nlh) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id)) goto nla_put_failure; genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct net *net = sock_net(ssk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_ANNOUNCED); if (!nlh) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id)) goto nla_put_failure; if (nla_put_be16(skb, MPTCP_ATTR_DPORT, info->port == 0 ? inet_sk(ssk)->inet_dport : info->port)) goto nla_put_failure; switch (info->family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, info->addr.s_addr)) goto nla_put_failure; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &info->addr6)) goto nla_put_failure; break; #endif default: WARN_ON_ONCE(1); goto nla_put_failure; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event_pm_listener(const struct sock *ssk, enum mptcp_event_type event) { const struct inet_sock *issk = inet_sk(ssk); struct net *net = sock_net(ssk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, event); if (!nlh) goto nla_put_failure; if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) goto nla_put_failure; if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) goto nla_put_failure; switch (ssk->sk_family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr)) goto nla_put_failure; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { const struct ipv6_pinfo *np = inet6_sk(ssk); if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) goto nla_put_failure; break; } #endif default: WARN_ON_ONCE(1); goto nla_put_failure; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_KERNEL); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp) { struct net *net = sock_net((const struct sock *)msk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, type); if (!nlh) goto nla_put_failure; switch (type) { case MPTCP_EVENT_UNSPEC: WARN_ON_ONCE(1); break; case MPTCP_EVENT_CREATED: case MPTCP_EVENT_ESTABLISHED: if (mptcp_event_created(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_CLOSED: if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)) < 0) goto nla_put_failure; break; case MPTCP_EVENT_ANNOUNCED: case MPTCP_EVENT_REMOVED: /* call mptcp_event_addr_announced()/removed instead */ WARN_ON_ONCE(1); break; case MPTCP_EVENT_SUB_ESTABLISHED: case MPTCP_EVENT_SUB_PRIORITY: if (mptcp_event_sub_established(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_SUB_CLOSED: if (mptcp_event_sub_closed(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_LISTENER_CREATED: case MPTCP_EVENT_LISTENER_CLOSED: break; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, gfp); return; nla_put_failure: nlmsg_free(skb); } struct genl_family mptcp_genl_family __ro_after_init = { .name = MPTCP_PM_NAME, .version = MPTCP_PM_VER, .netnsok = true, .module = THIS_MODULE, .ops = mptcp_pm_nl_ops, .n_ops = ARRAY_SIZE(mptcp_pm_nl_ops), .resv_start_op = MPTCP_PM_CMD_SUBFLOW_DESTROY + 1, .mcgrps = mptcp_pm_mcgrps, .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps), }; static int __net_init pm_nl_init_net(struct net *net) { struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); INIT_LIST_HEAD_RCU(&pernet->local_addr_list); /* Cit. 2 subflows ought to be enough for anybody. */ pernet->subflows_max = 2; pernet->next_id = 1; pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); /* No need to initialize other pernet fields, the struct is zeroed at * allocation time. */ return 0; } static void __net_exit pm_nl_exit_net(struct list_head *net_list) { struct net *net; list_for_each_entry(net, net_list, exit_list) { struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); /* net is removed from namespace list, can't race with * other modifiers, also netns core already waited for a * RCU grace period. */ __flush_addrs(&pernet->local_addr_list); } } static struct pernet_operations mptcp_pm_pernet_ops = { .init = pm_nl_init_net, .exit_batch = pm_nl_exit_net, .id = &pm_nl_pernet_id, .size = sizeof(struct pm_nl_pernet), }; void __init mptcp_pm_nl_init(void) { if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) panic("Failed to register MPTCP PM pernet subsystem.\n"); if (genl_register_family(&mptcp_genl_family)) panic("Failed to register MPTCP PM netlink family\n"); }
311 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tlb #if !defined(_TRACE_TLB_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TLB_H #include <linux/mm_types.h> #include <linux/tracepoint.h> #define TLB_FLUSH_REASON \ EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \ EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \ EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \ EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \ EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" ) /* * First define the enums in TLB_FLUSH_REASON to be exported to userspace * via TRACE_DEFINE_ENUM(). */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); TLB_FLUSH_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } TRACE_EVENT(tlb_flush, TP_PROTO(int reason, unsigned long pages), TP_ARGS(reason, pages), TP_STRUCT__entry( __field( int, reason) __field(unsigned long, pages) ), TP_fast_assign( __entry->reason = reason; __entry->pages = pages; ), TP_printk("pages:%ld reason:%s (%d)", __entry->pages, __print_symbolic(__entry->reason, TLB_FLUSH_REASON), __entry->reason) ); #endif /* _TRACE_TLB_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 /* SPDX-License-Identifier: GPL-2.0 */ /* * Runtime locking correctness validator * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * see Documentation/locking/lockdep-design.rst for more details. */ #ifndef __LINUX_LOCKDEP_H #define __LINUX_LOCKDEP_H #include <linux/lockdep_types.h> #include <linux/smp.h> #include <asm/percpu.h> struct task_struct; #ifdef CONFIG_LOCKDEP #include <linux/linkage.h> #include <linux/list.h> #include <linux/debug_locks.h> #include <linux/stacktrace.h> static inline void lockdep_copy_map(struct lockdep_map *to, struct lockdep_map *from) { int i; *to = *from; /* * Since the class cache can be modified concurrently we could observe * half pointers (64bit arch using 32bit copy insns). Therefore clear * the caches and take the performance hit. * * XXX it doesn't work well with lockdep_set_class_and_subclass(), since * that relies on cache abuse. */ for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) to->class_cache[i] = NULL; } /* * Every lock has a list of other locks that were taken after it. * We only grow the list, never remove from it: */ struct lock_list { struct list_head entry; struct lock_class *class; struct lock_class *links_to; const struct lock_trace *trace; u16 distance; /* bitmap of different dependencies from head to this */ u8 dep; /* used by BFS to record whether "prev -> this" only has -(*R)-> */ u8 only_xr; /* * The parent field is used to implement breadth-first search, and the * bit 0 is reused to indicate if the lock has been accessed in BFS. */ struct lock_list *parent; }; /** * struct lock_chain - lock dependency chain record * * @irq_context: the same as irq_context in held_lock below * @depth: the number of held locks in this chain * @base: the index in chain_hlocks for this chain * @entry: the collided lock chains in lock_chain hash list * @chain_key: the hash key of this lock_chain */ struct lock_chain { /* see BUILD_BUG_ON()s in add_chain_cache() */ unsigned int irq_context : 2, depth : 6, base : 24; /* 4 byte hole */ struct hlist_node entry; u64 chain_key; }; /* * Initialization, self-test and debugging-output methods: */ extern void lockdep_init(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); extern void lockdep_free_key_range(void *start, unsigned long size); extern asmlinkage void lockdep_sys_exit(void); extern void lockdep_set_selftest_task(struct task_struct *task); extern void lockdep_init_task(struct task_struct *task); /* * Split the recursion counter in two to readily detect 'off' vs recursion. */ #define LOCKDEP_RECURSION_BITS 16 #define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) #define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) /* * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due * to header dependencies. */ #define lockdep_off() \ do { \ current->lockdep_recursion += LOCKDEP_OFF; \ } while (0) #define lockdep_on() \ do { \ current->lockdep_recursion -= LOCKDEP_OFF; \ } while (0) extern void lockdep_register_key(struct lock_class_key *key); extern void lockdep_unregister_key(struct lock_class_key *key); /* * These methods are used by specific locking variants (spinlocks, * rwlocks, mutexes and rwsems) to pass init/acquire/release events * to lockdep: */ extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type); static inline void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner, u8 outer) { lockdep_init_map_type(lock, name, key, subclass, inner, outer, LD_LOCK_NORMAL); } static inline void lockdep_init_map_wait(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner) { lockdep_init_map_waits(lock, name, key, subclass, inner, LD_WAIT_INV); } static inline void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { lockdep_init_map_wait(lock, name, key, subclass, LD_WAIT_INV); } /* * Reinitialize a lock key - for cases where there is special locking or * special initialization of locks so that the validator gets the scope * of dependencies wrong: they are either too broad (they need a class-split) * or they are too narrow (they suffer from a false class-split): */ #define lockdep_set_class(lock, key) \ lockdep_init_map_type(&(lock)->dep_map, #key, key, 0, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) #define lockdep_set_class_and_name(lock, key, name) \ lockdep_init_map_type(&(lock)->dep_map, name, key, 0, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) #define lockdep_set_class_and_subclass(lock, key, sub) \ lockdep_init_map_type(&(lock)->dep_map, #key, key, sub, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) #define lockdep_set_subclass(lock, sub) \ lockdep_init_map_type(&(lock)->dep_map, (lock)->dep_map.name, (lock)->dep_map.key, sub,\ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) /** * lockdep_set_novalidate_class: disable checking of lock ordering on a given * lock * @lock: Lock to mark * * Lockdep will still record that this lock has been taken, and print held * instances when dumping locks */ #define lockdep_set_novalidate_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock) /** * lockdep_set_notrack_class: disable lockdep tracking of a given lock entirely * @lock: Lock to mark * * Bigger hammer than lockdep_set_novalidate_class: so far just for bcachefs, * which takes more locks than lockdep is able to track (48). */ #define lockdep_set_notrack_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_track__, #lock) /* * Compare locking classes */ #define lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key) static inline int lockdep_match_key(struct lockdep_map *lock, struct lock_class_key *key) { return lock->key == key; } /* * Acquire a lock. * * Values for "read": * * 0: exclusive (write) acquire * 1: read-acquire (no recursion allowed) * 2: read-acquire with same-instance recursion allowed * * Values for check: * * 0: simple checks (freeing, held-at-exit-time, etc.) * 1: full validation */ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *nest_lock, unsigned long ip); extern void lock_release(struct lockdep_map *lock, unsigned long ip); extern void lock_sync(struct lockdep_map *lock, unsigned int subclass, int read, int check, struct lockdep_map *nest_lock, unsigned long ip); /* lock_is_held_type() returns */ #define LOCK_STATE_UNKNOWN -1 #define LOCK_STATE_NOT_HELD 0 #define LOCK_STATE_HELD 1 /* * Same "read" as for lock_acquire(), except -1 means any. */ extern int lock_is_held_type(const struct lockdep_map *lock, int read); static inline int lock_is_held(const struct lockdep_map *lock) { return lock_is_held_type(lock, -1); } #define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) #define lockdep_is_held_type(lock, r) lock_is_held_type(&(lock)->dep_map, (r)) extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip); #define lock_set_novalidate_class(l, n, i) \ lock_set_class(l, n, &__lockdep_no_validate__, 0, i) static inline void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, unsigned long ip) { lock_set_class(lock, lock->name, lock->key, subclass, ip); } extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); #define NIL_COOKIE (struct pin_cookie){ .val = 0U, } extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) #define lockdep_assert(cond) \ do { WARN_ON(debug_locks && !(cond)); } while (0) #define lockdep_assert_once(cond) \ do { WARN_ON_ONCE(debug_locks && !(cond)); } while (0) #define lockdep_assert_held(l) \ lockdep_assert(lockdep_is_held(l) != LOCK_STATE_NOT_HELD) #define lockdep_assert_not_held(l) \ lockdep_assert(lockdep_is_held(l) != LOCK_STATE_HELD) #define lockdep_assert_held_write(l) \ lockdep_assert(lockdep_is_held_type(l, 0)) #define lockdep_assert_held_read(l) \ lockdep_assert(lockdep_is_held_type(l, 1)) #define lockdep_assert_held_once(l) \ lockdep_assert_once(lockdep_is_held(l) != LOCK_STATE_NOT_HELD) #define lockdep_assert_none_held_once() \ lockdep_assert_once(!current->lockdep_depth) #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) #define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) /* * Must use lock_map_aquire_try() with override maps to avoid * lockdep thinking they participate in the block chain. */ #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map _name = { \ .name = #_name "-wait-type-override", \ .wait_type_inner = _wait_type, \ .lock_type = LD_LOCK_WAIT_OVERRIDE, } #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) { } static inline void lockdep_off(void) { } static inline void lockdep_on(void) { } static inline void lockdep_set_selftest_task(struct task_struct *task) { } # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, i) do { } while (0) # define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, key, s, i) do { (void)(key); } while (0) # define lock_set_novalidate_class(l, n, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_wait(lock, name, key, sub, inner) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) # define lockdep_set_class(lock, key) do { (void)(key); } while (0) # define lockdep_set_class_and_name(lock, key, name) \ do { (void)(key); (void)(name); } while (0) #define lockdep_set_class_and_subclass(lock, key, sub) \ do { (void)(key); } while (0) #define lockdep_set_subclass(lock, sub) do { } while (0) #define lockdep_set_novalidate_class(lock) do { } while (0) #define lockdep_set_notrack_class(lock) do { } while (0) /* * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP * case since the result is not well defined and the caller should rather * #ifdef the call himself. */ # define lockdep_reset() do { debug_locks = 1; } while (0) # define lockdep_free_key_range(start, size) do { } while (0) # define lockdep_sys_exit() do { } while (0) static inline void lockdep_register_key(struct lock_class_key *key) { } static inline void lockdep_unregister_key(struct lock_class_key *key) { } #define lockdep_depth(tsk) (0) /* * Dummy forward declarations, allow users to write less ifdef-y code * and depend on dead code elimination. */ extern int lock_is_held(const void *); extern int lockdep_is_held(const void *); #define lockdep_is_held_type(l, r) (1) #define lockdep_assert(c) do { } while (0) #define lockdep_assert_once(c) do { } while (0) #define lockdep_assert_held(l) do { (void)(l); } while (0) #define lockdep_assert_not_held(l) do { (void)(l); } while (0) #define lockdep_assert_held_write(l) do { (void)(l); } while (0) #define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_assert_none_held_once() do { } while (0) #define lockdep_recursing(tsk) (0) #define NIL_COOKIE (struct pin_cookie){ } #define lockdep_pin_lock(l) ({ struct pin_cookie cookie = { }; cookie; }) #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map __maybe_unused _name = {} #endif /* !LOCKDEP */ #ifdef CONFIG_PROVE_LOCKING void lockdep_set_lock_cmp_fn(struct lockdep_map *, lock_cmp_fn, lock_print_fn); #define lock_set_cmp_fn(lock, ...) lockdep_set_lock_cmp_fn(&(lock)->dep_map, __VA_ARGS__) #else #define lock_set_cmp_fn(lock, ...) do { } while (0) #endif enum xhlock_context_t { XHLOCK_HARD, XHLOCK_SOFT, XHLOCK_CTX_NR, }; /* * To initialize a lockdep_map statically use this macro. * Note that _name must not be NULL. */ #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_free_task(struct task_struct *task) {} #ifdef CONFIG_LOCK_STAT extern void lock_contended(struct lockdep_map *lock, unsigned long ip); extern void lock_acquired(struct lockdep_map *lock, unsigned long ip); #define LOCK_CONTENDED(_lock, try, lock) \ do { \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ lock(_lock); \ } \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while (0) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ ({ \ int ____err = 0; \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ ____err = lock(_lock); \ } \ if (!____err) \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ ____err; \ }) #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) #define lock_acquired(lockdep_map, ip) do {} while (0) #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ lock(_lock) #endif /* CONFIG_LOCK_STAT */ #ifdef CONFIG_PROVE_LOCKING extern void print_irqtrace_events(struct task_struct *curr); #else static inline void print_irqtrace_events(struct task_struct *curr) { } #endif /* Variable used to make lockdep treat read_lock() as recursive in selftests */ #ifdef CONFIG_DEBUG_LOCKING_API_SELFTESTS extern unsigned int force_read_lock_recursive; #else /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ #define force_read_lock_recursive 0 #endif /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ #ifdef CONFIG_LOCKDEP extern bool read_lock_is_recursive(void); #else /* CONFIG_LOCKDEP */ /* If !LOCKDEP, the value is meaningless */ #define read_lock_is_recursive() 0 #endif /* * For trivial one-depth nesting of a lock-class, the following * global define can be used. (Subsystems with multiple levels * of nesting should define their own lock-nesting subclasses.) */ #define SINGLE_DEPTH_NESTING 1 /* * Map the dependency ops to NOP or to real lockdep ops, depending * on the per lock-class debug mode: */ #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define spin_release(l, i) lock_release(l, i) #define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwlock_acquire_read(l, s, t, i) \ do { \ if (read_lock_is_recursive()) \ lock_acquire_shared_recursive(l, s, t, NULL, i); \ else \ lock_acquire_shared(l, s, t, NULL, i); \ } while (0) #define rwlock_release(l, i) lock_release(l, i) #define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define seqcount_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) #define seqcount_release(l, i) lock_release(l, i) #define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define mutex_release(l, i) lock_release(l, i) #define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i) #define rwsem_release(l, i) lock_release(l, i) #define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_try(l) lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_release(l) lock_release(l, _THIS_IP_) #define lock_map_sync(l) lock_sync(l, 0, 0, 1, NULL, _THIS_IP_) #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) # define might_lock_read(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) # define might_lock_nested(lock, subclass) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, subclass, 0, 1, 1, NULL, \ _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); DECLARE_PER_CPU(unsigned int, lockdep_recursion); #define __lockdep_enabled (debug_locks && !this_cpu_read(lockdep_recursion)) #define lockdep_assert_irqs_enabled() \ do { \ WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_irqs_disabled() \ do { \ WARN_ON_ONCE(__lockdep_enabled && this_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_in_irq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirq_context)); \ } while (0) #define lockdep_assert_no_hardirq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && (this_cpu_read(hardirq_context) || \ !this_cpu_read(hardirqs_enabled))); \ } while (0) #define lockdep_assert_preemption_enabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() != 0 || \ !this_cpu_read(hardirqs_enabled))); \ } while (0) #define lockdep_assert_preemption_disabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() == 0 && \ this_cpu_read(hardirqs_enabled))); \ } while (0) /* * Acceptable for protecting per-CPU resources accessed from BH. * Much like in_softirq() - semantics are ambiguous, use carefully. */ #define lockdep_assert_in_softirq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && \ (!in_softirq() || in_irq() || in_nmi())); \ } while (0) extern void lockdep_assert_in_softirq_func(void); #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define might_lock_nested(lock, subclass) do { } while (0) # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) # define lockdep_assert_in_irq() do { } while (0) # define lockdep_assert_no_hardirq() do { } while (0) # define lockdep_assert_preemption_enabled() do { } while (0) # define lockdep_assert_preemption_disabled() do { } while (0) # define lockdep_assert_in_softirq() do { } while (0) # define lockdep_assert_in_softirq_func() do { } while (0) #endif #ifdef CONFIG_PROVE_RAW_LOCK_NESTING # define lockdep_assert_RT_in_threaded_ctx() do { \ WARN_ONCE(debug_locks && !current->lockdep_recursion && \ lockdep_hardirq_context() && \ !(current->hardirq_threaded || current->irq_config), \ "Not in threaded context on PREEMPT_RT as expected\n"); \ } while (0) #else # define lockdep_assert_RT_in_threaded_ctx() do { } while (0) #endif #ifdef CONFIG_LOCKDEP void lockdep_rcu_suspicious(const char *file, const int line, const char *s); #else static inline void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { } #endif #endif /* __LINUX_LOCKDEP_H */
65 65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 // SPDX-License-Identifier: GPL-2.0-only /* * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/slab.h> #include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_FILTER, }; static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ static bool forward __read_mostly = true; module_param(forward, bool, 0000); static int iptable_filter_table_init(struct net *net) { struct ipt_replace *repl; int err; repl = ipt_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ipt_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : NF_DROP - 1; err = ipt_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); return err; } static int __net_init iptable_filter_net_init(struct net *net) { if (!forward) return iptable_filter_table_init(net); return 0; } static void __net_exit iptable_filter_net_pre_exit(struct net *net) { ipt_unregister_table_pre_exit(net, "filter"); } static void __net_exit iptable_filter_net_exit(struct net *net) { ipt_unregister_table_exit(net, "filter"); } static struct pernet_operations iptable_filter_net_ops = { .init = iptable_filter_net_init, .pre_exit = iptable_filter_net_pre_exit, .exit = iptable_filter_net_exit, }; static int __init iptable_filter_init(void) { int ret = xt_register_template(&packet_filter, iptable_filter_table_init); if (ret < 0) return ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table); if (IS_ERR(filter_ops)) { xt_unregister_template(&packet_filter); return PTR_ERR(filter_ops); } ret = register_pernet_subsys(&iptable_filter_net_ops); if (ret < 0) { xt_unregister_template(&packet_filter); kfree(filter_ops); return ret; } return 0; } static void __exit iptable_filter_fini(void) { unregister_pernet_subsys(&iptable_filter_net_ops); xt_unregister_template(&packet_filter); kfree(filter_ops); } module_init(iptable_filter_init); module_exit(iptable_filter_fini);
34 159 784 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* fs/ internal definitions * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ struct super_block; struct file_system_type; struct iomap; struct iomap_ops; struct linux_binprm; struct path; struct mount; struct shrink_control; struct fs_context; struct pipe_inode_info; struct iov_iter; struct mnt_idmap; struct ns_common; /* * block/bdev.c */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); #else static inline void bdev_cache_init(void) { } #endif /* CONFIG_BLOCK */ /* * buffer.c */ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap); /* * char_dev.c */ extern void __init chrdev_init(void); /* * fs_context.c */ extern const struct fs_context_operations legacy_fs_context_ops; extern int parse_monolithic_mount_data(struct fs_context *, void *); extern void vfs_clean_context(struct fs_context *fc); extern int finish_clean_context(struct fs_context *fc); /* * namei.c */ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); int may_linkat(struct mnt_idmap *idmap, const struct path *link); int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); int do_mkdirat(int dfd, struct filename *name, umode_t mode); int do_symlinkat(struct filename *from, int newdfd, struct filename *to); int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags); int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode); /* * namespace.c */ extern struct vfsmount *lookup_mnt(const struct path *); extern int finish_automount(struct vfsmount *, const struct path *); extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); int mnt_get_write_access_file(struct file *file); void mnt_put_write_access_file(struct file *file); extern void dissolve_on_fput(struct vfsmount *); extern bool may_mount(void); int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page); int path_umount(struct path *path, int flags); int show_path(struct seq_file *m, struct dentry *root); /* * fs_struct.c */ extern void chroot_fs_refs(const struct path *, const struct path *); /* * file_table.c */ struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred); static inline void file_put_write_access(struct file *file) { put_write_access(file->f_inode); mnt_put_write_access(file->f_path.mnt); if (unlikely(file->f_mode & FMODE_BACKING)) mnt_put_write_access(backing_file_user_path(file)->mnt); } static inline void put_file_access(struct file *file) { if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_dec(file->f_inode); } else if (file->f_mode & FMODE_WRITER) { file_put_write_access(file); } } /* * super.c */ extern int reconfigure_super(struct fs_context *); extern bool super_trylock_shared(struct super_block *sb); struct super_block *user_get_super(dev_t, bool excl); void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); int sb_init_dio_done_wq(struct super_block *sb); /* * Prepare superblock for changing its read-only state (i.e., either remount * read-write superblock read-only or vice versa). After this function returns * mnt_is_readonly() will return true for any mount of the superblock if its * caller is able to observe any changes done by the remount. This holds until * sb_end_ro_state_change() is called. */ static inline void sb_start_ro_state_change(struct super_block *sb) { WRITE_ONCE(sb->s_readonly_remount, 1); /* * For RO->RW transition, the barrier pairs with the barrier in * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY * cleared, it will see s_readonly_remount set. * For RW->RO transition, the barrier pairs with the barrier in * mnt_get_write_access() before the mnt_is_readonly() check. * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD * already cleared, it will see s_readonly_remount set. */ smp_wmb(); } /* * Ends section changing read-only state of the superblock. After this function * returns if mnt_is_readonly() returns false, the caller will be able to * observe all the changes remount did to the superblock. */ static inline void sb_end_ro_state_change(struct super_block *sb) { /* * This barrier provides release semantics that pairs with * the smp_rmb() acquire semantics in mnt_is_readonly(). * This barrier pair ensure that when mnt_is_readonly() sees * 0 for sb->s_readonly_remount, it will also see all the * preceding flag changes that were made during the RO state * change. */ smp_wmb(); WRITE_ONCE(sb->s_readonly_remount, 0); } /* * open.c */ struct open_flags { int open_flag; umode_t mode; int acc_mode; int intent; int lookup_flags; }; extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); long do_ftruncate(struct file *file, loff_t length, int small); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); int chown_common(const struct path *path, uid_t user, gid_t group); extern int vfs_open(const struct path *, struct file *); /* * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); /* * fs-writeback.c */ extern long get_nr_dirty_inodes(void); void invalidate_inodes(struct super_block *sb); /* * dcache.c */ extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); extern struct dentry *d_alloc_cursor(struct dentry *); extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); extern char *simple_dname(struct dentry *, char *, int); extern void dput_to_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); extern void shrink_dcache_for_umount(struct super_block *); extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seq); extern void d_genocide(struct dentry *); /* * pipe.c */ extern const struct file_operations pipefifo_fops; /* * fs_pin.c */ extern void group_pin_kill(struct hlist_head *p); extern void mnt_pin_kill(struct mount *m); /* * fs/nsfs.c */ extern const struct dentry_operations ns_dentry_operations; int open_namespace(struct ns_common *ns); /* * fs/stat.c: */ int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer); int do_statx_fd(int fd, unsigned int flags, unsigned int mask, struct statx __user *buffer); /* * fs/splice.c: */ ssize_t splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags); /* * fs/xattr.c: */ struct xattr_name { char name[XATTR_NAME_MAX + 1]; }; struct kernel_xattr_ctx { /* Value of attribute */ union { const void __user *cvalue; void __user *value; }; void *kvalue; size_t size; /* Attribute name */ struct xattr_name *kname; unsigned int flags; }; ssize_t file_getxattr(struct file *file, struct kernel_xattr_ctx *ctx); ssize_t filename_getxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx); int file_setxattr(struct file *file, struct kernel_xattr_ctx *ctx); int filename_setxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx); int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx); int import_xattr_name(struct xattr_name *kname, const char __user *name); int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size); ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size); #else static inline int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size) { return -EOPNOTSUPP; } static inline ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size) { return -EOPNOTSUPP; } #endif ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); /* * fs/attr.c */ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); struct stashed_operations { void (*put_data)(void *data); int (*init_inode)(struct inode *inode, void *data); }; int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); /** * path_mounted - check whether path is mounted * @path: path to check * * Determine whether @path refers to the root of a mount. * * Return: true if @path is the root of a mount, false if not. */ static inline bool path_mounted(const struct path *path) { return path->mnt->mnt_root == path->dentry; } void file_f_owner_release(struct file *file);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2020 ARM Ltd. */ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H #ifndef __ASSEMBLY__ /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ static __always_inline void rep_nop(void) { asm volatile("rep; nop" ::: "memory"); } static __always_inline void cpu_relax(void) { rep_nop(); } struct getcpu_cache; notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused); #endif /* __ASSEMBLY__ */ #endif /* __ASM_VDSO_PROCESSOR_H */
29 29 29 29 29 29 29 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/plist.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/freezer.h> #include "futex.h" /* * READ this before attempting to hack on futexes! * * Basic futex operation and ordering guarantees * ============================================= * * The waiter reads the futex value in user space and calls * futex_wait(). This function computes the hash bucket and acquires * the hash bucket lock. After that it reads the futex user space value * again and verifies that the data has not changed. If it has not changed * it enqueues itself into the hash bucket, releases the hash bucket lock * and schedules. * * The waker side modifies the user space value of the futex and calls * futex_wake(). This function computes the hash bucket and acquires the * hash bucket lock. Then it looks for waiters on that futex in the hash * bucket and wakes them. * * In futex wake up scenarios where no tasks are blocked on a futex, taking * the hb spinlock can be avoided and simply return. In order for this * optimization to work, ordering guarantees must exist so that the waiter * being added to the list is acknowledged when the list is concurrently being * checked by the waker, avoiding scenarios like the following: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * uval = *futex; * *futex = newval; * sys_futex(WAKE, futex); * futex_wake(futex); * if (queue_empty()) * return; * if (uval == val) * lock(hash_bucket(futex)); * queue(); * unlock(hash_bucket(futex)); * schedule(); * * This would cause the waiter on CPU 0 to wait forever because it * missed the transition of the user space value from val to newval * and the waker did not find the waiter in the hash bucket queue. * * The correct serialization ensures that a waiter either observes * the changed user space value before blocking or is woken by a * concurrent waker: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * * waiters++; (a) * smp_mb(); (A) <-- paired with -. * | * lock(hash_bucket(futex)); | * | * uval = *futex; | * | *futex = newval; * | sys_futex(WAKE, futex); * | futex_wake(futex); * | * `--------> smp_mb(); (B) * if (uval == val) * queue(); * unlock(hash_bucket(futex)); * schedule(); if (waiters) * lock(hash_bucket(futex)); * else wake_waiters(futex); * waiters--; (b) unlock(hash_bucket(futex)); * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write * to futex and the waiters read (see futex_hb_waiters_pending()). * * This yields the following case (where X:=waiters, Y:=futex): * * X = Y = 0 * * w[X]=1 w[Y]=1 * MB MB * r[Y]=y r[X]=x * * Which guarantees that x==0 && y==0 is impossible; which translates back into * the guarantee that we cannot both miss the futex variable change and the * enqueue. * * Note that a new waiter is accounted for in (a) even when it is possible that * the wait call can return error, in which case we backtrack from it in (b). * Refer to the comment in futex_q_lock(). * * Similarly, in order to account for waiters being requeued on another * address we always increment the waiters for the destination bucket before * acquiring the lock. It then decrements them again after releasing it - * the code that actually moves the futex(es) between hash buckets (requeue_futex) * will do the additional required waiter count housekeeping. This is done for * double_lock_hb() and double_unlock_hb(), respectively. */ bool __futex_wake_mark(struct futex_q *q) { if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) return false; __futex_unqueue(q); /* * The waiting task can free the futex_q as soon as q->lock_ptr = NULL * is written, without taking any locks. This is possible in the event * of a spurious wakeup, for example. A memory barrier is required here * to prevent the following store to lock_ptr from getting ahead of the * plist_del in __futex_unqueue(). */ smp_store_release(&q->lock_ptr, NULL); return true; } /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. Callers * must ensure to later call wake_up_q() for the actual * wakeups to occur. */ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) { struct task_struct *p = q->task; get_task_struct(p); if (!__futex_wake_mark(q)) { put_task_struct(p); return; } /* * Queue the task for later wakeup for after we've released * the hb->lock. */ wake_q_add_safe(wake_q, p); } /* * Wake up waiters matching bitset queued on this futex (uaddr). */ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; DEFINE_WAKE_Q(wake_q); int ret; if (!bitset) return -EINVAL; ret = get_futex_key(uaddr, flags, &key, FUTEX_READ); if (unlikely(ret != 0)) return ret; if ((flags & FLAGS_STRICT) && !nr_wake) return 0; hb = futex_hash(&key); /* Make sure we really have tasks to wakeup */ if (!futex_hb_waiters_pending(hb)) return ret; spin_lock(&hb->lock); plist_for_each_entry_safe(this, next, &hb->chain, list) { if (futex_match (&this->key, &key)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } /* Check if one of the bits is set in both bitsets */ if (!(this->bitset & bitset)) continue; this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } spin_unlock(&hb->lock); wake_up_q(&wake_q); return ret; } static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) { unsigned int op = (encoded_op & 0x70000000) >> 28; unsigned int cmp = (encoded_op & 0x0f000000) >> 24; int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { if (oparg < 0 || oparg > 31) { /* * kill this print and return -EINVAL when userspace * is sane again */ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", current->comm, oparg); oparg &= 31; } oparg = 1 << oparg; } pagefault_disable(); ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); pagefault_enable(); if (ret) return ret; switch (cmp) { case FUTEX_OP_CMP_EQ: return oldval == cmparg; case FUTEX_OP_CMP_NE: return oldval != cmparg; case FUTEX_OP_CMP_LT: return oldval < cmparg; case FUTEX_OP_CMP_GE: return oldval >= cmparg; case FUTEX_OP_CMP_LE: return oldval <= cmparg; case FUTEX_OP_CMP_GT: return oldval > cmparg; default: return -ENOSYS; } } /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; int ret, op_ret; DEFINE_WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) return ret; hb1 = futex_hash(&key1); hb2 = futex_hash(&key2); retry_private: double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { double_unlock_hb(hb1, hb2); if (!IS_ENABLED(CONFIG_MMU) || unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { /* * we don't get EFAULT from MMU faults if we don't have * an MMU, but we might get them from range checking */ ret = op_ret; return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) return ret; } cond_resched(); if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (futex_match (&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } if (op_ret > 0) { op_ret = 0; plist_for_each_entry_safe(this, next, &hb2->chain, list) { if (futex_match (&this->key, &key2)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++op_ret >= nr_wake2) break; } } ret += op_ret; } out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); return ret; } static long futex_wait_restart(struct restart_block *restart); /** * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal * @hb: the futex hash bucket, must be locked by the caller * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout */ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout) { /* * The task state is guaranteed to be set before another task can * wake it. set_current_state() is implemented using smp_store_mb() and * futex_queue() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. */ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); futex_queue(q, hb); /* Arm the timer */ if (timeout) hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); /* * If we have been removed from the hash list, then another task * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { /* * If the timer has already expired, current will already be * flagged for rescheduling. Only call schedule if there * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) schedule(); } __set_current_state(TASK_RUNNING); } /** * futex_unqueue_multiple - Remove various futexes from their hash bucket * @v: The list of futexes to unqueue * @count: Number of futexes in the list * * Helper to unqueue a list of futexes. This can't fail. * * Return: * - >=0 - Index of the last futex that was awoken; * - -1 - No futex was awoken */ int futex_unqueue_multiple(struct futex_vector *v, int count) { int ret = -1, i; for (i = 0; i < count; i++) { if (!futex_unqueue(&v[i].q)) ret = i; } return ret; } /** * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes * @vs: The futex list to wait on * @count: The size of the list * @woken: Index of the last woken futex, if any. Used to notify the * caller that it can return this index to userspace (return parameter) * * Prepare multiple futexes in a single step and enqueue them. This may fail if * the futex list is invalid or if any futex was already awoken. On success the * task is ready to interruptible sleep. * * Return: * - 1 - One of the futexes was woken by another thread * - 0 - Success * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL */ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) { struct futex_hash_bucket *hb; bool retry = false; int ret, i; u32 uval; /* * Enqueuing multiple futexes is tricky, because we need to enqueue * each futex on the list before dealing with the next one to avoid * deadlocking on the hash bucket. But, before enqueuing, we need to * make sure that current->state is TASK_INTERRUPTIBLE, so we don't * lose any wake events, which cannot be done before the get_futex_key * of the next key, because it calls get_user_pages, which can sleep. * Thus, we fetch the list of futexes keys in two steps, by first * pinning all the memory keys in the futex key, and only then we read * each key and queue the corresponding futex. * * Private futexes doesn't need to recalculate hash in retry, so skip * get_futex_key() when retrying. */ retry: for (i = 0; i < count; i++) { if (!(vs[i].w.flags & FLAGS_SHARED) && retry) continue; ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), vs[i].w.flags, &vs[i].q.key, FUTEX_READ); if (unlikely(ret)) return ret; } set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); for (i = 0; i < count; i++) { u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; struct futex_q *q = &vs[i].q; u32 val = vs[i].w.val; hb = futex_q_lock(q); ret = futex_get_value_locked(&uval, uaddr); if (!ret && uval == val) { /* * The bucket lock can't be held while dealing with the * next futex. Queue each futex at this moment so hb can * be unlocked. */ futex_queue(q, hb); continue; } futex_q_unlock(hb); __set_current_state(TASK_RUNNING); /* * Even if something went wrong, if we find out that a futex * was woken, we don't return error and return this index to * userspace */ *woken = futex_unqueue_multiple(vs, i); if (*woken >= 0) return 1; if (ret) { /* * If we need to handle a page fault, we need to do so * without any lock and any enqueued futex (otherwise * we could lose some wakeup). So we do it here, after * undoing all the work done so far. In success, we * retry all the work. */ if (get_user(uval, uaddr)) return -EFAULT; retry = true; goto retry; } if (uval != val) return -EWOULDBLOCK; } return 0; } /** * futex_sleep_multiple - Check sleeping conditions and sleep * @vs: List of futexes to wait for * @count: Length of vs * @to: Timeout * * Sleep if and only if the timeout hasn't expired and no futex on the list has * been woken up. */ static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to) { if (to && !to->task) return; for (; count; count--, vs++) { if (!READ_ONCE(vs->q.lock_ptr)) return; } schedule(); } /** * futex_wait_multiple - Prepare to wait on and enqueue several futexes * @vs: The list of futexes to wait on * @count: The number of objects * @to: Timeout before giving up and returning to userspace * * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function * sleeps on a group of futexes and returns on the first futex that is * wake, or after the timeout has elapsed. * * Return: * - >=0 - Hint to the futex that was awoken * - <0 - On error */ int futex_wait_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to) { int ret, hint = 0; if (to) hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); while (1) { ret = futex_wait_multiple_setup(vs, count, &hint); if (ret) { if (ret > 0) { /* A futex was woken during setup */ ret = hint; } return ret; } futex_sleep_multiple(vs, count, to); __set_current_state(TASK_RUNNING); ret = futex_unqueue_multiple(vs, count); if (ret >= 0) return ret; if (to && !to->task) return -ETIMEDOUT; else if (signal_pending(current)) return -ERESTARTSYS; /* * The final case is a spurious wakeup, for * which just retry. */ } } /** * futex_wait_setup() - Prepare to wait on a futex * @uaddr: the futex userspace address * @val: the expected value * @flags: futex flags (FLAGS_SHARED, etc.) * @q: the associated futex_q * @hb: storage for hash_bucket pointer to be returned to caller * * Setup the futex_q and locate the hash_bucket. Get the futex value and * compare it with the expected value. Handle atomic faults internally. * Return with the hb lock held on success, and unlocked on failure. * * Return: * - 0 - uaddr contains val and hb has been locked; * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) { u32 uval; int ret; /* * Access the page AFTER the hash-bucket is locked. * Order is important: * * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for * any cond. If we locked the hash-bucket after testing *uaddr, that * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * * On the other hand, we insert q and release the hash-bucket only * after testing *uaddr. This guarantees that futex_wait() will NOT * absorb a wakeup if *uaddr does not match the desired values * while the syscall executes. */ retry: ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ); if (unlikely(ret != 0)) return ret; retry_private: *hb = futex_q_lock(q); ret = futex_get_value_locked(&uval, uaddr); if (ret) { futex_q_unlock(*hb); ret = get_user(uval, uaddr); if (ret) return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } if (uval != val) { futex_q_unlock(*hb); ret = -EWOULDBLOCK; } return ret; } int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset) { struct futex_q q = futex_q_init; struct futex_hash_bucket *hb; int ret; if (!bitset) return -EINVAL; q.bitset = bitset; retry: /* * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) return ret; /* futex_queue and wait for wakeup, timeout, or a signal. */ futex_wait_queue(hb, &q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ if (!futex_unqueue(&q)) return 0; if (to && !to->task) return -ETIMEDOUT; /* * We expect signal_pending(current), but we might be the * victim of a spurious wakeup as well. */ if (!signal_pending(current)) goto retry; return -ERESTARTSYS; } int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) { struct hrtimer_sleeper timeout, *to; struct restart_block *restart; int ret; to = futex_setup_timer(abs_time, &timeout, flags, current->timer_slack_ns); ret = __futex_wait(uaddr, flags, val, to, bitset); /* No timeout, nothing to clean up. */ if (!to) return ret; hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); if (ret == -ERESTARTSYS) { restart = &current->restart_block; restart->futex.uaddr = uaddr; restart->futex.val = val; restart->futex.time = *abs_time; restart->futex.bitset = bitset; restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; return set_restart_fn(restart, futex_wait_restart); } return ret; } static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; ktime_t t, *tp = NULL; if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { t = restart->futex.time; tp = &t; } restart->fn = do_no_restart_syscall; return (long)futex_wait(uaddr, restart->futex.flags, restart->futex.val, tp, restart->futex.bitset); }
14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 /* SPDX-License-Identifier: GPL-2.0-or-later */ #include <net/inet_common.h> enum linux_mptcp_mib_field { MPTCP_MIB_NUM = 0, MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */ MPTCP_MIB_MPCAPABLEACTIVE, /* Sent SYN with MP_CAPABLE */ MPTCP_MIB_MPCAPABLEACTIVEACK, /* Received SYN/ACK with MP_CAPABLE */ MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */ MPTCP_MIB_MPCAPABLEACTIVEDROP, /* Client-side fallback due to a MPC drop */ MPTCP_MIB_MPCAPABLEACTIVEDISABLED, /* Client-side disabled due to past issues */ MPTCP_MIB_MPCAPABLEENDPATTEMPT, /* Prohibited MPC to port-based endp */ MPTCP_MIB_TOKENFALLBACKINIT, /* Could not init/allocate token */ MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */ MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */ MPTCP_MIB_JOINSYNBACKUPRX, /* Received a SYN + MP_JOIN + backup flag */ MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */ MPTCP_MIB_JOINSYNACKBACKUPRX, /* Received a SYN/ACK + MP_JOIN + backup flag */ MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ MPTCP_MIB_JOINSYNTX, /* Sending a SYN + MP_JOIN */ MPTCP_MIB_JOINSYNTXCREATSKERR, /* Not able to create a socket when sending a SYN + MP_JOIN */ MPTCP_MIB_JOINSYNTXBINDERR, /* Not able to bind() the address when sending a SYN + MP_JOIN */ MPTCP_MIB_JOINSYNTXCONNECTERR, /* Not able to connect() when sending a SYN + MP_JOIN */ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ MPTCP_MIB_DSSCORRUPTIONFALLBACK,/* DSS corruption detected, fallback */ MPTCP_MIB_DSSCORRUPTIONRESET, /* DSS corruption detected, MPJ subflow reset */ MPTCP_MIB_INFINITEMAPTX, /* Sent an infinite mapping */ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */ MPTCP_MIB_DATACSUMERR, /* The data checksum fail */ MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */ MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */ MPTCP_MIB_OFOMERGE, /* Segments merged in OoO queue */ MPTCP_MIB_NODSSWINDOW, /* Segments not in MPTCP windows */ MPTCP_MIB_DUPDATA, /* Segments discarded due to duplicate DSS */ MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */ MPTCP_MIB_ADDADDRTX, /* Sent ADD_ADDR with echo-flag=0 */ MPTCP_MIB_ADDADDRTXDROP, /* ADD_ADDR with echo-flag=0 not send due to * resource exhaustion */ MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */ MPTCP_MIB_ECHOADDTX, /* Send ADD_ADDR with echo-flag=1 */ MPTCP_MIB_ECHOADDTXDROP, /* ADD_ADDR with echo-flag=1 not send due * to resource exhaustion */ MPTCP_MIB_PORTADD, /* Received ADD_ADDR with a port-number */ MPTCP_MIB_ADDADDRDROP, /* Dropped incoming ADD_ADDR */ MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTSYNACKRX, /* Received a SYNACK MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTACKRX, /* Received an ACK MP_JOIN with a different port-number */ MPTCP_MIB_MISMATCHPORTSYNRX, /* Received a SYN MP_JOIN with a mismatched port-number */ MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */ MPTCP_MIB_RMADDR, /* Received RM_ADDR */ MPTCP_MIB_RMADDRDROP, /* Dropped incoming RM_ADDR */ MPTCP_MIB_RMADDRTX, /* Sent RM_ADDR */ MPTCP_MIB_RMADDRTXDROP, /* RM_ADDR not sent due to resource exhaustion */ MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */ MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */ MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */ MPTCP_MIB_MPFAILTX, /* Transmit a MP_FAIL */ MPTCP_MIB_MPFAILRX, /* Received a MP_FAIL */ MPTCP_MIB_MPFASTCLOSETX, /* Transmit a MP_FASTCLOSE */ MPTCP_MIB_MPFASTCLOSERX, /* Received a MP_FASTCLOSE */ MPTCP_MIB_MPRSTTX, /* Transmit a MP_RST */ MPTCP_MIB_MPRSTRX, /* Received a MP_RST */ MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */ MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */ MPTCP_MIB_RCVWNDSHARED, /* Subflow rcv wnd is overridden by msk's one */ MPTCP_MIB_RCVWNDCONFLICTUPDATE, /* subflow rcv wnd is overridden by msk's one due to * conflict with another subflow while updating msk rcv wnd */ MPTCP_MIB_RCVWNDCONFLICT, /* Conflict with while updating msk rcv wnd */ MPTCP_MIB_CURRESTAB, /* Current established MPTCP connections */ MPTCP_MIB_BLACKHOLE, /* A blackhole has been detected */ __MPTCP_MIB_MAX }; #define LINUX_MIB_MPTCP_MAX __MPTCP_MIB_MAX struct mptcp_mib { unsigned long mibs[LINUX_MIB_MPTCP_MAX]; }; static inline void MPTCP_ADD_STATS(struct net *net, enum linux_mptcp_mib_field field, int val) { if (likely(net->mib.mptcp_statistics)) SNMP_ADD_STATS(net->mib.mptcp_statistics, field, val); } static inline void MPTCP_INC_STATS(struct net *net, enum linux_mptcp_mib_field field) { if (likely(net->mib.mptcp_statistics)) SNMP_INC_STATS(net->mib.mptcp_statistics, field); } static inline void __MPTCP_INC_STATS(struct net *net, enum linux_mptcp_mib_field field) { if (likely(net->mib.mptcp_statistics)) __SNMP_INC_STATS(net->mib.mptcp_statistics, field); } static inline void MPTCP_DEC_STATS(struct net *net, enum linux_mptcp_mib_field field) { if (likely(net->mib.mptcp_statistics)) SNMP_DEC_STATS(net->mib.mptcp_statistics, field); } bool mptcp_mib_alloc(struct net *net);
5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_PKT_SCHED_H #define __NET_PKT_SCHED_H #include <linux/jiffies.h> #include <linux/ktime.h> #include <linux/if_vlan.h> #include <linux/netdevice.h> #include <net/sch_generic.h> #include <net/net_namespace.h> #include <uapi/linux/pkt_sched.h> #define DEFAULT_TX_QUEUE_LEN 1000 #define STAB_SIZE_LOG_MAX 30 struct qdisc_walker { int stop; int skip; int count; int (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *); }; #define qdisc_priv(q) \ _Generic(q, \ const struct Qdisc * : (const void *)&q->privdata, \ struct Qdisc * : (void *)&q->privdata) static inline struct Qdisc *qdisc_from_priv(void *priv) { return container_of(priv, struct Qdisc, privdata); } /* Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth Normal IP packet size ~ 512byte, hence: 0.5Kbyte/1Mbyte/sec = 0.5msec, so that we need 50usec timer for 10Mbit ethernet. 10msec resolution -> <50Kbit/sec. The result: [34]86 is not good choice for QoS router :-( The things are not so bad, because we may use artificial clock evaluated by integration of network data flow in the most critical places. */ typedef u64 psched_time_t; typedef long psched_tdiff_t; /* Avoid doing 64 bit divide */ #define PSCHED_SHIFT 6 #define PSCHED_TICKS2NS(x) ((s64)(x) << PSCHED_SHIFT) #define PSCHED_NS2TICKS(x) ((x) >> PSCHED_SHIFT) #define PSCHED_TICKS_PER_SEC PSCHED_NS2TICKS(NSEC_PER_SEC) #define PSCHED_PASTPERFECT 0 static inline psched_time_t psched_get_time(void) { return PSCHED_NS2TICKS(ktime_get_ns()); } struct qdisc_watchdog { struct hrtimer timer; struct Qdisc *qdisc; }; void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, u64 delta_ns); static inline void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) { return qdisc_watchdog_schedule_range_ns(wd, expires, 0ULL); } static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) { qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires)); } void qdisc_watchdog_cancel(struct qdisc_watchdog *wd); extern struct Qdisc_ops pfifo_qdisc_ops; extern struct Qdisc_ops bfifo_qdisc_ops; extern struct Qdisc_ops pfifo_head_drop_qdisc_ops; int fifo_set_limit(struct Qdisc *q, unsigned int limit); struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, unsigned int limit, struct netlink_ext_ack *extack); int register_qdisc(struct Qdisc_ops *qops); void unregister_qdisc(struct Qdisc_ops *qops); #define NET_SCH_ALIAS_PREFIX "net-sch-" #define MODULE_ALIAS_NET_SCH(id) MODULE_ALIAS(NET_SCH_ALIAS_PREFIX id) void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); void qdisc_hash_add(struct Qdisc *q, bool invisible); void qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle); struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab, struct netlink_ext_ack *extack); void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { __qdisc_run(q); qdisc_run_end(q); } } extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; /* Calculate maximal size of packet seen by hard_start_xmit routine of this device. */ static inline unsigned int psched_mtu(const struct net_device *dev) { return READ_ONCE(dev->mtu) + dev->hard_header_len; } static inline struct net *qdisc_net(struct Qdisc *q) { return dev_net(q->dev_queue->dev); } struct tc_query_caps_base { enum tc_setup_type type; void *caps; }; struct tc_cbs_qopt_offload { u8 enable; s32 queue; s32 hicredit; s32 locredit; s32 idleslope; s32 sendslope; }; struct tc_etf_qopt_offload { u8 enable; s32 queue; }; struct tc_mqprio_caps { bool validate_queue_counts:1; }; struct tc_mqprio_qopt_offload { /* struct tc_mqprio_qopt must always be the first element */ struct tc_mqprio_qopt qopt; struct netlink_ext_ack *extack; u16 mode; u16 shaper; u32 flags; u64 min_rate[TC_QOPT_MAX_QUEUE]; u64 max_rate[TC_QOPT_MAX_QUEUE]; unsigned long preemptible_tcs; }; struct tc_taprio_caps { bool supports_queue_max_sdu:1; bool gate_mask_per_txq:1; /* Device expects lower TXQ numbers to have higher priority over higher * TXQs, regardless of their TC mapping. DO NOT USE FOR NEW DRIVERS, * INSTEAD ENFORCE A PROPER TC:TXQ MAPPING COMING FROM USER SPACE. */ bool broken_mqprio:1; }; enum tc_taprio_qopt_cmd { TAPRIO_CMD_REPLACE, TAPRIO_CMD_DESTROY, TAPRIO_CMD_STATS, TAPRIO_CMD_QUEUE_STATS, }; /** * struct tc_taprio_qopt_stats - IEEE 802.1Qbv statistics * @window_drops: Frames that were dropped because they were too large to be * transmitted in any of the allotted time windows (open gates) for their * traffic class. * @tx_overruns: Frames still being transmitted by the MAC after the * transmission gate associated with their traffic class has closed. * Equivalent to `12.29.1.1.2 TransmissionOverrun` from 802.1Q-2018. */ struct tc_taprio_qopt_stats { u64 window_drops; u64 tx_overruns; }; struct tc_taprio_qopt_queue_stats { int queue; struct tc_taprio_qopt_stats stats; }; struct tc_taprio_sched_entry { u8 command; /* TC_TAPRIO_CMD_* */ /* The gate_mask in the offloading side refers to traffic classes */ u32 gate_mask; u32 interval; }; struct tc_taprio_qopt_offload { enum tc_taprio_qopt_cmd cmd; union { /* TAPRIO_CMD_STATS */ struct tc_taprio_qopt_stats stats; /* TAPRIO_CMD_QUEUE_STATS */ struct tc_taprio_qopt_queue_stats queue_stats; /* TAPRIO_CMD_REPLACE */ struct { struct tc_mqprio_qopt_offload mqprio; struct netlink_ext_ack *extack; ktime_t base_time; u64 cycle_time; u64 cycle_time_extension; u32 max_sdu[TC_MAX_QUEUE]; size_t num_entries; struct tc_taprio_sched_entry entries[]; }; }; }; #if IS_ENABLED(CONFIG_NET_SCH_TAPRIO) /* Reference counting */ struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload); void taprio_offload_free(struct tc_taprio_qopt_offload *offload); #else /* Reference counting */ static inline struct tc_taprio_qopt_offload * taprio_offload_get(struct tc_taprio_qopt_offload *offload) { return NULL; } static inline void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { } #endif /* Ensure skb_mstamp_ns, which might have been populated with the txtime, is * not mistaken for a software timestamp, because this will otherwise prevent * the dispatch of hardware timestamps to the socket. */ static inline void skb_txtime_consumed(struct sk_buff *skb) { skb->tstamp = ktime_set(0, 0); } static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, unsigned long cl, struct qdisc_walker *arg) { if (arg->count >= arg->skip && arg->fn(sch, cl, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } #endif
832 303 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FIB_LOOKUP_H #define _FIB_LOOKUP_H #include <linux/types.h> #include <linux/list.h> #include <net/inet_dscp.h> #include <net/ip_fib.h> #include <net/nexthop.h> struct fib_alias { struct hlist_node fa_list; struct fib_info *fa_info; dscp_t fa_dscp; u8 fa_type; u8 fa_state; u8 fa_slen; u32 tb_id; s16 fa_default; u8 offload; u8 trap; u8 offload_failed; struct rcu_head rcu; }; #define FA_S_ACCESSED 0x01 /* Don't write on fa_state unless needed, to keep it shared on all cpus */ static inline void fib_alias_accessed(struct fib_alias *fa) { if (!(fa->fa_state & FA_S_ACCESSED)) fa->fa_state |= FA_S_ACCESSED; } /* Exported by fib_semantics.c */ void fib_release_info(struct fib_info *); struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack); int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, const struct fib_rt_info *fri, unsigned int flags); void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); size_t fib_nlmsg_size(struct fib_info *fi); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) { /* we used to play games with refcounts, but we now use RCU */ res->fi = fi; res->nhc = fib_info_nhc(fi, 0); } struct fib_prop { int error; u8 scope; }; extern const struct fib_prop fib_props[RTN_MAX + 1]; #endif /* _FIB_LOOKUP_H */
67 67 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 // SPDX-License-Identifier: GPL-2.0 #include <linux/sysctl.h> #include <net/lwtunnel.h> #include <net/netfilter/nf_hooks_lwtunnel.h> #include <linux/netfilter.h> #include "nf_internals.h" static inline int nf_hooks_lwtunnel_get(void) { if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return 1; else return 0; } static inline int nf_hooks_lwtunnel_set(int enable) { if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) { if (!enable) return -EBUSY; } else if (enable) { static_branch_enable(&nf_hooks_lwtunnel_enabled); } return 0; } #ifdef CONFIG_SYSCTL int nf_hooks_lwtunnel_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int proc_nf_hooks_lwtunnel_enabled = 0; struct ctl_table tmp = { .procname = table->procname, .data = &proc_nf_hooks_lwtunnel_enabled, .maxlen = sizeof(int), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }; int ret; if (!write) proc_nf_hooks_lwtunnel_enabled = nf_hooks_lwtunnel_get(); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) ret = nf_hooks_lwtunnel_set(proc_nf_hooks_lwtunnel_enabled); return ret; } EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler); static struct ctl_table nf_lwtunnel_sysctl_table[] = { { .procname = "nf_hooks_lwtunnel", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = nf_hooks_lwtunnel_sysctl_handler, }, }; static int __net_init nf_lwtunnel_net_init(struct net *net) { struct ctl_table_header *hdr; struct ctl_table *table; table = nf_lwtunnel_sysctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(nf_lwtunnel_sysctl_table, sizeof(nf_lwtunnel_sysctl_table), GFP_KERNEL); if (!table) goto err_alloc; } hdr = register_net_sysctl_sz(net, "net/netfilter", table, ARRAY_SIZE(nf_lwtunnel_sysctl_table)); if (!hdr) goto err_reg; net->nf.nf_lwtnl_dir_header = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit nf_lwtunnel_net_exit(struct net *net) { const struct ctl_table *table; table = net->nf.nf_lwtnl_dir_header->ctl_table_arg; unregister_net_sysctl_table(net->nf.nf_lwtnl_dir_header); if (!net_eq(net, &init_net)) kfree(table); } static struct pernet_operations nf_lwtunnel_net_ops = { .init = nf_lwtunnel_net_init, .exit = nf_lwtunnel_net_exit, }; int __init netfilter_lwtunnel_init(void) { return register_pernet_subsys(&nf_lwtunnel_net_ops); } void netfilter_lwtunnel_fini(void) { unregister_pernet_subsys(&nf_lwtunnel_net_ops); } #else int __init netfilter_lwtunnel_init(void) { return 0; } void netfilter_lwtunnel_fini(void) {} #endif /* CONFIG_SYSCTL */
1110 11 25 25 25 25 25 25 25 25 25 25 25 11 11 25 25 11 2 2 42 42 1053 1057 1044 1048 839 838 840 839 833 783 266 245 338 1 158 339 338 158 340 420 255 254 255 254 7 7 7 7 1033 1071 832 831 255 216 229 256 178 35 149 821 823 823 822 818 821 823 820 821 822 15 171 658 968 968 967 862 966 653 653 745 744 281 104 275 274 273 250 249 177 153 153 160 2 158 156 949 894 416 954 953 951 207 208 824 822 822 820 819 172 824 254 255 255 255 255 254 255 255 254 255 254 254 254 204 203 258 258 254 156 129 289 190 250 42 42 42 42 42 42 42 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/dir.c - kernfs directory implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/sched.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/hash.h> #include "kernfs-internal.h" static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ /* * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to * call pr_cont() while holding rename_lock. Because sometimes pr_cont() * will perform wakeups when releasing console_sem. Holding rename_lock * will introduce deadlock if the scheduler reads the kernfs_name in the * wakeup path. */ static DEFINE_SPINLOCK(kernfs_pr_cont_lock); static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) static bool __kernfs_active(struct kernfs_node *kn) { return atomic_read(&kn->active) >= 0; } static bool kernfs_active(struct kernfs_node *kn) { lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); return __kernfs_active(kn); } static bool kernfs_lockdep(struct kernfs_node *kn) { #ifdef CONFIG_DEBUG_LOCK_ALLOC return kn->flags & KERNFS_LOCKDEP; #else return false; #endif } static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) { if (!kn) return strscpy(buf, "(null)", buflen); return strscpy(buf, kn->parent ? kn->name : "/", buflen); } /* kernfs_node_depth - compute depth from @from to @to */ static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to) { size_t depth = 0; while (to->parent && to != from) { depth++; to = to->parent; } return depth; } static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, struct kernfs_node *b) { size_t da, db; struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b); if (ra != rb) return NULL; da = kernfs_depth(ra->kn, a); db = kernfs_depth(rb->kn, b); while (da > db) { a = a->parent; da--; } while (db > da) { b = b->parent; db--; } /* worst case b and a will be the same at root */ while (b != a) { b = b->parent; a = a->parent; } return a; } /** * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to, * where kn_from is treated as root of the path. * @kn_from: kernfs node which should be treated as root for the path * @kn_to: kernfs node to which path is needed * @buf: buffer to copy the path into * @buflen: size of @buf * * We need to handle couple of scenarios here: * [1] when @kn_from is an ancestor of @kn_to at some level * kn_from: /n1/n2/n3 * kn_to: /n1/n2/n3/n4/n5 * result: /n4/n5 * * [2] when @kn_from is on a different hierarchy and we need to find common * ancestor between @kn_from and @kn_to. * kn_from: /n1/n2/n3/n4 * kn_to: /n1/n2/n5 * result: /../../n5 * OR * kn_from: /n1/n2/n3/n4/n5 [depth=5] * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * * [3] when @kn_to is %NULL result will be "(null)" * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn_from, char *buf, size_t buflen) { struct kernfs_node *kn, *common; const char parent_str[] = "/.."; size_t depth_from, depth_to, len = 0; ssize_t copied; int i, j; if (!kn_to) return strscpy(buf, "(null)", buflen); if (!kn_from) kn_from = kernfs_root(kn_to)->kn; if (kn_from == kn_to) return strscpy(buf, "/", buflen); common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) return -EINVAL; depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); buf[0] = '\0'; for (i = 0; i < depth_from; i++) { copied = strscpy(buf + len, parent_str, buflen - len); if (copied < 0) return copied; len += copied; } /* Calculate how many bytes we need for the rest */ for (i = depth_to - 1; i >= 0; i--) { for (kn = kn_to, j = 0; j < i; j++) kn = kn->parent; len += scnprintf(buf + len, buflen - len, "/%s", kn->name); } return len; } /** * kernfs_name - obtain the name of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into * @buflen: size of @buf * * Copies the name of @kn into @buf of @buflen bytes. The behavior is * similar to strscpy(). * * Fills buffer with "(null)" if @kn is %NULL. * * Return: the resulting length of @buf. If @buf isn't long enough, * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG. * * This function can be called from any context. */ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { unsigned long flags; int ret; read_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_name_locked(kn, buf, buflen); read_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } /** * kernfs_path_from_node - build path of node @to relative to @from. * @from: parent kernfs_node relative to which we need to build the path * @to: kernfs_node of interest * @buf: buffer to copy @to's path into * @buflen: size of @buf * * Builds @to's path relative to @from in @buf. @from and @to must * be on the same kernfs-root. If @from is not parent of @to, then a relative * path (which includes '..'s) as needed to reach from @from to @to is * returned. * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) { unsigned long flags; int ret; read_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_path_from_node_locked(to, from, buf, buflen); read_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } EXPORT_SYMBOL_GPL(kernfs_path_from_node); /** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_name(struct kernfs_node *kn) { unsigned long flags; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); pr_cont("%s", kernfs_pr_cont_buf); spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * pr_cont_kernfs_path - pr_cont path of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_path(struct kernfs_node *kn) { unsigned long flags; int sz; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); if (sz < 0) { if (sz == -E2BIG) pr_cont("(name too long)"); else pr_cont("(error)"); goto out; } pr_cont("%s", kernfs_pr_cont_buf); out: spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * kernfs_get_parent - determine the parent node and pin it * @kn: kernfs_node of interest * * Determines @kn's parent, pins and returns it. This function can be * called from any context. * * Return: parent node of @kn */ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { struct kernfs_node *parent; unsigned long flags; read_lock_irqsave(&kernfs_rename_lock, flags); parent = kn->parent; kernfs_get(parent); read_unlock_irqrestore(&kernfs_rename_lock, flags); return parent; } /** * kernfs_name_hash - calculate hash of @ns + @name * @name: Null terminated string to hash * @ns: Namespace tag to hash * * Return: 31-bit hash of ns + name (so it fits in an off_t) */ static unsigned int kernfs_name_hash(const char *name, const void *ns) { unsigned long hash = init_name_hash(ns); unsigned int len = strlen(name); while (len--) hash = partial_name_hash(*name++, hash); hash = end_name_hash(hash); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ if (hash < 2) hash += 2; if (hash >= INT_MAX) hash = INT_MAX - 1; return hash; } static int kernfs_name_compare(unsigned int hash, const char *name, const void *ns, const struct kernfs_node *kn) { if (hash < kn->hash) return -1; if (hash > kn->hash) return 1; if (ns < kn->ns) return -1; if (ns > kn->ns) return 1; return strcmp(name, kn->name); } static int kernfs_sd_compare(const struct kernfs_node *left, const struct kernfs_node *right) { return kernfs_name_compare(left->hash, left->name, left->ns, right); } /** * kernfs_link_sibling - link kernfs_node into sibling rbtree * @kn: kernfs_node of interest * * Link @kn into its sibling rbtree which starts from * @kn->parent->dir.children. * * Locking: * kernfs_rwsem held exclusive * * Return: * %0 on success, -EEXIST on failure. */ static int kernfs_link_sibling(struct kernfs_node *kn) { struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; while (*node) { struct kernfs_node *pos; int result; pos = rb_to_kn(*node); parent = *node; result = kernfs_sd_compare(kn, pos); if (result < 0) node = &pos->rb.rb_left; else if (result > 0) node = &pos->rb.rb_right; else return -EEXIST; } /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn->parent->dir.children); /* successfully added, account subdir number */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs++; kernfs_inc_rev(kn->parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); return 0; } /** * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree * @kn: kernfs_node of interest * * Try to unlink @kn from its sibling rbtree which starts from * kn->parent->dir.children. * * Return: %true if @kn was actually removed, * %false if @kn wasn't on the rbtree. * * Locking: * kernfs_rwsem held exclusive */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { if (RB_EMPTY_NODE(&kn->rb)) return false; down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; kernfs_inc_rev(kn->parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); rb_erase(&kn->rb, &kn->parent->dir.children); RB_CLEAR_NODE(&kn->rb); return true; } /** * kernfs_get_active - get an active reference to kernfs_node * @kn: kernfs_node to get an active reference to * * Get an active reference of @kn. This function is noop if @kn * is %NULL. * * Return: * Pointer to @kn on success, %NULL on failure. */ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) { if (unlikely(!kn)) return NULL; if (!atomic_inc_unless_negative(&kn->active)) return NULL; if (kernfs_lockdep(kn)) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } /** * kernfs_put_active - put an active reference to kernfs_node * @kn: kernfs_node to put an active reference to * * Put an active reference to @kn. This function is noop if @kn * is %NULL. */ void kernfs_put_active(struct kernfs_node *kn) { int v; if (unlikely(!kn)) return; if (kernfs_lockdep(kn)) rwsem_release(&kn->dep_map, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; wake_up_all(&kernfs_root(kn)->deactivate_waitq); } /** * kernfs_drain - drain kernfs_node * @kn: kernfs_node to drain * * Drain existing usages and nuke all existing mmaps of @kn. Multiple * removers may invoke this function concurrently on @kn and all will * return after draining is complete. */ static void kernfs_drain(struct kernfs_node *kn) __releases(&kernfs_root(kn)->kernfs_rwsem) __acquires(&kernfs_root(kn)->kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); /* * Skip draining if already fully drained. This avoids draining and its * lockdep annotations for nodes which have never been activated * allowing embedding kernfs_remove() in create error paths without * worrying about draining. */ if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS && !kernfs_should_drain_open_files(kn)) return; up_write(&root->kernfs_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) lock_contended(&kn->dep_map, _RET_IP_); } wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); if (kernfs_lockdep(kn)) { lock_acquired(&kn->dep_map, _RET_IP_); rwsem_release(&kn->dep_map, _RET_IP_); } if (kernfs_should_drain_open_files(kn)) kernfs_drain_open_files(kn); down_write(&root->kernfs_rwsem); } /** * kernfs_get - get a reference count on a kernfs_node * @kn: the target kernfs_node */ void kernfs_get(struct kernfs_node *kn) { if (kn) { WARN_ON(!atomic_read(&kn->count)); atomic_inc(&kn->count); } } EXPORT_SYMBOL_GPL(kernfs_get); static void kernfs_free_rcu(struct rcu_head *rcu) { struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); kfree_const(kn->name); if (kn->iattr) { simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } kmem_cache_free(kernfs_node_cache, kn); } /** * kernfs_put - put a reference count on a kernfs_node * @kn: the target kernfs_node * * Put a reference count of @kn and destroy it if it reached zero. */ void kernfs_put(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); repeat: /* * Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ parent = kn->parent; WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, "kernfs_put: %s/%s: released with incorrect active_ref %d\n", parent ? parent->name : "", kn->name, atomic_read(&kn->active)); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); call_rcu(&kn->rcu, kernfs_free_rcu); kn = parent; if (kn) { if (atomic_dec_and_test(&kn->count)) goto repeat; } else { /* just released the root kn, free @root too */ idr_destroy(&root->ino_idr); kfree_rcu(root, rcu); } } EXPORT_SYMBOL_GPL(kernfs_put); /** * kernfs_node_from_dentry - determine kernfs_node associated with a dentry * @dentry: the dentry in question * * Return: the kernfs_node associated with @dentry. If @dentry is not a * kernfs one, %NULL is returned. * * While the returned kernfs_node will stay accessible as long as @dentry * is accessible, the returned node can be in any state and the caller is * fully responsible for determining what's accessible. */ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { if (dentry->d_sb->s_op == &kernfs_sops) return kernfs_dentry_node(dentry); return NULL; } static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; u32 id_highbits; int ret; name = kstrdup_const(name, GFP_KERNEL); if (!name) return NULL; kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); if (!kn) goto err_out1; idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++; id_highbits = root->id_highbits; root->last_id_lowbits = ret; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; kn->id = (u64)id_highbits << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); kn->name = name; kn->mode = mode; kn->flags = flags; if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) { struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = uid, .ia_gid = gid, }; ret = __kernfs_setattr(kn, &iattr); if (ret < 0) goto err_out3; } if (parent) { ret = security_kernfs_init_security(parent, kn); if (ret) goto err_out3; } return kn; err_out3: spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: kfree_const(name); return NULL; } struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; if (parent->mode & S_ISGID) { /* this code block imitates inode_init_owner() for * kernfs */ if (parent->iattr) gid = parent->iattr->ia_gid; if (flags & KERNFS_DIR) mode |= S_ISGID; } kn = __kernfs_new_node(kernfs_root(parent), parent, name, mode, uid, gid, flags); if (kn) { kernfs_get(parent); kn->parent = parent; } return kn; } /* * kernfs_find_and_get_node_by_id - get kernfs_node from node id * @root: the kernfs root * @id: the target node id * * @id's lower 32bits encode ino and upper gen. If the gen portion is * zero, all generations are matched. * * Return: %NULL on failure, * otherwise a kernfs node with reference counter incremented. */ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id) { struct kernfs_node *kn; ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); rcu_read_lock(); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) goto err_unlock; if (sizeof(ino_t) >= sizeof(u64)) { /* we looked up with the low 32bits, compare the whole */ if (kernfs_ino(kn) != ino) goto err_unlock; } else { /* 0 matches all generations */ if (unlikely(gen && kernfs_gen(kn) != gen)) goto err_unlock; } /* * We should fail if @kn has never been activated and guarantee success * if the caller knows that @kn is active. Both can be achieved by * __kernfs_active() which tests @kn->active without kernfs_rwsem. */ if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; rcu_read_unlock(); return kn; err_unlock: rcu_read_unlock(); return NULL; } /** * kernfs_add_one - add kernfs_node to parent without warning * @kn: kernfs_node to be added * * The caller must already have initialized @kn->parent. This * function increments nlink of the parent's inode if @kn is a * directory and link into the children list of the parent. * * Return: * %0 on success, -EEXIST if entry with the given name already * exists. */ int kernfs_add_one(struct kernfs_node *kn) { struct kernfs_node *parent = kn->parent; struct kernfs_root *root = kernfs_root(parent); struct kernfs_iattrs *ps_iattr; bool has_ns; int ret; down_write(&root->kernfs_rwsem); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", parent->name, kn->name)) goto out_unlock; if (kernfs_type(parent) != KERNFS_DIR) goto out_unlock; ret = -ENOENT; if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) goto out_unlock; kn->hash = kernfs_name_hash(kn->name, kn->ns); ret = kernfs_link_sibling(kn); if (ret) goto out_unlock; /* Update timestamps on the parent */ down_write(&root->kernfs_iattr_rwsem); ps_iattr = parent->iattr; if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&root->kernfs_iattr_rwsem); up_write(&root->kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. * If not activated here, the kernfs user is responsible for * activating the node with kernfs_activate(). A node which hasn't * been activated is not visible to userland and its removal won't * trigger deactivation. */ if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return 0; out_unlock: up_write(&root->kernfs_rwsem); return ret; } /** * kernfs_find_ns - find kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, const unsigned char *name, const void *ns) { struct rb_node *node = parent->dir.children.rb_node; bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", parent->name, name); return NULL; } hash = kernfs_name_hash(name, ns); while (node) { struct kernfs_node *kn; int result; kn = rb_to_kn(node); result = kernfs_name_compare(hash, name, ns, kn); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return kn; } return NULL; } static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, const unsigned char *path, const void *ns) { ssize_t len; char *p, *name; lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); spin_lock_irq(&kernfs_pr_cont_lock); len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); if (len < 0) { spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } p = kernfs_pr_cont_buf; while ((name = strsep(&p, "/")) && parent) { if (*name == '\0') continue; parent = kernfs_find_ns(parent, name, ns); } spin_unlock_irq(&kernfs_pr_cont_lock); return parent; } /** * kernfs_find_and_get_ns - find and get kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** * kernfs_walk_and_get_ns - find and get kernfs_node with the given path * @parent: kernfs_node to search under * @path: path to look for * @ns: the namespace tag to use * * Look for kernfs_node with path @path under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } /** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy * @flags: KERNFS_ROOT_* flags * @priv: opaque data associated with the new directory * * Return: the root of the new hierarchy on success, ERR_PTR() value on * failure. */ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { struct kernfs_root *root; struct kernfs_node *kn; root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); idr_init(&root->ino_idr); init_rwsem(&root->kernfs_rwsem); init_rwsem(&root->kernfs_iattr_rwsem); init_rwsem(&root->kernfs_supers_rwsem); INIT_LIST_HEAD(&root->supers); /* * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. * High bits generation. The starting value for both ino and * genenration is 1. Initialize upper 32bit allocation * accordingly. */ if (sizeof(ino_t) >= sizeof(u64)) root->id_highbits = 0; else root->id_highbits = 1; kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) { idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } kn->priv = priv; kn->dir.root = root; root->syscall_ops = scops; root->flags = flags; root->kn = kn; init_waitqueue_head(&root->deactivate_waitq); if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return root; } /** * kernfs_destroy_root - destroy a kernfs hierarchy * @root: root of the hierarchy to destroy * * Destroy the hierarchy anchored at @root by removing all existing * directories and destroying @root. */ void kernfs_destroy_root(struct kernfs_root *root) { /* * kernfs_remove holds kernfs_rwsem from the root so the root * shouldn't be freed during the operation. */ kernfs_get(root->kn); kernfs_remove(root->kn); kernfs_put(root->kn); /* will also free @root */ } /** * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root * @root: root to use to lookup * * Return: @root's kernfs_node */ struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) { return root->kn; } /** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory * @name: name of the new directory * @mode: mode of the new directory * @uid: uid of the new directory * @gid: gid of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, mode | S_IFDIR, uid, gid, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->dir.root = parent->dir.root; kn->ns = ns; kn->priv = priv; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } /** * kernfs_create_empty_dir - create an always empty directory * @parent: parent in which to create a new directory * @name: name of the new directory * * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->flags |= KERNFS_EMPTY_DIR; kn->dir.root = parent->dir.root; kn->ns = NULL; kn->priv = NULL; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn; struct kernfs_root *root; if (flags & LOOKUP_RCU) return -ECHILD; /* Negative hashed dentry? */ if (d_really_is_negative(dentry)) { struct kernfs_node *parent; /* If the kernfs parent node has changed discard and * proceed to ->lookup. * * There's nothing special needed here when getting the * dentry parent, even if a concurrent rename is in * progress. That's because the dentry is negative so * it can only be the target of the rename and it will * be doing a d_move() not a replace. Consequently the * dentry d_parent won't change over the d_move(). * * Also kernfs negative dentries transitioning from * negative to positive during revalidate won't happen * because they are invalidated on containing directory * changes and the lookup re-done so that a new positive * dentry can be properly created. */ root = kernfs_root_from_sb(dentry->d_sb); down_read(&root->kernfs_rwsem); parent = kernfs_dentry_node(dentry->d_parent); if (parent) { if (kernfs_dir_changed(parent, dentry)) { up_read(&root->kernfs_rwsem); return 0; } } up_read(&root->kernfs_rwsem); /* The kernfs parent node hasn't changed, leave the * dentry negative and return success. */ return 1; } kn = kernfs_dentry_node(dentry); root = kernfs_root(kn); down_read(&root->kernfs_rwsem); /* The kernfs node has been deactivated */ if (!kernfs_active(kn)) goto out_bad; /* The kernfs node has been moved? */ if (kernfs_dentry_node(dentry->d_parent) != kn->parent) goto out_bad; /* The kernfs node has been renamed */ if (strcmp(dentry->d_name.name, kn->name) != 0) goto out_bad; /* The kernfs node has been moved to a different namespace */ if (kn->parent && kernfs_ns_enabled(kn->parent) && kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; up_read(&root->kernfs_rwsem); return 1; out_bad: up_read(&root->kernfs_rwsem); return 0; } const struct dentry_operations kernfs_dops = { .d_revalidate = kernfs_dop_revalidate, }; static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; struct kernfs_root *root; struct inode *inode = NULL; const void *ns = NULL; root = kernfs_root(parent); down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* attach dentry and inode */ if (kn) { /* Inactive nodes are invisible to the VFS so don't * create a negative. */ if (!kernfs_active(kn)) { up_read(&root->kernfs_rwsem); return NULL; } inode = kernfs_get_inode(dir->i_sb, kn); if (!inode) inode = ERR_PTR(-ENOMEM); } /* * Needed for negative dentry validation. * The negative dentry can be created in kernfs_iop_lookup() * or transforms from positive dentry in dentry_unlink_inode() * called from vfs_rmdir(). */ if (!IS_ERR(inode)) kernfs_set_rev(parent, dentry); up_read(&root->kernfs_rwsem); /* instantiate and hash (possibly negative) dentry */ return d_splice_alias(inode, dentry); } static int kernfs_iop_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct kernfs_node *parent = dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; int ret; if (!scops || !scops->mkdir) return -EPERM; if (!kernfs_get_active(parent)) return -ENODEV; ret = scops->mkdir(parent, dentry->d_name.name, mode); kernfs_put_active(parent); return ret; } static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (!scops || !scops->rmdir) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; ret = scops->rmdir(kn); kernfs_put_active(kn); return ret; } static int kernfs_iop_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct kernfs_node *kn = kernfs_dentry_node(old_dentry); struct kernfs_node *new_parent = new_dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (flags) return -EINVAL; if (!scops || !scops->rename) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; if (!kernfs_get_active(new_parent)) { kernfs_put_active(kn); return -ENODEV; } ret = scops->rename(kn, new_parent, new_dentry->d_name.name); kernfs_put_active(new_parent); kernfs_put_active(kn); return ret; } const struct inode_operations kernfs_dir_iops = { .lookup = kernfs_iop_lookup, .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .listxattr = kernfs_iop_listxattr, .mkdir = kernfs_iop_mkdir, .rmdir = kernfs_iop_rmdir, .rename = kernfs_iop_rename, }; static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) { struct kernfs_node *last; while (true) { struct rb_node *rbn; last = pos; if (kernfs_type(pos) != KERNFS_DIR) break; rbn = rb_first(&pos->dir.children); if (!rbn) break; pos = rb_to_kn(rbn); } return last; } /** * kernfs_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: kernfs_node whose descendants to walk * * Find the next descendant to visit for post-order traversal of @root's * descendants. @root is included in the iteration and the last node to be * visited. * * Return: the next descendant to visit or %NULL when done. */ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, struct kernfs_node *root) { struct rb_node *rbn; lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) return kernfs_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ rbn = rb_next(&pos->rb); if (rbn) return kernfs_leftmost_descendant(rb_to_kn(rbn)); /* no sibling left, visit parent */ return pos->parent; } static void kernfs_activate_one(struct kernfs_node *kn) { lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); kn->flags |= KERNFS_ACTIVATED; if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))) return; WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb)); WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); } /** * kernfs_activate - activate a node which started deactivated * @kn: kernfs_node whose subtree is to be activated * * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node * needs to be explicitly activated. A node which hasn't been activated * isn't visible to userland and deactivation is skipped during its * removal. This is useful to construct atomic init sequences where * creation of multiple nodes should either succeed or fail atomically. * * The caller is responsible for ensuring that this function is not called * after kernfs_remove*() is invoked on @kn. */ void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_rwsem); pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) kernfs_activate_one(pos); up_write(&root->kernfs_rwsem); } /** * kernfs_show - show or hide a node * @kn: kernfs_node to show or hide * @show: whether to show or hide * * If @show is %false, @kn is marked hidden and deactivated. A hidden node is * ignored in future activaitons. If %true, the mark is removed and activation * state is restored. This function won't implicitly activate a new node in a * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet. * * To avoid recursion complexities, directories aren't supported for now. */ void kernfs_show(struct kernfs_node *kn, bool show) { struct kernfs_root *root = kernfs_root(kn); if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR)) return; down_write(&root->kernfs_rwsem); if (show) { kn->flags &= ~KERNFS_HIDDEN; if (kn->flags & KERNFS_ACTIVATED) kernfs_activate_one(kn); } else { kn->flags |= KERNFS_HIDDEN; if (kernfs_active(kn)) atomic_add(KN_DEACTIVATED_BIAS, &kn->active); kernfs_drain(kn); } up_write(&root->kernfs_rwsem); } static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos; /* Short-circuit if non-root @kn has already finished removal. */ if (!kn) return; lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); /* * This is for kernfs_remove_self() which plays with active ref * after removal. */ if (kn->parent && RB_EMPTY_NODE(&kn->rb)) return; pr_debug("kernfs %s: removing\n", kn->name); /* prevent new usage by marking all nodes removing and deactivating */ pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { pos->flags |= KERNFS_REMOVING; if (kernfs_active(pos)) atomic_add(KN_DEACTIVATED_BIAS, &pos->active); } /* deactivate and unlink the subtree node-by-node */ do { pos = kernfs_leftmost_descendant(kn); /* * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. */ kernfs_get(pos); kernfs_drain(pos); /* * kernfs_unlink_sibling() succeeds once per node. Use it * to decide who's responsible for cleanups. */ if (!pos->parent || kernfs_unlink_sibling(pos)) { struct kernfs_iattrs *ps_iattr = pos->parent ? pos->parent->iattr : NULL; /* update timestamps on the parent */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); kernfs_put(pos); } kernfs_put(pos); } while (pos != kn); } /** * kernfs_remove - remove a kernfs_node recursively * @kn: the kernfs_node to remove * * Remove @kn along with all its subdirectories and files. */ void kernfs_remove(struct kernfs_node *kn) { struct kernfs_root *root; if (!kn) return; root = kernfs_root(kn); down_write(&root->kernfs_rwsem); __kernfs_remove(kn); up_write(&root->kernfs_rwsem); } /** * kernfs_break_active_protection - break out of active protection * @kn: the self kernfs_node * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. Each invocation of * this function must also be matched with an invocation of * kernfs_unbreak_active_protection(). * * This function releases the active reference of @kn the caller is * holding. Once this function is called, @kn may be removed at any point * and the caller is solely responsible for ensuring that the objects it * dereferences are accessible. */ void kernfs_break_active_protection(struct kernfs_node *kn) { /* * Take out ourself out of the active ref dependency chain. If * we're called without an active ref, lockdep will complain. */ kernfs_put_active(kn); } /** * kernfs_unbreak_active_protection - undo kernfs_break_active_protection() * @kn: the self kernfs_node * * If kernfs_break_active_protection() was called, this function must be * invoked before finishing the kernfs operation. Note that while this * function restores the active reference, it doesn't and can't actually * restore the active protection - @kn may already or be in the process of * being removed. Once kernfs_break_active_protection() is invoked, that * protection is irreversibly gone for the kernfs operation instance. * * While this function may be called at any point after * kernfs_break_active_protection() is invoked, its most useful location * would be right before the enclosing kernfs operation returns. */ void kernfs_unbreak_active_protection(struct kernfs_node *kn) { /* * @kn->active could be in any state; however, the increment we do * here will be undone as soon as the enclosing kernfs operation * finishes and this temporary bump can't break anything. If @kn * is alive, nothing changes. If @kn is being deactivated, the * soon-to-follow put will either finish deactivation or restore * deactivated state. If @kn is already removed, the temporary * bump is guaranteed to be gone before @kn is released. */ atomic_inc(&kn->active); if (kernfs_lockdep(kn)) rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); } /** * kernfs_remove_self - remove a kernfs_node from its own method * @kn: the self kernfs_node to remove * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. This can be used to * implement a file operation which deletes itself. * * For example, the "delete" file for a sysfs device directory can be * implemented by invoking kernfs_remove_self() on the "delete" file * itself. This function breaks the circular dependency of trying to * deactivate self while holding an active ref itself. It isn't necessary * to modify the usual removal path to use kernfs_remove_self(). The * "delete" implementation can simply invoke kernfs_remove_self() on self * before proceeding with the usual removal path. kernfs will ignore later * kernfs_remove() on self. * * kernfs_remove_self() can be called multiple times concurrently on the * same kernfs_node. Only the first one actually performs removal and * returns %true. All others will wait until the kernfs operation which * won self-removal finishes and return %false. Note that the losers wait * for the completion of not only the winning kernfs_remove_self() but also * the whole kernfs_ops which won the arbitration. This can be used to * guarantee, for example, all concurrent writes to a "delete" file to * finish only after the whole operation is complete. * * Return: %true if @kn is removed by this call, otherwise %false. */ bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_rwsem); kernfs_break_active_protection(kn); /* * SUICIDAL is used to arbitrate among competing invocations. Only * the first one will actually perform removal. When the removal * is complete, SUICIDED is set and the active ref is restored * while kernfs_rwsem for held exclusive. The ones which lost * arbitration waits for SUICIDED && drained which can happen only * after the enclosing kernfs operation which executed the winning * instance of kernfs_remove_self() finished. */ if (!(kn->flags & KERNFS_SUICIDAL)) { kn->flags |= KERNFS_SUICIDAL; __kernfs_remove(kn); kn->flags |= KERNFS_SUICIDED; ret = true; } else { wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; DEFINE_WAIT(wait); while (true) { prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); if ((kn->flags & KERNFS_SUICIDED) && atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break; up_write(&root->kernfs_rwsem); schedule(); down_write(&root->kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); ret = false; } /* * This must be done while kernfs_rwsem held exclusive; otherwise, * waiting for SUICIDED && deactivated could finish prematurely. */ kernfs_unbreak_active_protection(kn); up_write(&root->kernfs_rwsem); return ret; } /** * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it * @parent: parent of the target * @name: name of the kernfs_node to remove * @ns: namespace tag of the kernfs_node to remove * * Look for the kernfs_node with @name and @ns under @parent and remove it. * * Return: %0 on success, -ENOENT if such entry doesn't exist. */ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root; if (!parent) { WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", name); return -ENOENT; } root = kernfs_root(parent); down_write(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); if (kn) { kernfs_get(kn); __kernfs_remove(kn); kernfs_put(kn); } up_write(&root->kernfs_rwsem); if (kn) return 0; else return -ENOENT; } /** * kernfs_rename_ns - move and rename a kernfs_node * @kn: target node * @new_parent: new parent to put @sd under * @new_name: new name * @new_ns: new namespace tag * * Return: %0 on success, -errno on failure. */ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { struct kernfs_node *old_parent; struct kernfs_root *root; const char *old_name = NULL; int error; /* can't move or rename root */ if (!kn->parent) return -EINVAL; root = kernfs_root(kn); down_write(&root->kernfs_rwsem); error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || (new_parent->flags & KERNFS_EMPTY_DIR)) goto out; error = 0; if ((kn->parent == new_parent) && (kn->ns == new_ns) && (strcmp(kn->name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; if (kernfs_find_ns(new_parent, new_name, new_ns)) goto out; /* rename kernfs_node */ if (strcmp(kn->name, new_name) != 0) { error = -ENOMEM; new_name = kstrdup_const(new_name, GFP_KERNEL); if (!new_name) goto out; } else { new_name = NULL; } /* * Move to the appropriate place in the appropriate directories rbtree. */ kernfs_unlink_sibling(kn); kernfs_get(new_parent); /* rename_lock protects ->parent and ->name accessors */ write_lock_irq(&kernfs_rename_lock); old_parent = kn->parent; kn->parent = new_parent; kn->ns = new_ns; if (new_name) { old_name = kn->name; kn->name = new_name; } write_unlock_irq(&kernfs_rename_lock); kn->hash = kernfs_name_hash(kn->name, kn->ns); kernfs_link_sibling(kn); kernfs_put(old_parent); kfree_const(old_name); error = 0; out: up_write(&root->kernfs_rwsem); return error; } static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) { kernfs_put(filp->private_data); return 0; } static struct kernfs_node *kernfs_dir_pos(const void *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { int valid = kernfs_active(pos) && pos->parent == parent && hash == pos->hash; kernfs_put(pos); if (!valid) pos = NULL; } if (!pos && (hash > 1) && (hash < INT_MAX)) { struct rb_node *node = parent->dir.children.rb_node; while (node) { pos = rb_to_kn(node); if (hash < pos->hash) node = node->rb_left; else if (hash > pos->hash) node = node->rb_right; else break; } } /* Skip over entries which are dying/dead or in the wrong namespace */ while (pos && (!kernfs_active(pos) || pos->ns != ns)) { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } return pos; } static struct kernfs_node *kernfs_dir_next_pos(const void *ns, struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) { pos = kernfs_dir_pos(ns, parent, ino, pos); if (pos) { do { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } while (pos && (!kernfs_active(pos) || pos->ns != ns)); } return pos; } static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; struct kernfs_root *root; const void *ns = NULL; if (!dir_emit_dots(file, ctx)) return 0; root = kernfs_root(parent); down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); pos; pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { const char *name = pos->name; unsigned int type = fs_umode_to_dtype(pos->mode); int len = strlen(name); ino_t ino = kernfs_ino(pos); ctx->pos = pos->hash; file->private_data = pos; kernfs_get(pos); up_read(&root->kernfs_rwsem); if (!dir_emit(ctx, name, len, ino, type)) return 0; down_read(&root->kernfs_rwsem); } up_read(&root->kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; } const struct file_operations kernfs_dir_fops = { .read = generic_read_dir, .iterate_shared = kernfs_fop_readdir, .release = kernfs_dir_fop_release, .llseek = generic_file_llseek, };
622 621 622 496 622 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/symlink.c - kernfs symlink implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/gfp.h> #include <linux/namei.h> #include "kernfs-internal.h" /** * kernfs_create_link - create a symlink * @parent: directory to create the symlink in * @name: name of the symlink * @target: target node for the symlink to point to * * Return: the created node on success, ERR_PTR() value on error. * Ownership of the link matches ownership of the target. */ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target) { struct kernfs_node *kn; int error; kuid_t uid = GLOBAL_ROOT_UID; kgid_t gid = GLOBAL_ROOT_GID; if (target->iattr) { uid = target->iattr->ia_uid; gid = target->iattr->ia_gid; } kn = kernfs_new_node(parent, name, S_IFLNK|0777, uid, gid, KERNFS_LINK); if (!kn) return ERR_PTR(-ENOMEM); if (kernfs_ns_enabled(parent)) kn->ns = target->ns; kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ error = kernfs_add_one(kn); if (!error) return kn; kernfs_put(kn); return ERR_PTR(error); } static int kernfs_get_target_path(struct kernfs_node *parent, struct kernfs_node *target, char *path) { struct kernfs_node *base, *kn; char *s = path; int len = 0; /* go up to the root, stop at the base */ base = parent; while (base->parent) { kn = target->parent; while (kn->parent && base != kn) kn = kn->parent; if (base == kn) break; if ((s - path) + 3 >= PATH_MAX) return -ENAMETOOLONG; strcpy(s, "../"); s += 3; base = base->parent; } /* determine end of target string for reverse fillup */ kn = target; while (kn->parent && kn != base) { len += strlen(kn->name) + 1; kn = kn->parent; } /* check limits */ if (len < 2) return -EINVAL; len--; if ((s - path) + len >= PATH_MAX) return -ENAMETOOLONG; /* reverse fillup of target string from target to base */ kn = target; while (kn->parent && kn != base) { int slen = strlen(kn->name); len -= slen; memcpy(s + len, kn->name, slen); if (len) s[--len] = '/'; kn = kn->parent; } return 0; } static int kernfs_getlink(struct inode *inode, char *path) { struct kernfs_node *kn = inode->i_private; struct kernfs_node *parent = kn->parent; struct kernfs_node *target = kn->symlink.target_kn; struct kernfs_root *root = kernfs_root(parent); int error; down_read(&root->kernfs_rwsem); error = kernfs_get_target_path(parent, target, path); up_read(&root->kernfs_rwsem); return error; } static const char *kernfs_iop_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { char *body; int error; if (!dentry) return ERR_PTR(-ECHILD); body = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!body) return ERR_PTR(-ENOMEM); error = kernfs_getlink(inode, body); if (unlikely(error < 0)) { kfree(body); return ERR_PTR(error); } set_delayed_call(done, kfree_link, body); return body; } const struct inode_operations kernfs_symlink_iops = { .listxattr = kernfs_iop_listxattr, .get_link = kernfs_iop_get_link, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .permission = kernfs_iop_permission, };
441 439 439 439 443 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 // SPDX-License-Identifier: GPL-2.0-or-later /* * common UDP/RAW code * Linux INET implementation * * Authors: * Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> */ #include <linux/types.h> #include <linux/module.h> #include <linux/in.h> #include <net/ip.h> #include <net/sock.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/sock_reuseport.h> int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; struct flowi4 *fl4; struct rtable *rt; __be32 saddr; int oif; int err; if (addr_len < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; sk_dst_reset(sk); oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { if (!oif || netif_index_is_l3_master(sock_net(sk), oif)) oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); } else if (!oif) { oif = READ_ONCE(inet->uc_index); } fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, oif, sk->sk_protocol, inet->inet_sport, usin->sin_port, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); goto out; } if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { ip_rt_put(rt); err = -EACCES; goto out; } /* Update addresses before rehashing */ inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; if (!inet->inet_saddr) inet->inet_saddr = fl4->saddr; if (!inet->inet_rcv_saddr) { inet->inet_rcv_saddr = fl4->saddr; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); atomic_set(&inet->inet_id, get_random_u16()); sk_dst_set(sk, &rt->dst); err = 0; out: return err; } EXPORT_SYMBOL(__ip4_datagram_connect); int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { int res; lock_sock(sk); res = __ip4_datagram_connect(sk, uaddr, addr_len); release_sock(sk); return res; } EXPORT_SYMBOL(ip4_datagram_connect); /* Because UDP xmit path can manipulate sk_dst_cache without holding * socket lock, we need to use sk_dst_set() here, * even if we own the socket lock. */ void ip4_datagram_release_cb(struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); struct dst_entry *dst; struct flowi4 fl4; struct rtable *rt; rcu_read_lock(); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) { rcu_read_unlock(); return; } inet_sk_init_flowi4(inet, &fl4); rt = ip_route_output_flow(sock_net(sk), &fl4, sk); dst = !IS_ERR(rt) ? &rt->dst : NULL; sk_dst_set(sk, dst); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
876 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM fib6 #if !defined(_TRACE_FIB6_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FIB6_H #include <linux/in6.h> #include <net/flow.h> #include <net/ip6_fib.h> #include <linux/tracepoint.h> TRACE_EVENT(fib6_table_lookup, TP_PROTO(const struct net *net, const struct fib6_result *res, struct fib6_table *table, const struct flowi6 *flp), TP_ARGS(net, res, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) __field( int, err ) __field( int, oif ) __field( int, iif ) __field( u32, flowlabel ) __field( __u8, tos ) __field( __u8, scope ) __field( __u8, flags ) __array( __u8, src, 16 ) __array( __u8, dst, 16 ) __field( u16, sport ) __field( u16, dport ) __field( u8, proto ) __field( u8, rt_type ) __array( char, name, IFNAMSIZ ) __array( __u8, gw, 16 ) ), TP_fast_assign( struct in6_addr *in6; __entry->tb_id = table->tb6_id; __entry->err = ip6_rt_type_to_error(res->fib6_type); __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->flowlabel = ntohl(flowi6_get_flowlabel(flp)); __entry->tos = ip6_tclass(flp->flowlabel); __entry->scope = flp->flowi6_scope; __entry->flags = flp->flowi6_flags; in6 = (struct in6_addr *)__entry->src; *in6 = flp->saddr; in6 = (struct in6_addr *)__entry->dst; *in6 = flp->daddr; __entry->proto = flp->flowi6_proto; if (__entry->proto == IPPROTO_TCP || __entry->proto == IPPROTO_UDP) { __entry->sport = ntohs(flp->fl6_sport); __entry->dport = ntohs(flp->fl6_dport); } else { __entry->sport = 0; __entry->dport = 0; } if (res->nh && res->nh->fib_nh_dev) { strscpy(__entry->name, res->nh->fib_nh_dev->name, IFNAMSIZ); } else { strcpy(__entry->name, "-"); } if (res->f6i == net->ipv6.fib6_null_entry) { in6 = (struct in6_addr *)__entry->gw; *in6 = in6addr_any; } else if (res->nh) { in6 = (struct in6_addr *)__entry->gw; *in6 = res->nh->fib_nh_gw6; } ), TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u flowlabel %#x tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, __entry->src, __entry->sport, __entry->dst, __entry->dport, __entry->flowlabel, __entry->tos, __entry->scope, __entry->flags, __entry->name, __entry->gw, __entry->err) ); #endif /* _TRACE_FIB6_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
209 1155 201 101 43 3 2 1 70 60 147 148 2 2 73 32 108 45 1 1 4 128 95 5 148 122 214 61 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #ifndef _BR_PRIVATE_H #define _BR_PRIVATE_H #include <linux/netdevice.h> #include <linux/if_bridge.h> #include <linux/netpoll.h> #include <linux/u64_stats_sync.h> #include <net/route.h> #include <net/ip6_fib.h> #include <net/pkt_cls.h> #include <linux/if_vlan.h> #include <linux/rhashtable.h> #include <linux/refcount.h> #define BR_HASH_BITS 8 #define BR_HASH_SIZE (1 << BR_HASH_BITS) #define BR_HOLD_TIME (1*HZ) #define BR_PORT_BITS 10 #define BR_MAX_PORTS (1<<BR_PORT_BITS) #define BR_MULTICAST_DEFAULT_HASH_MAX 4096 #define BR_MULTICAST_QUERY_INTVL_MIN msecs_to_jiffies(1000) #define BR_MULTICAST_STARTUP_QUERY_INTVL_MIN BR_MULTICAST_QUERY_INTVL_MIN #define BR_HWDOM_MAX BITS_PER_LONG #define BR_VERSION "2.3" /* Control of forwarding link local multicast */ #define BR_GROUPFWD_DEFAULT 0 /* Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP */ enum { BR_GROUPFWD_STP = BIT(0), BR_GROUPFWD_MACPAUSE = BIT(1), BR_GROUPFWD_LACP = BIT(2), }; #define BR_GROUPFWD_RESTRICTED (BR_GROUPFWD_STP | BR_GROUPFWD_MACPAUSE | \ BR_GROUPFWD_LACP) /* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */ #define BR_GROUPFWD_8021AD 0xB801u /* Path to usermode spanning tree program */ #define BR_STP_PROG "/sbin/bridge-stp" #define BR_FDB_NOTIFY_SETTABLE_BITS (FDB_NOTIFY_BIT | FDB_NOTIFY_INACTIVE_BIT) typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; typedef __u16 port_id; struct bridge_id { unsigned char prio[2]; unsigned char addr[ETH_ALEN]; }; struct mac_addr { unsigned char addr[ETH_ALEN]; }; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING /* our own querier */ struct bridge_mcast_own_query { struct timer_list timer; u32 startup_sent; }; /* other querier */ struct bridge_mcast_other_query { struct timer_list timer; struct timer_list delay_timer; }; /* selected querier */ struct bridge_mcast_querier { struct br_ip addr; int port_ifidx; seqcount_spinlock_t seq; }; /* IGMP/MLD statistics */ struct bridge_mcast_stats { struct br_mcast_stats mstats; struct u64_stats_sync syncp; }; struct br_mdb_src_entry { struct br_ip addr; }; struct br_mdb_config { struct net_bridge *br; struct net_bridge_port *p; struct br_mdb_entry *entry; struct br_ip group; bool src_entry; u8 filter_mode; u16 nlflags; struct br_mdb_src_entry *src_entries; int num_src_entries; u8 rt_protocol; }; #endif /* net_bridge_mcast_port must be always defined due to forwarding stubs */ struct net_bridge_mcast_port { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct net_bridge_port *port; struct net_bridge_vlan *vlan; struct bridge_mcast_own_query ip4_own_query; struct timer_list ip4_mc_router_timer; struct hlist_node ip4_rlist; #if IS_ENABLED(CONFIG_IPV6) struct bridge_mcast_own_query ip6_own_query; struct timer_list ip6_mc_router_timer; struct hlist_node ip6_rlist; #endif /* IS_ENABLED(CONFIG_IPV6) */ unsigned char multicast_router; u32 mdb_n_entries; u32 mdb_max_entries; #endif /* CONFIG_BRIDGE_IGMP_SNOOPING */ }; /* net_bridge_mcast must be always defined due to forwarding stubs */ struct net_bridge_mcast { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct net_bridge *br; struct net_bridge_vlan *vlan; u32 multicast_last_member_count; u32 multicast_startup_query_count; u8 multicast_querier; u8 multicast_igmp_version; u8 multicast_router; #if IS_ENABLED(CONFIG_IPV6) u8 multicast_mld_version; #endif unsigned long multicast_last_member_interval; unsigned long multicast_membership_interval; unsigned long multicast_querier_interval; unsigned long multicast_query_interval; unsigned long multicast_query_response_interval; unsigned long multicast_startup_query_interval; struct hlist_head ip4_mc_router_list; struct timer_list ip4_mc_router_timer; struct bridge_mcast_other_query ip4_other_query; struct bridge_mcast_own_query ip4_own_query; struct bridge_mcast_querier ip4_querier; #if IS_ENABLED(CONFIG_IPV6) struct hlist_head ip6_mc_router_list; struct timer_list ip6_mc_router_timer; struct bridge_mcast_other_query ip6_other_query; struct bridge_mcast_own_query ip6_own_query; struct bridge_mcast_querier ip6_querier; #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* CONFIG_BRIDGE_IGMP_SNOOPING */ }; struct br_tunnel_info { __be64 tunnel_id; struct metadata_dst __rcu *tunnel_dst; }; /* private vlan flags */ enum { BR_VLFLAG_PER_PORT_STATS = BIT(0), BR_VLFLAG_ADDED_BY_SWITCHDEV = BIT(1), BR_VLFLAG_MCAST_ENABLED = BIT(2), BR_VLFLAG_GLOBAL_MCAST_ENABLED = BIT(3), BR_VLFLAG_NEIGH_SUPPRESS_ENABLED = BIT(4), }; /** * struct net_bridge_vlan - per-vlan entry * * @vnode: rhashtable member * @tnode: rhashtable member * @vid: VLAN id * @flags: bridge vlan flags * @priv_flags: private (in-kernel) bridge vlan flags * @state: STP state (e.g. blocking, learning, forwarding) * @stats: per-cpu VLAN statistics * @br: if MASTER flag set, this points to a bridge struct * @port: if MASTER flag unset, this points to a port struct * @refcnt: if MASTER flag set, this is bumped for each port referencing it * @brvlan: if MASTER flag unset, this points to the global per-VLAN context * for this VLAN entry * @tinfo: bridge tunnel info * @br_mcast_ctx: if MASTER flag set, this is the global vlan multicast context * @port_mcast_ctx: if MASTER flag unset, this is the per-port/vlan multicast * context * @msti: if MASTER flag set, this holds the VLANs MST instance * @vlist: sorted list of VLAN entries * @rcu: used for entry destruction * * This structure is shared between the global per-VLAN entries contained in * the bridge rhashtable and the local per-port per-VLAN entries contained in * the port's rhashtable. The union entries should be interpreted depending on * the entry flags that are set. */ struct net_bridge_vlan { struct rhash_head vnode; struct rhash_head tnode; u16 vid; u16 flags; u16 priv_flags; u8 state; struct pcpu_sw_netstats __percpu *stats; union { struct net_bridge *br; struct net_bridge_port *port; }; union { refcount_t refcnt; struct net_bridge_vlan *brvlan; }; struct br_tunnel_info tinfo; union { struct net_bridge_mcast br_mcast_ctx; struct net_bridge_mcast_port port_mcast_ctx; }; u16 msti; struct list_head vlist; struct rcu_head rcu; }; /** * struct net_bridge_vlan_group * * @vlan_hash: VLAN entry rhashtable * @vlan_list: sorted VLAN entry list * @num_vlans: number of total VLAN entries * @pvid: PVID VLAN id * @pvid_state: PVID's STP state (e.g. forwarding, learning, blocking) * * IMPORTANT: Be careful when checking if there're VLAN entries using list * primitives because the bridge can have entries in its list which * are just for global context but not for filtering, i.e. they have * the master flag set but not the brentry flag. If you have to check * if there're "real" entries in the bridge please test @num_vlans */ struct net_bridge_vlan_group { struct rhashtable vlan_hash; struct rhashtable tunnel_hash; struct list_head vlan_list; u16 num_vlans; u16 pvid; u8 pvid_state; }; /* bridge fdb flags */ enum { BR_FDB_LOCAL, BR_FDB_STATIC, BR_FDB_STICKY, BR_FDB_ADDED_BY_USER, BR_FDB_ADDED_BY_EXT_LEARN, BR_FDB_OFFLOADED, BR_FDB_NOTIFY, BR_FDB_NOTIFY_INACTIVE, BR_FDB_LOCKED, BR_FDB_DYNAMIC_LEARNED, }; struct net_bridge_fdb_key { mac_addr addr; u16 vlan_id; }; struct net_bridge_fdb_entry { struct rhash_head rhnode; struct net_bridge_port *dst; struct net_bridge_fdb_key key; struct hlist_node fdb_node; unsigned long flags; /* write-heavy members should not affect lookups */ unsigned long updated ____cacheline_aligned_in_smp; unsigned long used; struct rcu_head rcu; }; struct net_bridge_fdb_flush_desc { unsigned long flags; unsigned long flags_mask; int port_ifindex; u16 vlan_id; }; #define MDB_PG_FLAGS_PERMANENT BIT(0) #define MDB_PG_FLAGS_OFFLOAD BIT(1) #define MDB_PG_FLAGS_FAST_LEAVE BIT(2) #define MDB_PG_FLAGS_STAR_EXCL BIT(3) #define MDB_PG_FLAGS_BLOCKED BIT(4) #define PG_SRC_ENT_LIMIT 32 #define BR_SGRP_F_DELETE BIT(0) #define BR_SGRP_F_SEND BIT(1) #define BR_SGRP_F_INSTALLED BIT(2) #define BR_SGRP_F_USER_ADDED BIT(3) struct net_bridge_mcast_gc { struct hlist_node gc_node; void (*destroy)(struct net_bridge_mcast_gc *gc); }; struct net_bridge_group_src { struct hlist_node node; struct br_ip addr; struct net_bridge_port_group *pg; u8 flags; u8 src_query_rexmit_cnt; struct timer_list timer; struct net_bridge *br; struct net_bridge_mcast_gc mcast_gc; struct rcu_head rcu; }; struct net_bridge_port_group_sg_key { struct net_bridge_port *port; struct br_ip addr; }; struct net_bridge_port_group { struct net_bridge_port_group __rcu *next; struct net_bridge_port_group_sg_key key; unsigned char eth_addr[ETH_ALEN] __aligned(2); unsigned char flags; unsigned char filter_mode; unsigned char grp_query_rexmit_cnt; unsigned char rt_protocol; struct hlist_head src_list; unsigned int src_ents; struct timer_list timer; struct timer_list rexmit_timer; struct hlist_node mglist; struct rb_root eht_set_tree; struct rb_root eht_host_tree; struct rhash_head rhnode; struct net_bridge_mcast_gc mcast_gc; struct rcu_head rcu; }; struct net_bridge_mdb_entry { struct rhash_head rhnode; struct net_bridge *br; struct net_bridge_port_group __rcu *ports; struct br_ip addr; bool host_joined; struct timer_list timer; struct hlist_node mdb_node; struct net_bridge_mcast_gc mcast_gc; struct rcu_head rcu; }; struct net_bridge_port { struct net_bridge *br; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head list; unsigned long flags; #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; #endif struct net_bridge_port __rcu *backup_port; u32 backup_nhid; /* STP */ u8 priority; u8 state; u16 port_no; unsigned char topology_change_ack; unsigned char config_pending; port_id port_id; port_id designated_port; bridge_id designated_root; bridge_id designated_bridge; u32 path_cost; u32 designated_cost; unsigned long designated_age; struct timer_list forward_delay_timer; struct timer_list hold_timer; struct timer_list message_age_timer; struct kobject kobj; struct rcu_head rcu; struct net_bridge_mcast_port multicast_ctx; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct bridge_mcast_stats __percpu *mcast_stats; u32 multicast_eht_hosts_limit; u32 multicast_eht_hosts_cnt; struct hlist_head mglist; #endif #ifdef CONFIG_SYSFS char sysfs_name[IFNAMSIZ]; #endif #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif #ifdef CONFIG_NET_SWITCHDEV /* Identifier used to group ports that share the same switchdev * hardware domain. */ int hwdom; int offload_count; struct netdev_phys_item_id ppid; #endif u16 group_fwd_mask; u16 backup_redirected_cnt; struct bridge_stp_xstats stp_xstats; }; #define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj) #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) #define br_promisc_port(p) ((p)->flags & BR_PROMISC) static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *dev) { return netif_is_bridge_port(dev) ? rtnl_dereference(dev->rx_handler_data) : NULL; } static inline struct net_bridge_port *br_port_get_rtnl_rcu(const struct net_device *dev) { return netif_is_bridge_port(dev) ? rcu_dereference_rtnl(dev->rx_handler_data) : NULL; } enum net_bridge_opts { BROPT_VLAN_ENABLED, BROPT_VLAN_STATS_ENABLED, BROPT_NF_CALL_IPTABLES, BROPT_NF_CALL_IP6TABLES, BROPT_NF_CALL_ARPTABLES, BROPT_GROUP_ADDR_SET, BROPT_MULTICAST_ENABLED, BROPT_MULTICAST_QUERY_USE_IFADDR, BROPT_MULTICAST_STATS_ENABLED, BROPT_HAS_IPV6_ADDR, BROPT_NEIGH_SUPPRESS_ENABLED, BROPT_MTU_SET_BY_USER, BROPT_VLAN_STATS_PER_PORT, BROPT_NO_LL_LEARN, BROPT_VLAN_BRIDGE_BINDING, BROPT_MCAST_VLAN_SNOOPING_ENABLED, BROPT_MST_ENABLED, }; struct net_bridge { spinlock_t lock; spinlock_t hash_lock; struct hlist_head frame_type_list; struct net_device *dev; unsigned long options; /* These fields are accessed on each packet */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING __be16 vlan_proto; u16 default_pvid; struct net_bridge_vlan_group __rcu *vlgrp; #endif struct rhashtable fdb_hash_tbl; struct list_head port_list; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) union { struct rtable fake_rtable; struct rt6_info fake_rt6_info; }; #endif u16 group_fwd_mask; u16 group_fwd_mask_required; /* STP */ bridge_id designated_root; bridge_id bridge_id; unsigned char topology_change; unsigned char topology_change_detected; u16 root_port; unsigned long max_age; unsigned long hello_time; unsigned long forward_delay; unsigned long ageing_time; unsigned long bridge_max_age; unsigned long bridge_hello_time; unsigned long bridge_forward_delay; unsigned long bridge_ageing_time; u32 root_path_cost; u8 group_addr[ETH_ALEN]; enum { BR_NO_STP, /* no spanning tree */ BR_KERNEL_STP, /* old STP in kernel */ BR_USER_STP, /* new RSTP in userspace */ } stp_enabled; struct net_bridge_mcast multicast_ctx; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct bridge_mcast_stats __percpu *mcast_stats; u32 hash_max; spinlock_t multicast_lock; struct rhashtable mdb_hash_tbl; struct rhashtable sg_port_tbl; struct hlist_head mcast_gc_list; struct hlist_head mdb_list; struct work_struct mcast_gc_work; #endif struct timer_list hello_timer; struct timer_list tcn_timer; struct timer_list topology_change_timer; struct delayed_work gc_work; struct kobject *ifobj; u32 auto_cnt; atomic_t fdb_n_learned; u32 fdb_max_learned; #ifdef CONFIG_NET_SWITCHDEV /* Counter used to make sure that hardware domains get unique * identifiers in case a bridge spans multiple switchdev instances. */ int last_hwdom; /* Bit mask of hardware domain numbers in use */ unsigned long busy_hwdoms; #endif struct hlist_head fdb_list; #if IS_ENABLED(CONFIG_BRIDGE_MRP) struct hlist_head mrp_list; #endif #if IS_ENABLED(CONFIG_BRIDGE_CFM) struct hlist_head mep_list; #endif }; struct br_input_skb_cb { struct net_device *brdev; u16 frag_max_size; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING u8 igmp; u8 mrouters_only:1; #endif u8 proxyarp_replied:1; u8 src_port_isolated:1; u8 promisc:1; #ifdef CONFIG_BRIDGE_VLAN_FILTERING u8 vlan_filtered:1; #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE u8 br_netfilter_broute:1; #endif #ifdef CONFIG_NET_SWITCHDEV /* Set if TX data plane offloading is used towards at least one * hardware domain. */ u8 tx_fwd_offload:1; /* The switchdev hardware domain from which this packet was received. * If skb->offload_fwd_mark was set, then this packet was already * forwarded by hardware to the other ports in the source hardware * domain, otherwise it wasn't. */ int src_hwdom; /* Bit mask of hardware domains towards this packet has already been * transmitted using the TX data plane offload. */ unsigned long fwd_hwdoms; #endif u32 backup_nhid; }; #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) #ifdef CONFIG_BRIDGE_IGMP_SNOOPING # define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb) (BR_INPUT_SKB_CB(__skb)->mrouters_only) #else # define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb) (0) #endif #define br_printk(level, br, format, args...) \ printk(level "%s: " format, (br)->dev->name, ##args) #define br_err(__br, format, args...) \ br_printk(KERN_ERR, __br, format, ##args) #define br_warn(__br, format, args...) \ br_printk(KERN_WARNING, __br, format, ##args) #define br_notice(__br, format, args...) \ br_printk(KERN_NOTICE, __br, format, ##args) #define br_info(__br, format, args...) \ br_printk(KERN_INFO, __br, format, ##args) #define br_debug(br, format, args...) \ pr_debug("%s: " format, (br)->dev->name, ##args) /* called under bridge lock */ static inline int br_is_root_bridge(const struct net_bridge *br) { return !memcmp(&br->bridge_id, &br->designated_root, 8); } /* check if a VLAN entry is global */ static inline bool br_vlan_is_master(const struct net_bridge_vlan *v) { return v->flags & BRIDGE_VLAN_INFO_MASTER; } /* check if a VLAN entry is used by the bridge */ static inline bool br_vlan_is_brentry(const struct net_bridge_vlan *v) { return v->flags & BRIDGE_VLAN_INFO_BRENTRY; } /* check if we should use the vlan entry, returns false if it's only context */ static inline bool br_vlan_should_use(const struct net_bridge_vlan *v) { if (br_vlan_is_master(v)) { if (br_vlan_is_brentry(v)) return true; else return false; } return true; } static inline bool nbp_state_should_learn(const struct net_bridge_port *p) { return p->state == BR_STATE_LEARNING || p->state == BR_STATE_FORWARDING; } static inline bool br_vlan_valid_id(u16 vid, struct netlink_ext_ack *extack) { bool ret = vid > 0 && vid < VLAN_VID_MASK; if (!ret) NL_SET_ERR_MSG_MOD(extack, "Vlan id is invalid"); return ret; } static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur, const struct bridge_vlan_info *last, struct netlink_ext_ack *extack) { /* pvid flag is not allowed in ranges */ if (cur->flags & BRIDGE_VLAN_INFO_PVID) { NL_SET_ERR_MSG_MOD(extack, "Pvid isn't allowed in a range"); return false; } /* when cur is the range end, check if: * - it has range start flag * - range ids are invalid (end is equal to or before start) */ if (last) { if (cur->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { NL_SET_ERR_MSG_MOD(extack, "Found a new vlan range start while processing one"); return false; } else if (!(cur->flags & BRIDGE_VLAN_INFO_RANGE_END)) { NL_SET_ERR_MSG_MOD(extack, "Vlan range end flag is missing"); return false; } else if (cur->vid <= last->vid) { NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id"); return false; } } /* check for required range flags */ if (!(cur->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | BRIDGE_VLAN_INFO_RANGE_END))) { NL_SET_ERR_MSG_MOD(extack, "Both vlan range flags are missing"); return false; } return true; } static inline u8 br_vlan_multicast_router(const struct net_bridge_vlan *v) { u8 mcast_router = MDB_RTR_TYPE_DISABLED; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (!br_vlan_is_master(v)) mcast_router = v->port_mcast_ctx.multicast_router; else mcast_router = v->br_mcast_ctx.multicast_router; #endif return mcast_router; } static inline int br_afspec_cmd_to_rtm(int cmd) { switch (cmd) { case RTM_SETLINK: return RTM_NEWVLAN; case RTM_DELLINK: return RTM_DELVLAN; } return 0; } static inline int br_opt_get(const struct net_bridge *br, enum net_bridge_opts opt) { return test_bit(opt, &br->options); } int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, struct netlink_ext_ack *extack); int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt); int br_boolopt_multi_toggle(struct net_bridge *br, struct br_boolopt_multi *bm, struct netlink_ext_ack *extack); void br_boolopt_multi_get(const struct net_bridge *br, struct br_boolopt_multi *bm); void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on); #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss) { struct tc_skb_ext *ext; if (!tc_skb_ext_tc_enabled()) return; ext = skb_ext_find(skb, TC_SKB_EXT); if (ext) { ext->l2_miss = miss; return; } if (!miss) return; ext = tc_skb_ext_alloc(skb); if (!ext) return; ext->l2_miss = true; } #else static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss) { } #endif /* br_device.c */ void br_dev_setup(struct net_device *dev); void br_dev_delete(struct net_device *dev, struct list_head *list); netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev); #ifdef CONFIG_NET_POLL_CONTROLLER static inline void br_netpoll_send_skb(const struct net_bridge_port *p, struct sk_buff *skb) { netpoll_send_skb(p->np, skb); } int br_netpoll_enable(struct net_bridge_port *p); void br_netpoll_disable(struct net_bridge_port *p); #else static inline void br_netpoll_send_skb(const struct net_bridge_port *p, struct sk_buff *skb) { } static inline int br_netpoll_enable(struct net_bridge_port *p) { return 0; } static inline void br_netpoll_disable(struct net_bridge_port *p) { } #endif /* br_fdb.c */ #define FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF) #define FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP) #define FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_USE | NTF_EXT_LEARNED | \ NTF_STICKY | NTF_OFFLOADED) int br_fdb_init(void); void br_fdb_fini(void); int br_fdb_hash_init(struct net_bridge *br); void br_fdb_hash_fini(struct net_bridge *br); void br_fdb_flush(struct net_bridge *br, const struct net_bridge_fdb_flush_desc *desc); void br_fdb_find_delete_local(struct net_bridge *br, const struct net_bridge_port *p, const unsigned char *addr, u16 vid); void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); void br_fdb_cleanup(struct work_struct *work); void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, u16 vid, int do_all); struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, const unsigned char *addr, __u16 vid); int br_fdb_test_addr(struct net_device *dev, unsigned char *addr); int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, unsigned long off); int br_fdb_add_local(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags); int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack); int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack); int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, bool *notified, struct netlink_ext_ack *extack); int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *fdev, int *idx); int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack); int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool locked, bool swdev_notify); int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool swdev_notify); void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool offloaded); /* br_forward.c */ enum br_pkt_type { BR_PKT_UNICAST, BR_PKT_MULTICAST, BR_PKT_BROADCAST }; int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb); void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, bool local_rcv, bool local_orig); int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb); void br_flood(struct net_bridge *br, struct sk_buff *skb, enum br_pkt_type pkt_type, bool local_rcv, bool local_orig, u16 vid); /* return true if both source port and dest port are isolated */ static inline bool br_skb_isolated(const struct net_bridge_port *to, const struct sk_buff *skb) { return BR_INPUT_SKB_CB(skb)->src_port_isolated && (to->flags & BR_ISOLATED); } /* br_if.c */ void br_port_carrier_check(struct net_bridge_port *p, bool *notified); int br_add_bridge(struct net *net, const char *name); int br_del_bridge(struct net *net, const char *name); int br_add_if(struct net_bridge *br, struct net_device *dev, struct netlink_ext_ack *extack); int br_del_if(struct net_bridge *br, struct net_device *dev); void br_mtu_auto_adjust(struct net_bridge *br); netdev_features_t br_features_recompute(struct net_bridge *br, netdev_features_t features); void br_port_flags_change(struct net_bridge_port *port, unsigned long mask); void br_manage_promisc(struct net_bridge *br); int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev); /* br_input.c */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); rx_handler_func_t *br_get_rx_handler(const struct net_device *dev); struct br_frame_type { __be16 type; int (*frame_handler)(struct net_bridge_port *port, struct sk_buff *skb); struct hlist_node list; }; void br_add_frame(struct net_bridge *br, struct br_frame_type *ft); void br_del_frame(struct net_bridge *br, struct br_frame_type *ft); static inline bool br_rx_handler_check_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler) == br_get_rx_handler(dev); } static inline bool br_rx_handler_check_rtnl(const struct net_device *dev) { return rcu_dereference_rtnl(dev->rx_handler) == br_get_rx_handler(dev); } static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev) { return br_rx_handler_check_rcu(dev) ? br_port_get_rcu(dev) : NULL; } static inline struct net_bridge_port * br_port_get_check_rtnl(const struct net_device *dev) { return br_rx_handler_check_rtnl(dev) ? br_port_get_rtnl_rcu(dev) : NULL; } /* br_ioctl.c */ int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user *data, int cmd); int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, struct ifreq *ifr, void __user *uarg); /* br_multicast.c */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid); struct net_bridge_mdb_entry * br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid); int br_multicast_add_port(struct net_bridge_port *port); void br_multicast_del_port(struct net_bridge_port *port); void br_multicast_enable_port(struct net_bridge_port *port); void br_multicast_disable_port(struct net_bridge_port *port); void br_multicast_init(struct net_bridge *br); void br_multicast_join_snoopers(struct net_bridge *br); void br_multicast_leave_snoopers(struct net_bridge *br); void br_multicast_open(struct net_bridge *br); void br_multicast_stop(struct net_bridge *br); void br_multicast_dev_del(struct net_bridge *br); void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig); int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val); int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, unsigned long val); int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router); int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val); int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, unsigned long val); #if IS_ENABLED(CONFIG_IPV6) int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx, unsigned long val); #endif struct net_bridge_mdb_entry * br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst); struct net_bridge_mdb_entry * br_multicast_new_group(struct net_bridge *br, struct br_ip *group); struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, const struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, u8 filter_mode, u8 rt_protocol, struct netlink_ext_ack *extack); void br_multicast_del_port_group(struct net_bridge_port_group *p); int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type); void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type); void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_port_group __rcu **pp); void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir); int br_multicast_init_stats(struct net_bridge *br); void br_multicast_uninit_stats(struct net_bridge *br); void br_multicast_get_stats(const struct net_bridge *br, const struct net_bridge_port *p, struct br_mcast_stats *dest); u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx); void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max); u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx); int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack); int br_mdb_del(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int br_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb); int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack); void br_multicast_host_join(const struct net_bridge_mcast *brmctx, struct net_bridge_mdb_entry *mp, bool notify); void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify); void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, u8 filter_mode); void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, struct net_bridge_port_group *sg); struct net_bridge_group_src * br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip); struct net_bridge_group_src * br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip); void __br_multicast_del_group_src(struct net_bridge_group_src *src); void br_multicast_del_group_src(struct net_bridge_group_src *src, bool fastleave); void br_multicast_ctx_init(struct net_bridge *br, struct net_bridge_vlan *vlan, struct net_bridge_mcast *brmctx); void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx); void br_multicast_port_ctx_init(struct net_bridge_port *port, struct net_bridge_vlan *vlan, struct net_bridge_mcast_port *pmctx); void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx); void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on); int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, struct netlink_ext_ack *extack); bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on); int br_rports_fill_info(struct sk_buff *skb, const struct net_bridge_mcast *brmctx); int br_multicast_dump_querier_state(struct sk_buff *skb, const struct net_bridge_mcast *brmctx, int nest_attr); size_t br_multicast_querier_state_size(void); size_t br_rports_size(const struct net_bridge_mcast *brmctx); void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx, unsigned long val); void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx, unsigned long val); static inline bool br_group_is_l2(const struct br_ip *group) { return group->proto == 0; } #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) static inline struct hlist_node * br_multicast_get_first_rport_node(struct net_bridge_mcast *brmctx, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) if (skb->protocol == htons(ETH_P_IPV6)) return rcu_dereference(hlist_first_rcu(&brmctx->ip6_mc_router_list)); #endif return rcu_dereference(hlist_first_rcu(&brmctx->ip4_mc_router_list)); } static inline struct net_bridge_port * br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb) { struct net_bridge_mcast_port *mctx; #if IS_ENABLED(CONFIG_IPV6) if (skb->protocol == htons(ETH_P_IPV6)) mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port, ip6_rlist); else #endif mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port, ip4_rlist); if (mctx) return mctx->port; else return NULL; } static inline bool br_ip4_multicast_is_router(struct net_bridge_mcast *brmctx) { return timer_pending(&brmctx->ip4_mc_router_timer); } static inline bool br_ip6_multicast_is_router(struct net_bridge_mcast *brmctx) { #if IS_ENABLED(CONFIG_IPV6) return timer_pending(&brmctx->ip6_mc_router_timer); #else return false; #endif } static inline bool br_multicast_is_router(struct net_bridge_mcast *brmctx, struct sk_buff *skb) { switch (brmctx->multicast_router) { case MDB_RTR_TYPE_PERM: return true; case MDB_RTR_TYPE_TEMP_QUERY: if (skb) { if (skb->protocol == htons(ETH_P_IP)) return br_ip4_multicast_is_router(brmctx); else if (skb->protocol == htons(ETH_P_IPV6)) return br_ip6_multicast_is_router(brmctx); } else { return br_ip4_multicast_is_router(brmctx) || br_ip6_multicast_is_router(brmctx); } fallthrough; default: return false; } } static inline bool __br_multicast_querier_exists(struct net_bridge_mcast *brmctx, struct bridge_mcast_other_query *querier, const bool is_ipv6) { bool own_querier_enabled; if (brmctx->multicast_querier) { if (is_ipv6 && !br_opt_get(brmctx->br, BROPT_HAS_IPV6_ADDR)) own_querier_enabled = false; else own_querier_enabled = true; } else { own_querier_enabled = false; } return !timer_pending(&querier->delay_timer) && (own_querier_enabled || timer_pending(&querier->timer)); } static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx, struct ethhdr *eth, const struct net_bridge_mdb_entry *mdb) { switch (eth->h_proto) { case (htons(ETH_P_IP)): return __br_multicast_querier_exists(brmctx, &brmctx->ip4_other_query, false); #if IS_ENABLED(CONFIG_IPV6) case (htons(ETH_P_IPV6)): return __br_multicast_querier_exists(brmctx, &brmctx->ip6_other_query, true); #endif default: return !!mdb && br_group_is_l2(&mdb->addr); } } static inline bool br_multicast_is_star_g(const struct br_ip *ip) { switch (ip->proto) { case htons(ETH_P_IP): return ipv4_is_zeronet(ip->src.ip4); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): return ipv6_addr_any(&ip->src.ip6); #endif default: return false; } } static inline bool br_multicast_should_handle_mode(const struct net_bridge_mcast *brmctx, __be16 proto) { switch (proto) { case htons(ETH_P_IP): return !!(brmctx->multicast_igmp_version == 3); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): return !!(brmctx->multicast_mld_version == 2); #endif default: return false; } } static inline int br_multicast_igmp_type(const struct sk_buff *skb) { return BR_INPUT_SKB_CB(skb)->igmp; } static inline unsigned long br_multicast_lmqt(const struct net_bridge_mcast *brmctx) { return brmctx->multicast_last_member_interval * brmctx->multicast_last_member_count; } static inline unsigned long br_multicast_gmi(const struct net_bridge_mcast *brmctx) { return brmctx->multicast_membership_interval; } static inline bool br_multicast_ctx_is_vlan(const struct net_bridge_mcast *brmctx) { return !!brmctx->vlan; } static inline bool br_multicast_port_ctx_is_vlan(const struct net_bridge_mcast_port *pmctx) { return !!pmctx->vlan; } static inline struct net_bridge_mcast * br_multicast_port_ctx_get_global(const struct net_bridge_mcast_port *pmctx) { if (!br_multicast_port_ctx_is_vlan(pmctx)) return &pmctx->port->br->multicast_ctx; else return &pmctx->vlan->brvlan->br_mcast_ctx; } static inline bool br_multicast_ctx_vlan_global_disabled(const struct net_bridge_mcast *brmctx) { return br_multicast_ctx_is_vlan(brmctx) && (!br_opt_get(brmctx->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) || !(brmctx->vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)); } static inline bool br_multicast_ctx_vlan_disabled(const struct net_bridge_mcast *brmctx) { return br_multicast_ctx_is_vlan(brmctx) && !(brmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED); } static inline bool br_multicast_port_ctx_vlan_disabled(const struct net_bridge_mcast_port *pmctx) { return br_multicast_port_ctx_is_vlan(pmctx) && !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED); } static inline bool br_multicast_port_ctx_state_disabled(const struct net_bridge_mcast_port *pmctx) { return pmctx->port->state == BR_STATE_DISABLED || (br_multicast_port_ctx_is_vlan(pmctx) && (br_multicast_port_ctx_vlan_disabled(pmctx) || pmctx->vlan->state == BR_STATE_DISABLED)); } static inline bool br_multicast_port_ctx_state_stopped(const struct net_bridge_mcast_port *pmctx) { return br_multicast_port_ctx_state_disabled(pmctx) || pmctx->port->state == BR_STATE_BLOCKING || (br_multicast_port_ctx_is_vlan(pmctx) && pmctx->vlan->state == BR_STATE_BLOCKING); } static inline bool br_rports_have_mc_router(const struct net_bridge_mcast *brmctx) { #if IS_ENABLED(CONFIG_IPV6) return !hlist_empty(&brmctx->ip4_mc_router_list) || !hlist_empty(&brmctx->ip6_mc_router_list); #else return !hlist_empty(&brmctx->ip4_mc_router_list); #endif } static inline bool br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, const struct net_bridge_mcast *brmctx2) { return brmctx1->multicast_igmp_version == brmctx2->multicast_igmp_version && brmctx1->multicast_last_member_count == brmctx2->multicast_last_member_count && brmctx1->multicast_startup_query_count == brmctx2->multicast_startup_query_count && brmctx1->multicast_last_member_interval == brmctx2->multicast_last_member_interval && brmctx1->multicast_membership_interval == brmctx2->multicast_membership_interval && brmctx1->multicast_querier_interval == brmctx2->multicast_querier_interval && brmctx1->multicast_query_interval == brmctx2->multicast_query_interval && brmctx1->multicast_query_response_interval == brmctx2->multicast_query_response_interval && brmctx1->multicast_startup_query_interval == brmctx2->multicast_startup_query_interval && brmctx1->multicast_querier == brmctx2->multicast_querier && brmctx1->multicast_router == brmctx2->multicast_router && !br_rports_have_mc_router(brmctx1) && !br_rports_have_mc_router(brmctx2) && #if IS_ENABLED(CONFIG_IPV6) brmctx1->multicast_mld_version == brmctx2->multicast_mld_version && #endif true; } static inline bool br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx) { bool vlan_snooping_enabled; vlan_snooping_enabled = !!br_opt_get(brmctx->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); return !!(vlan_snooping_enabled == br_multicast_ctx_is_vlan(brmctx)); } #else static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid) { return 0; } static inline struct net_bridge_mdb_entry * br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid) { return NULL; } static inline int br_multicast_add_port(struct net_bridge_port *port) { return 0; } static inline void br_multicast_del_port(struct net_bridge_port *port) { } static inline void br_multicast_enable_port(struct net_bridge_port *port) { } static inline void br_multicast_disable_port(struct net_bridge_port *port) { } static inline void br_multicast_init(struct net_bridge *br) { } static inline void br_multicast_join_snoopers(struct net_bridge *br) { } static inline void br_multicast_leave_snoopers(struct net_bridge *br) { } static inline void br_multicast_open(struct net_bridge *br) { } static inline void br_multicast_stop(struct net_bridge *br) { } static inline void br_multicast_dev_del(struct net_bridge *br) { } static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig) { } static inline bool br_multicast_is_router(struct net_bridge_mcast *brmctx, struct sk_buff *skb) { return false; } static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx, struct ethhdr *eth, const struct net_bridge_mdb_entry *mdb) { return false; } static inline int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int br_mdb_del(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int br_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { return 0; } static inline int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int br_mdb_hash_init(struct net_bridge *br) { return 0; } static inline void br_mdb_hash_fini(struct net_bridge *br) { } static inline void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir) { } static inline int br_multicast_init_stats(struct net_bridge *br) { return 0; } static inline void br_multicast_uninit_stats(struct net_bridge *br) { } static inline int br_multicast_igmp_type(const struct sk_buff *skb) { return 0; } static inline void br_multicast_ctx_init(struct net_bridge *br, struct net_bridge_vlan *vlan, struct net_bridge_mcast *brmctx) { } static inline void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx) { } static inline void br_multicast_port_ctx_init(struct net_bridge_port *port, struct net_bridge_vlan *vlan, struct net_bridge_mcast_port *pmctx) { } static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) { } static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { } static inline int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on) { return false; } static inline bool br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, const struct net_bridge_mcast *brmctx2) { return true; } #endif /* br_vlan.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, u8 *state, struct net_bridge_vlan **vlan); bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb); bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); struct sk_buff *br_handle_vlan(struct net_bridge *br, const struct net_bridge_port *port, struct net_bridge_vlan_group *vg, struct sk_buff *skb); int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack); int br_vlan_delete(struct net_bridge *br, u16 vid); void br_vlan_flush(struct net_bridge *br); struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid); void br_recalculate_fwd_mask(struct net_bridge *br); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, struct netlink_ext_ack *extack); int br_vlan_set_proto(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int br_vlan_set_stats(struct net_bridge *br, unsigned long val); int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, struct netlink_ext_ack *extack); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack); int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack); int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask); void br_vlan_get_stats(const struct net_bridge_vlan *v, struct pcpu_sw_netstats *stats); void br_vlan_port_event(struct net_bridge_port *p, unsigned long event); int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr); void br_vlan_vlan_upper_event(struct net_device *br_dev, struct net_device *vlan_dev, unsigned long event); int br_vlan_rtnl_init(void); void br_vlan_rtnl_uninit(void); void br_vlan_notify(const struct net_bridge *br, const struct net_bridge_port *p, u16 vid, u16 vid_range, int cmd); bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end); void br_vlan_fill_forward_path_pvid(struct net_bridge *br, struct net_device_path_ctx *ctx, struct net_device_path *path); int br_vlan_fill_forward_path_mode(struct net_bridge *br, struct net_bridge_port *dst, struct net_device_path *path); static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) { return rtnl_dereference(br->vlgrp); } static inline struct net_bridge_vlan_group *nbp_vlan_group( const struct net_bridge_port *p) { return rtnl_dereference(p->vlgrp); } static inline struct net_bridge_vlan_group *br_vlan_group_rcu( const struct net_bridge *br) { return rcu_dereference(br->vlgrp); } static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu( const struct net_bridge_port *p) { return rcu_dereference(p->vlgrp); } /* Since bridge now depends on 8021Q module, but the time bridge sees the * skb, the vlan tag will always be present if the frame was tagged. */ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) { int err = 0; if (skb_vlan_tag_present(skb)) { *vid = skb_vlan_tag_get_id(skb); } else { *vid = 0; err = -EINVAL; } return err; } static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) { if (!vg) return 0; smp_rmb(); return vg->pvid; } static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) { return v->vid == pvid ? v->flags | BRIDGE_VLAN_INFO_PVID : v->flags; } #else static inline bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, u8 *state, struct net_bridge_vlan **vlan) { *vlan = NULL; return true; } static inline bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb) { return true; } static inline bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) { return true; } static inline struct sk_buff *br_handle_vlan(struct net_bridge *br, const struct net_bridge_port *port, struct net_bridge_vlan_group *vg, struct sk_buff *skb) { return skb; } static inline int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack) { *changed = false; return -EOPNOTSUPP; } static inline int br_vlan_delete(struct net_bridge *br, u16 vid) { return -EOPNOTSUPP; } static inline void br_vlan_flush(struct net_bridge *br) { } static inline void br_recalculate_fwd_mask(struct net_bridge *br) { } static inline int br_vlan_init(struct net_bridge *br) { return 0; } static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack) { *changed = false; return -EOPNOTSUPP; } static inline int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) { return -EOPNOTSUPP; } static inline void nbp_vlan_flush(struct net_bridge_port *port) { } static inline struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) { return NULL; } static inline int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack) { return 0; } static inline u16 br_vlan_get_tag(const struct sk_buff *skb, u16 *tag) { return 0; } static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) { return 0; } static inline int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask) { return 0; } static inline void br_vlan_fill_forward_path_pvid(struct net_bridge *br, struct net_device_path_ctx *ctx, struct net_device_path *path) { } static inline int br_vlan_fill_forward_path_mode(struct net_bridge *br, struct net_bridge_port *dst, struct net_device_path *path) { return 0; } static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) { return NULL; } static inline struct net_bridge_vlan_group *nbp_vlan_group( const struct net_bridge_port *p) { return NULL; } static inline struct net_bridge_vlan_group *br_vlan_group_rcu( const struct net_bridge *br) { return NULL; } static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu( const struct net_bridge_port *p) { return NULL; } static inline void br_vlan_get_stats(const struct net_bridge_vlan *v, struct pcpu_sw_netstats *stats) { } static inline void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) { } static inline int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) { return 0; } static inline void br_vlan_vlan_upper_event(struct net_device *br_dev, struct net_device *vlan_dev, unsigned long event) { } static inline int br_vlan_rtnl_init(void) { return 0; } static inline void br_vlan_rtnl_uninit(void) { } static inline void br_vlan_notify(const struct net_bridge *br, const struct net_bridge_port *p, u16 vid, u16 vid_range, int cmd) { } static inline bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) { return true; } static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) { return 0; } #endif /* br_vlan_options.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end); bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v, const struct net_bridge_port *p); size_t br_vlan_opts_nl_size(void); int br_vlan_process_options(const struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan *range_start, struct net_bridge_vlan *range_end, struct nlattr **tb, struct netlink_ext_ack *extack); int br_vlan_rtm_process_global_options(struct net_device *dev, const struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *r_end); bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts); /* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) { return READ_ONCE(v->state); } static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state) { WRITE_ONCE(v->state, state); } static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg) { return READ_ONCE(vg->pvid_state); } static inline void br_vlan_set_pvid_state(struct net_bridge_vlan_group *vg, u8 state) { WRITE_ONCE(vg->pvid_state, state); } /* learn_allow is true at ingress and false at egress */ static inline bool br_vlan_state_allowed(u8 state, bool learn_allow) { switch (state) { case BR_STATE_LEARNING: return learn_allow; case BR_STATE_FORWARDING: return true; default: return false; } } #endif /* br_mst.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING DECLARE_STATIC_KEY_FALSE(br_mst_used); static inline bool br_mst_is_enabled(struct net_bridge *br) { return static_branch_unlikely(&br_mst_used) && br_opt_get(br, BROPT_MST_ENABLED); } int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, struct netlink_ext_ack *extack); int br_mst_vlan_set_msti(struct net_bridge_vlan *v, u16 msti); void br_mst_vlan_init_state(struct net_bridge_vlan *v); int br_mst_set_enabled(struct net_bridge *br, bool on, struct netlink_ext_ack *extack); size_t br_mst_info_size(const struct net_bridge_vlan_group *vg); int br_mst_fill_info(struct sk_buff *skb, const struct net_bridge_vlan_group *vg); int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr, struct netlink_ext_ack *extack); #else static inline bool br_mst_is_enabled(struct net_bridge *br) { return false; } static inline int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int br_mst_set_enabled(struct net_bridge *br, bool on, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline size_t br_mst_info_size(const struct net_bridge_vlan_group *vg) { return 0; } static inline int br_mst_fill_info(struct sk_buff *skb, const struct net_bridge_vlan_group *vg) { return -EOPNOTSUPP; } static inline int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } #endif struct nf_br_ops { int (*br_dev_xmit_hook)(struct sk_buff *skb); }; extern const struct nf_br_ops __rcu *nf_br_ops; /* br_netfilter.c */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int br_nf_core_init(void); void br_nf_core_fini(void); void br_netfilter_rtable_init(struct net_bridge *); #else static inline int br_nf_core_init(void) { return 0; } static inline void br_nf_core_fini(void) {} #define br_netfilter_rtable_init(x) #endif /* br_stp.c */ void br_set_state(struct net_bridge_port *p, unsigned int state); struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no); void br_init_port(struct net_bridge_port *p); void br_become_designated_port(struct net_bridge_port *p); void __br_set_forward_delay(struct net_bridge *br, unsigned long t); int br_set_forward_delay(struct net_bridge *br, unsigned long x); int br_set_hello_time(struct net_bridge *br, unsigned long x); int br_set_max_age(struct net_bridge *br, unsigned long x); int __set_ageing_time(struct net_device *dev, unsigned long t); int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time); /* br_stp_if.c */ void br_stp_enable_bridge(struct net_bridge *br); void br_stp_disable_bridge(struct net_bridge *br); int br_stp_set_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); void br_stp_enable_port(struct net_bridge_port *p); void br_stp_disable_port(struct net_bridge_port *p); bool br_stp_recalculate_bridge_id(struct net_bridge *br); void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a); void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio); int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio); int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost); ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id); /* br_stp_bpdu.c */ struct stp_proto; void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb, struct net_device *dev); /* br_stp_timer.c */ void br_stp_timer_init(struct net_bridge *br); void br_stp_port_timer_init(struct net_bridge_port *p); unsigned long br_timer_value(const struct timer_list *timer); /* br.c */ #if IS_ENABLED(CONFIG_ATM_LANE) extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr); #endif /* br_mrp.c */ #if IS_ENABLED(CONFIG_BRIDGE_MRP) int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); bool br_mrp_enabled(struct net_bridge *br); void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p); int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br); #else static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline bool br_mrp_enabled(struct net_bridge *br) { return false; } static inline void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p) { } static inline int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br) { return 0; } #endif /* br_cfm.c */ #if IS_ENABLED(CONFIG_BRIDGE_CFM) int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); bool br_cfm_created(struct net_bridge *br); void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *p); int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br); int br_cfm_status_fill_info(struct sk_buff *skb, struct net_bridge *br, bool getlink); int br_cfm_mep_count(struct net_bridge *br, u32 *count); int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count); #else static inline int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline bool br_cfm_created(struct net_bridge *br) { return false; } static inline void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *p) { } static inline int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br) { return -EOPNOTSUPP; } static inline int br_cfm_status_fill_info(struct sk_buff *skb, struct net_bridge *br, bool getlink) { return -EOPNOTSUPP; } static inline int br_cfm_mep_count(struct net_bridge *br, u32 *count) { *count = 0; return -EOPNOTSUPP; } static inline int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count) { *count = 0; return -EOPNOTSUPP; } #endif /* br_netlink.c */ extern struct rtnl_link_ops br_link_ops; int br_netlink_init(void); void br_netlink_fini(void); void br_ifinfo_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port); void br_info_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port, u32 filter); int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags, struct netlink_ext_ack *extack); int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags); int br_process_vlan_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct bridge_vlan_info *vinfo_curr, struct bridge_vlan_info **vinfo_last, bool *changed, struct netlink_ext_ack *extack); #ifdef CONFIG_SYSFS /* br_sysfs_if.c */ extern const struct sysfs_ops brport_sysfs_ops; int br_sysfs_addif(struct net_bridge_port *p); int br_sysfs_renameif(struct net_bridge_port *p); /* br_sysfs_br.c */ int br_sysfs_addbr(struct net_device *dev); void br_sysfs_delbr(struct net_device *dev); #else static inline int br_sysfs_addif(struct net_bridge_port *p) { return 0; } static inline int br_sysfs_renameif(struct net_bridge_port *p) { return 0; } static inline int br_sysfs_addbr(struct net_device *dev) { return 0; } static inline void br_sysfs_delbr(struct net_device *dev) { return; } #endif /* CONFIG_SYSFS */ /* br_switchdev.c */ #ifdef CONFIG_NET_SWITCHDEV int br_switchdev_port_offload(struct net_bridge_port *p, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, bool tx_fwd_offload, struct netlink_ext_ack *extack); void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb); int br_switchdev_port_replay(struct net_bridge_port *p, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, struct netlink_ext_ack *extack); bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb); void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb); void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, struct sk_buff *skb); void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, struct sk_buff *skb); void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb); bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, const struct sk_buff *skb); int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, unsigned long mask, struct netlink_ext_ack *extack); void br_switchdev_fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type); void br_switchdev_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type); int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, bool changed, struct netlink_ext_ack *extack); int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid); void br_switchdev_init(struct net_bridge *br); static inline void br_switchdev_frame_unmark(struct sk_buff *skb) { skb->offload_fwd_mark = 0; } #else static inline int br_switchdev_port_offload(struct net_bridge_port *p, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, bool tx_fwd_offload, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb) { } static inline int br_switchdev_port_replay(struct net_bridge_port *p, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) { return false; } static inline void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb) { } static inline void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, struct sk_buff *skb) { } static inline void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, struct sk_buff *skb) { } static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb) { } static inline bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, const struct sk_buff *skb) { return true; } static inline int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, unsigned long mask, struct netlink_ext_ack *extack) { return 0; } static inline int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, bool changed, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) { return -EOPNOTSUPP; } static inline void br_switchdev_fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type) { } static inline void br_switchdev_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { } static inline void br_switchdev_frame_unmark(struct sk_buff *skb) { } static inline void br_switchdev_init(struct net_bridge *br) { } #endif /* CONFIG_NET_SWITCHDEV */ /* br_arp_nd_proxy.c */ void br_recalculate_neigh_suppress_enabled(struct net_bridge *br); void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, u16 vid, struct net_bridge_port *p); void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, u16 vid, struct net_bridge_port *p, struct nd_msg *msg); struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *m); bool br_is_neigh_suppress_enabled(const struct net_bridge_port *p, u16 vid); #endif
690 692 687 58 177 176 18 501 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) /* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ * * This implementation is specifically for SipHash2-4 for a secure PRF * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for * hashtables. */ #include <linux/siphash.h> #include <linux/unaligned.h> #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 #include <linux/dcache.h> #include <asm/word-at-a-time.h> #endif #define SIPROUND SIPHASH_PERMUTATION(v0, v1, v2, v3) #define PREAMBLE(len) \ u64 v0 = SIPHASH_CONST_0; \ u64 v1 = SIPHASH_CONST_1; \ u64 v2 = SIPHASH_CONST_2; \ u64 v3 = SIPHASH_CONST_3; \ u64 b = ((u64)(len)) << 56; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define POSTAMBLE \ v3 ^= b; \ SIPROUND; \ SIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_aligned); #endif u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_unaligned); /** * siphash_1u64 - compute 64-bit siphash PRF value of a u64 * @first: first u64 * @key: the siphash key */ u64 siphash_1u64(const u64 first, const siphash_key_t *key) { PREAMBLE(8) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u64); /** * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64 * @first: first u64 * @second: second u64 * @key: the siphash key */ u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key) { PREAMBLE(16) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; POSTAMBLE } EXPORT_SYMBOL(siphash_2u64); /** * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @key: the siphash key */ u64 siphash_3u64(const u64 first, const u64 second, const u64 third, const siphash_key_t *key) { PREAMBLE(24) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u64); /** * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @forth: forth u64 * @key: the siphash key */ u64 siphash_4u64(const u64 first, const u64 second, const u64 third, const u64 forth, const siphash_key_t *key) { PREAMBLE(32) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; v3 ^= forth; SIPROUND; SIPROUND; v0 ^= forth; POSTAMBLE } EXPORT_SYMBOL(siphash_4u64); u64 siphash_1u32(const u32 first, const siphash_key_t *key) { PREAMBLE(4) b |= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u32); u64 siphash_3u32(const u32 first, const u32 second, const u32 third, const siphash_key_t *key) { u64 combined = (u64)second << 32 | first; PREAMBLE(12) v3 ^= combined; SIPROUND; SIPROUND; v0 ^= combined; b |= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u32); #if BITS_PER_LONG == 64 /* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3. */ #define HSIPROUND SIPROUND #define HPREAMBLE(len) PREAMBLE(len) #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) b |= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(8) v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(12) v3 ^= combined; HSIPROUND; v0 ^= combined; b |= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(16) v3 ^= combined; HSIPROUND; v0 ^= combined; combined = (u64)forth << 32 | third; v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #else #define HSIPROUND HSIPHASH_PERMUTATION(v0, v1, v2, v3) #define HPREAMBLE(len) \ u32 v0 = HSIPHASH_CONST_0; \ u32 v1 = HSIPHASH_CONST_1; \ u32 v2 = HSIPHASH_CONST_2; \ u32 v3 = HSIPHASH_CONST_3; \ u32 b = ((u32)(len)) << 24; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return v1 ^ v3; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = le32_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = get_unaligned_le32(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) v3 ^= first; HSIPROUND; v0 ^= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { HPREAMBLE(8) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { HPREAMBLE(12) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { HPREAMBLE(16) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; v3 ^= forth; HSIPROUND; v0 ^= forth; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2005,2006,2007,2008 IBM Corporation * * Authors: * Reiner Sailer <sailer@watson.ibm.com> * Mimi Zohar <zohar@us.ibm.com> * * File: ima.h * internal Integrity Measurement Architecture (IMA) definitions */ #ifndef __LINUX_IMA_H #define __LINUX_IMA_H #include <linux/types.h> #include <linux/crypto.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/hash.h> #include <linux/tpm.h> #include <linux/audit.h> #include <crypto/hash_info.h> #include "../integrity.h" enum ima_show_type { IMA_SHOW_BINARY, IMA_SHOW_BINARY_NO_FIELD_LEN, IMA_SHOW_BINARY_OLD_STRING_FMT, IMA_SHOW_ASCII }; enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8, TPM_PCR10 = 10 }; /* digest size for IMA, fits SHA1 or MD5 */ #define IMA_DIGEST_SIZE SHA1_DIGEST_SIZE #define IMA_EVENT_NAME_LEN_MAX 255 #define IMA_HASH_BITS 10 #define IMA_MEASURE_HTABLE_SIZE (1 << IMA_HASH_BITS) #define IMA_TEMPLATE_FIELD_ID_MAX_LEN 16 #define IMA_TEMPLATE_NUM_FIELDS_MAX 15 #define IMA_TEMPLATE_IMA_NAME "ima" #define IMA_TEMPLATE_IMA_FMT "d|n" #define NR_BANKS(chip) ((chip != NULL) ? chip->nr_allocated_banks : 0) /* current content of the policy */ extern int ima_policy_flag; /* bitset of digests algorithms allowed in the setxattr hook */ extern atomic_t ima_setxattr_allowed_hash_algorithms; /* IMA hash algorithm description */ struct ima_algo_desc { struct crypto_shash *tfm; enum hash_algo algo; }; /* set during initialization */ extern int ima_hash_algo __ro_after_init; extern int ima_sha1_idx __ro_after_init; extern int ima_hash_algo_idx __ro_after_init; extern int ima_extra_slots __ro_after_init; extern struct ima_algo_desc *ima_algo_array __ro_after_init; extern int ima_appraise; extern struct tpm_chip *ima_tpm_chip; extern const char boot_aggregate_name[]; /* IMA event related data */ struct ima_event_data { struct ima_iint_cache *iint; struct file *file; const unsigned char *filename; struct evm_ima_xattr_data *xattr_value; int xattr_len; const struct modsig *modsig; const char *violation; const void *buf; int buf_len; }; /* IMA template field data definition */ struct ima_field_data { u8 *data; u32 len; }; /* IMA template field definition */ struct ima_template_field { const char field_id[IMA_TEMPLATE_FIELD_ID_MAX_LEN]; int (*field_init)(struct ima_event_data *event_data, struct ima_field_data *field_data); void (*field_show)(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data); }; /* IMA template descriptor definition */ struct ima_template_desc { struct list_head list; char *name; char *fmt; int num_fields; const struct ima_template_field **fields; }; struct ima_template_entry { int pcr; struct tpm_digest *digests; struct ima_template_desc *template_desc; /* template descriptor */ u32 template_data_len; struct ima_field_data template_data[]; /* template related data */ }; struct ima_queue_entry { struct hlist_node hnext; /* place in hash collision list */ struct list_head later; /* place in ima_measurements list */ struct ima_template_entry *entry; }; extern struct list_head ima_measurements; /* list of all measurements */ /* Some details preceding the binary serialized measurement list */ struct ima_kexec_hdr { u16 version; u16 _reserved0; u32 _reserved1; u64 buffer_size; u64 count; }; /* IMA iint action cache flags */ #define IMA_MEASURE 0x00000001 #define IMA_MEASURED 0x00000002 #define IMA_APPRAISE 0x00000004 #define IMA_APPRAISED 0x00000008 /*#define IMA_COLLECT 0x00000010 do not use this flag */ #define IMA_COLLECTED 0x00000020 #define IMA_AUDIT 0x00000040 #define IMA_AUDITED 0x00000080 #define IMA_HASH 0x00000100 #define IMA_HASHED 0x00000200 /* IMA iint policy rule cache flags */ #define IMA_NONACTION_FLAGS 0xff000000 #define IMA_DIGSIG_REQUIRED 0x01000000 #define IMA_PERMIT_DIRECTIO 0x02000000 #define IMA_NEW_FILE 0x04000000 #define IMA_FAIL_UNVERIFIABLE_SIGS 0x10000000 #define IMA_MODSIG_ALLOWED 0x20000000 #define IMA_CHECK_BLACKLIST 0x40000000 #define IMA_VERITY_REQUIRED 0x80000000 #define IMA_DO_MASK (IMA_MEASURE | IMA_APPRAISE | IMA_AUDIT | \ IMA_HASH | IMA_APPRAISE_SUBMASK) #define IMA_DONE_MASK (IMA_MEASURED | IMA_APPRAISED | IMA_AUDITED | \ IMA_HASHED | IMA_COLLECTED | \ IMA_APPRAISED_SUBMASK) /* IMA iint subaction appraise cache flags */ #define IMA_FILE_APPRAISE 0x00001000 #define IMA_FILE_APPRAISED 0x00002000 #define IMA_MMAP_APPRAISE 0x00004000 #define IMA_MMAP_APPRAISED 0x00008000 #define IMA_BPRM_APPRAISE 0x00010000 #define IMA_BPRM_APPRAISED 0x00020000 #define IMA_READ_APPRAISE 0x00040000 #define IMA_READ_APPRAISED 0x00080000 #define IMA_CREDS_APPRAISE 0x00100000 #define IMA_CREDS_APPRAISED 0x00200000 #define IMA_APPRAISE_SUBMASK (IMA_FILE_APPRAISE | IMA_MMAP_APPRAISE | \ IMA_BPRM_APPRAISE | IMA_READ_APPRAISE | \ IMA_CREDS_APPRAISE) #define IMA_APPRAISED_SUBMASK (IMA_FILE_APPRAISED | IMA_MMAP_APPRAISED | \ IMA_BPRM_APPRAISED | IMA_READ_APPRAISED | \ IMA_CREDS_APPRAISED) /* IMA iint cache atomic_flags */ #define IMA_CHANGE_XATTR 0 #define IMA_UPDATE_XATTR 1 #define IMA_CHANGE_ATTR 2 #define IMA_DIGSIG 3 #define IMA_MUST_MEASURE 4 /* IMA integrity metadata associated with an inode */ struct ima_iint_cache { struct mutex mutex; /* protects: version, flags, digest */ struct integrity_inode_attributes real_inode; unsigned long flags; unsigned long measured_pcrs; unsigned long atomic_flags; enum integrity_status ima_file_status:4; enum integrity_status ima_mmap_status:4; enum integrity_status ima_bprm_status:4; enum integrity_status ima_read_status:4; enum integrity_status ima_creds_status:4; struct ima_digest_data *ima_hash; }; extern struct lsm_blob_sizes ima_blob_sizes; static inline struct ima_iint_cache * ima_inode_get_iint(const struct inode *inode) { struct ima_iint_cache **iint_sec; if (unlikely(!inode->i_security)) return NULL; iint_sec = inode->i_security + ima_blob_sizes.lbs_inode; return *iint_sec; } static inline void ima_inode_set_iint(const struct inode *inode, struct ima_iint_cache *iint) { struct ima_iint_cache **iint_sec; if (unlikely(!inode->i_security)) return; iint_sec = inode->i_security + ima_blob_sizes.lbs_inode; *iint_sec = iint; } struct ima_iint_cache *ima_iint_find(struct inode *inode); struct ima_iint_cache *ima_inode_get(struct inode *inode); void ima_inode_free_rcu(void *inode_security); void __init ima_iintcache_init(void); extern const int read_idmap[]; #ifdef CONFIG_HAVE_IMA_KEXEC void ima_load_kexec_buffer(void); #else static inline void ima_load_kexec_buffer(void) {} #endif /* CONFIG_HAVE_IMA_KEXEC */ #ifdef CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS void ima_post_key_create_or_update(struct key *keyring, struct key *key, const void *payload, size_t plen, unsigned long flags, bool create); #endif /* * The default binary_runtime_measurements list format is defined as the * platform native format. The canonical format is defined as little-endian. */ extern bool ima_canonical_fmt; /* Internal IMA function definitions */ int ima_init(void); int ima_fs_init(void); int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, const unsigned char *filename); int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash); int ima_calc_buffer_hash(const void *buf, loff_t len, struct ima_digest_data *hash); int ima_calc_field_array_hash(struct ima_field_data *field_data, struct ima_template_entry *entry); int ima_calc_boot_aggregate(struct ima_digest_data *hash); void ima_add_violation(struct file *file, const unsigned char *filename, struct ima_iint_cache *iint, const char *op, const char *cause); int ima_init_crypto(void); void ima_putc(struct seq_file *m, void *data, int datalen); void ima_print_digest(struct seq_file *m, u8 *digest, u32 size); int template_desc_init_fields(const char *template_fmt, const struct ima_template_field ***fields, int *num_fields); struct ima_template_desc *ima_template_desc_current(void); struct ima_template_desc *ima_template_desc_buf(void); struct ima_template_desc *lookup_template_desc(const char *name); bool ima_template_has_modsig(const struct ima_template_desc *ima_template); int ima_restore_measurement_entry(struct ima_template_entry *entry); int ima_restore_measurement_list(loff_t bufsize, void *buf); int ima_measurements_show(struct seq_file *m, void *v); unsigned long ima_get_binary_runtime_size(void); int ima_init_template(void); void ima_init_template_list(void); int __init ima_init_digests(void); void __init ima_init_reboot_notifier(void); int ima_lsm_policy_change(struct notifier_block *nb, unsigned long event, void *lsm_data); /* * used to protect h_table and sha_table */ extern spinlock_t ima_queue_lock; struct ima_h_table { atomic_long_t len; /* number of stored measurements in the list */ atomic_long_t violations; struct hlist_head queue[IMA_MEASURE_HTABLE_SIZE]; }; extern struct ima_h_table ima_htable; static inline unsigned int ima_hash_key(u8 *digest) { /* there is no point in taking a hash of part of a digest */ return (digest[0] | digest[1] << 8) % IMA_MEASURE_HTABLE_SIZE; } #define __ima_hooks(hook) \ hook(NONE, none) \ hook(FILE_CHECK, file) \ hook(MMAP_CHECK, mmap) \ hook(MMAP_CHECK_REQPROT, mmap_reqprot) \ hook(BPRM_CHECK, bprm) \ hook(CREDS_CHECK, creds) \ hook(POST_SETATTR, post_setattr) \ hook(MODULE_CHECK, module) \ hook(FIRMWARE_CHECK, firmware) \ hook(KEXEC_KERNEL_CHECK, kexec_kernel) \ hook(KEXEC_INITRAMFS_CHECK, kexec_initramfs) \ hook(POLICY_CHECK, policy) \ hook(KEXEC_CMDLINE, kexec_cmdline) \ hook(KEY_CHECK, key) \ hook(CRITICAL_DATA, critical_data) \ hook(SETXATTR_CHECK, setxattr_check) \ hook(MAX_CHECK, none) #define __ima_hook_enumify(ENUM, str) ENUM, #define __ima_stringify(arg) (#arg) #define __ima_hook_measuring_stringify(ENUM, str) \ (__ima_stringify(measuring_ ##str)), enum ima_hooks { __ima_hooks(__ima_hook_enumify) }; static const char * const ima_hooks_measure_str[] = { __ima_hooks(__ima_hook_measuring_stringify) }; static inline const char *func_measure_str(enum ima_hooks func) { if (func >= MAX_CHECK) return ima_hooks_measure_str[NONE]; return ima_hooks_measure_str[func]; } extern const char *const func_tokens[]; struct modsig; #ifdef CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS /* * To track keys that need to be measured. */ struct ima_key_entry { struct list_head list; void *payload; size_t payload_len; char *keyring_name; }; void ima_init_key_queue(void); bool ima_should_queue_key(void); bool ima_queue_key(struct key *keyring, const void *payload, size_t payload_len); void ima_process_queued_keys(void); #else static inline void ima_init_key_queue(void) {} static inline bool ima_should_queue_key(void) { return false; } static inline bool ima_queue_key(struct key *keyring, const void *payload, size_t payload_len) { return false; } static inline void ima_process_queued_keys(void) {} #endif /* CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS */ /* LIM API function definitions */ int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos); int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func); int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file, void *buf, loff_t size, enum hash_algo algo, struct modsig *modsig); void ima_store_measurement(struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc); int process_buffer_measurement(struct mnt_idmap *idmap, struct inode *inode, const void *buf, int size, const char *eventname, enum ima_hooks func, int pcr, const char *func_data, bool buf_hash, u8 *digest, size_t digest_len); void ima_audit_measurement(struct ima_iint_cache *iint, const unsigned char *filename); int ima_alloc_init_template(struct ima_event_data *event_data, struct ima_template_entry **entry, struct ima_template_desc *template_desc); int ima_store_template(struct ima_template_entry *entry, int violation, struct inode *inode, const unsigned char *filename, int pcr); void ima_free_template_entry(struct ima_template_entry *entry); const char *ima_d_path(const struct path *path, char **pathbuf, char *filename); /* IMA policy related functions */ int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, enum ima_hooks func, int mask, int flags, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos); void ima_init_policy(void); void ima_update_policy(void); void ima_update_policy_flags(void); ssize_t ima_parse_add_rule(char *); void ima_delete_rules(void); int ima_check_policy(void); void *ima_policy_start(struct seq_file *m, loff_t *pos); void *ima_policy_next(struct seq_file *m, void *v, loff_t *pos); void ima_policy_stop(struct seq_file *m, void *v); int ima_policy_show(struct seq_file *m, void *v); /* Appraise integrity measurements */ #define IMA_APPRAISE_ENFORCE 0x01 #define IMA_APPRAISE_FIX 0x02 #define IMA_APPRAISE_LOG 0x04 #define IMA_APPRAISE_MODULES 0x08 #define IMA_APPRAISE_FIRMWARE 0x10 #define IMA_APPRAISE_POLICY 0x20 #define IMA_APPRAISE_KEXEC 0x40 #ifdef CONFIG_IMA_APPRAISE int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr); int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig); int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func); void ima_update_xattr(struct ima_iint_cache *iint, struct file *file); enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func); enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len); int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len); void __init init_ima_appraise_lsm(const struct lsm_id *lsmid); #else static inline int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr) { return 0; } static inline int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig) { return INTEGRITY_UNKNOWN; } static inline int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { return 0; } static inline void ima_update_xattr(struct ima_iint_cache *iint, struct file *file) { } static inline enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func) { return INTEGRITY_UNKNOWN; } static inline enum hash_algo ima_get_hash_algo(struct evm_ima_xattr_data *xattr_value, int xattr_len) { return ima_hash_algo; } static inline int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { return 0; } static inline void __init init_ima_appraise_lsm(const struct lsm_id *lsmid) { } #endif /* CONFIG_IMA_APPRAISE */ #ifdef CONFIG_IMA_APPRAISE_MODSIG int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig); void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size); int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, const u8 **digest, u32 *digest_size); int ima_get_raw_modsig(const struct modsig *modsig, const void **data, u32 *data_len); void ima_free_modsig(struct modsig *modsig); #else static inline int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig) { return -EOPNOTSUPP; } static inline void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size) { } static inline int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, const u8 **digest, u32 *digest_size) { return -EOPNOTSUPP; } static inline int ima_get_raw_modsig(const struct modsig *modsig, const void **data, u32 *data_len) { return -EOPNOTSUPP; } static inline void ima_free_modsig(struct modsig *modsig) { } #endif /* CONFIG_IMA_APPRAISE_MODSIG */ /* LSM based policy rules require audit */ #ifdef CONFIG_IMA_LSM_RULES #define ima_filter_rule_init security_audit_rule_init #define ima_filter_rule_free security_audit_rule_free #define ima_filter_rule_match security_audit_rule_match #else static inline int ima_filter_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, gfp_t gfp) { return -EINVAL; } static inline void ima_filter_rule_free(void *lsmrule) { } static inline int ima_filter_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *lsmrule) { return -EINVAL; } #endif /* CONFIG_IMA_LSM_RULES */ #ifdef CONFIG_IMA_READ_POLICY #define POLICY_FILE_FLAGS (S_IWUSR | S_IRUSR) #else #define POLICY_FILE_FLAGS S_IWUSR #endif /* CONFIG_IMA_READ_POLICY */ #endif /* __LINUX_IMA_H */
31 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2018-2024 Intel Corporation */ #ifndef IEEE80211_I_H #define IEEE80211_I_H #include <linux/kernel.h> #include <linux/device.h> #include <linux/if_ether.h> #include <linux/interrupt.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/workqueue.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/etherdevice.h> #include <linux/leds.h> #include <linux/idr.h> #include <linux/rhashtable.h> #include <linux/rbtree.h> #include <kunit/visibility.h> #include <net/ieee80211_radiotap.h> #include <net/cfg80211.h> #include <net/mac80211.h> #include <net/fq.h> #include "key.h" #include "sta_info.h" #include "debug.h" #include "drop.h" extern const struct cfg80211_ops mac80211_config_ops; struct ieee80211_local; struct ieee80211_mesh_fast_tx; /* Maximum number of broadcast/multicast frames to buffer when some of the * associated stations are using power saving. */ #define AP_MAX_BC_BUFFER 128 /* Maximum number of frames buffered to all STAs, including multicast frames. * Note: increasing this limit increases the potential memory requirement. Each * frame can be up to about 2 kB long. */ #define TOTAL_MAX_TX_BUFFER 512 /* Required encryption head and tailroom */ #define IEEE80211_ENCRYPT_HEADROOM 8 #define IEEE80211_ENCRYPT_TAILROOM 18 /* power level hasn't been configured (or set to automatic) */ #define IEEE80211_UNSET_POWER_LEVEL INT_MIN /* * Some APs experience problems when working with U-APSD. Decreasing the * probability of that happening by using legacy mode for all ACs but VO isn't * enough. * * Cisco 4410N originally forced us to enable VO by default only because it * treated non-VO ACs as legacy. * * However some APs (notably Netgear R7000) silently reclassify packets to * different ACs. Since u-APSD ACs require trigger frames for frame retrieval * clients would never see some frames (e.g. ARP responses) or would fetch them * accidentally after a long time. * * It makes little sense to enable u-APSD queues by default because it needs * userspace applications to be aware of it to actually take advantage of the * possible additional powersavings. Implicitly depending on driver autotrigger * frame support doesn't make much sense. */ #define IEEE80211_DEFAULT_UAPSD_QUEUES 0 #define IEEE80211_DEFAULT_MAX_SP_LEN \ IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS]; #define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */) #define IEEE80211_MAX_NAN_INSTANCE_ID 255 enum ieee80211_status_data { IEEE80211_STATUS_TYPE_MASK = 0x00f, IEEE80211_STATUS_TYPE_INVALID = 0, IEEE80211_STATUS_TYPE_SMPS = 1, IEEE80211_STATUS_TYPE_NEG_TTLM = 2, IEEE80211_STATUS_SUBDATA_MASK = 0x1ff0, }; static inline bool ieee80211_sta_keep_active(struct sta_info *sta, u8 ac) { /* Keep a station's queues on the active list for deficit accounting * purposes if it was active or queued during the last 100ms. */ return time_before_eq(jiffies, sta->airtime[ac].last_active + HZ / 10); } struct ieee80211_bss { u32 device_ts_beacon, device_ts_presp; bool wmm_used; bool uapsd_supported; #define IEEE80211_MAX_SUPP_RATES 32 u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; size_t supp_rates_len; struct ieee80211_rate *beacon_rate; u32 vht_cap_info; /* * During association, we save an ERP value from a probe response so * that we can feed ERP info to the driver when handling the * association completes. these fields probably won't be up-to-date * otherwise, you probably don't want to use them. */ bool has_erp_value; u8 erp_value; /* Keep track of the corruption of the last beacon/probe response. */ u8 corrupt_data; /* Keep track of what bits of information we have valid info for. */ u8 valid_data; }; /** * enum ieee80211_bss_corrupt_data_flags - BSS data corruption flags * @IEEE80211_BSS_CORRUPT_BEACON: last beacon frame received was corrupted * @IEEE80211_BSS_CORRUPT_PROBE_RESP: last probe response received was corrupted * * These are bss flags that are attached to a bss in the * @corrupt_data field of &struct ieee80211_bss. */ enum ieee80211_bss_corrupt_data_flags { IEEE80211_BSS_CORRUPT_BEACON = BIT(0), IEEE80211_BSS_CORRUPT_PROBE_RESP = BIT(1) }; /** * enum ieee80211_bss_valid_data_flags - BSS valid data flags * @IEEE80211_BSS_VALID_WMM: WMM/UAPSD data was gathered from non-corrupt IE * @IEEE80211_BSS_VALID_RATES: Supported rates were gathered from non-corrupt IE * @IEEE80211_BSS_VALID_ERP: ERP flag was gathered from non-corrupt IE * * These are bss flags that are attached to a bss in the * @valid_data field of &struct ieee80211_bss. They show which parts * of the data structure were received as a result of an un-corrupted * beacon/probe response. */ enum ieee80211_bss_valid_data_flags { IEEE80211_BSS_VALID_WMM = BIT(1), IEEE80211_BSS_VALID_RATES = BIT(2), IEEE80211_BSS_VALID_ERP = BIT(3) }; typedef unsigned __bitwise ieee80211_tx_result; #define TX_CONTINUE ((__force ieee80211_tx_result) 0u) #define TX_DROP ((__force ieee80211_tx_result) 1u) #define TX_QUEUED ((__force ieee80211_tx_result) 2u) #define IEEE80211_TX_UNICAST BIT(1) #define IEEE80211_TX_PS_BUFFERED BIT(2) struct ieee80211_tx_data { struct sk_buff *skb; struct sk_buff_head skbs; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; struct sta_info *sta; struct ieee80211_key *key; struct ieee80211_tx_rate rate; unsigned int flags; }; /** * enum ieee80211_packet_rx_flags - packet RX flags * @IEEE80211_RX_AMSDU: a-MSDU packet * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed * @IEEE80211_RX_DEFERRED_RELEASE: frame was subjected to receive reordering * * These are per-frame flags that are attached to a frame in the * @rx_flags field of &struct ieee80211_rx_status. */ enum ieee80211_packet_rx_flags { IEEE80211_RX_AMSDU = BIT(3), IEEE80211_RX_MALFORMED_ACTION_FRM = BIT(4), IEEE80211_RX_DEFERRED_RELEASE = BIT(5), }; /** * enum ieee80211_rx_flags - RX data flags * * @IEEE80211_RX_CMNTR: received on cooked monitor already * @IEEE80211_RX_BEACON_REPORTED: This frame was already reported * to cfg80211_report_obss_beacon(). * * These flags are used across handling multiple interfaces * for a single frame. */ enum ieee80211_rx_flags { IEEE80211_RX_CMNTR = BIT(0), IEEE80211_RX_BEACON_REPORTED = BIT(1), }; struct ieee80211_rx_data { struct list_head *list; struct sk_buff *skb; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; struct ieee80211_link_data *link; struct sta_info *sta; struct link_sta_info *link_sta; struct ieee80211_key *key; unsigned int flags; /* * Index into sequence numbers array, 0..16 * since the last (16) is used for non-QoS, * will be 16 on non-QoS frames. */ int seqno_idx; /* * Index into the security IV/PN arrays, 0..16 * since the last (16) is used for CCMP-encrypted * management frames, will be set to 16 on mgmt * frames and 0 on non-QoS frames. */ int security_idx; int link_id; union { struct { u32 iv32; u16 iv16; } tkip; struct { u8 pn[IEEE80211_CCMP_PN_LEN]; } ccm_gcm; }; }; struct ieee80211_csa_settings { const u16 *counter_offsets_beacon; const u16 *counter_offsets_presp; int n_counter_offsets_beacon; int n_counter_offsets_presp; u8 count; }; struct ieee80211_color_change_settings { u16 counter_offset_beacon; u16 counter_offset_presp; u8 count; }; struct beacon_data { u8 *head, *tail; int head_len, tail_len; struct ieee80211_meshconf_ie *meshconf; u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; u8 cntdwn_current_counter; struct cfg80211_mbssid_elems *mbssid_ies; struct cfg80211_rnr_elems *rnr_ies; struct rcu_head rcu_head; }; struct probe_resp { struct rcu_head rcu_head; int len; u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; u8 data[]; }; struct fils_discovery_data { struct rcu_head rcu_head; int len; u8 data[]; }; struct unsol_bcast_probe_resp_data { struct rcu_head rcu_head; int len; u8 data[]; }; struct ps_data { /* yes, this looks ugly, but guarantees that we can later use * bitmap_empty :) * NB: don't touch this bitmap, use sta_info_{set,clear}_tim_bit */ u8 tim[sizeof(unsigned long) * BITS_TO_LONGS(IEEE80211_MAX_AID + 1)] __aligned(__alignof__(unsigned long)); struct sk_buff_head bc_buf; atomic_t num_sta_ps; /* number of stations in PS mode */ int dtim_count; bool dtim_bc_mc; }; struct ieee80211_if_ap { struct list_head vlans; /* write-protected with RTNL and local->mtx */ struct ps_data ps; atomic_t num_mcast_sta; /* number of stations receiving multicast */ bool multicast_to_unicast; bool active; }; struct ieee80211_if_vlan { struct list_head list; /* write-protected with RTNL and local->mtx */ /* used for all tx if the VLAN is configured to 4-addr mode */ struct sta_info __rcu *sta; atomic_t num_mcast_sta; /* number of stations receiving multicast */ }; struct mesh_stats { __u32 fwded_mcast; /* Mesh forwarded multicast frames */ __u32 fwded_unicast; /* Mesh forwarded unicast frames */ __u32 fwded_frames; /* Mesh total forwarded frames */ __u32 dropped_frames_ttl; /* Not transmitted since mesh_ttl == 0*/ __u32 dropped_frames_no_route; /* Not transmitted, no route found */ }; #define PREQ_Q_F_START 0x1 #define PREQ_Q_F_REFRESH 0x2 struct mesh_preq_queue { struct list_head list; u8 dst[ETH_ALEN]; u8 flags; }; struct ieee80211_roc_work { struct list_head list; struct ieee80211_sub_if_data *sdata; struct ieee80211_channel *chan; bool started, abort, hw_begun, notified; bool on_channel; unsigned long start_time; u32 duration, req_duration; struct sk_buff *frame; u64 cookie, mgmt_tx_cookie; enum ieee80211_roc_type type; }; /* flags used in struct ieee80211_if_managed.flags */ enum ieee80211_sta_flags { IEEE80211_STA_CONNECTION_POLL = BIT(1), IEEE80211_STA_CONTROL_PORT = BIT(2), IEEE80211_STA_MFP_ENABLED = BIT(6), IEEE80211_STA_UAPSD_ENABLED = BIT(7), IEEE80211_STA_NULLFUNC_ACKED = BIT(8), IEEE80211_STA_ENABLE_RRM = BIT(15), }; enum ieee80211_conn_mode { IEEE80211_CONN_MODE_S1G, IEEE80211_CONN_MODE_LEGACY, IEEE80211_CONN_MODE_HT, IEEE80211_CONN_MODE_VHT, IEEE80211_CONN_MODE_HE, IEEE80211_CONN_MODE_EHT, }; #define IEEE80211_CONN_MODE_HIGHEST IEEE80211_CONN_MODE_EHT enum ieee80211_conn_bw_limit { IEEE80211_CONN_BW_LIMIT_20, IEEE80211_CONN_BW_LIMIT_40, IEEE80211_CONN_BW_LIMIT_80, IEEE80211_CONN_BW_LIMIT_160, /* also 80+80 */ IEEE80211_CONN_BW_LIMIT_320, }; struct ieee80211_conn_settings { enum ieee80211_conn_mode mode; enum ieee80211_conn_bw_limit bw_limit; }; extern const struct ieee80211_conn_settings ieee80211_conn_settings_unlimited; struct ieee80211_mgd_auth_data { struct cfg80211_bss *bss; unsigned long timeout; int tries; u16 algorithm, expected_transaction; unsigned long userspace_selectors[BITS_TO_LONGS(128)]; u8 key[WLAN_KEY_LEN_WEP104]; u8 key_len, key_idx; bool done, waiting; bool peer_confirmed; bool timeout_started; int link_id; u8 ap_addr[ETH_ALEN] __aligned(2); u16 sae_trans, sae_status; size_t data_len; u8 data[]; }; struct ieee80211_mgd_assoc_data { struct { struct cfg80211_bss *bss; u8 addr[ETH_ALEN] __aligned(2); u8 ap_ht_param; struct ieee80211_vht_cap ap_vht_cap; size_t elems_len; u8 *elems; /* pointing to inside ie[] below */ struct ieee80211_conn_settings conn; u16 status; bool disabled; } link[IEEE80211_MLD_MAX_NUM_LINKS]; u8 ap_addr[ETH_ALEN] __aligned(2); /* this is for a workaround, so we use it only for non-MLO */ const u8 *supp_rates; u8 supp_rates_len; unsigned long userspace_selectors[BITS_TO_LONGS(128)]; unsigned long timeout; int tries; u8 prev_ap_addr[ETH_ALEN]; u8 ssid[IEEE80211_MAX_SSID_LEN]; u8 ssid_len; bool wmm, uapsd; bool need_beacon; bool synced; bool timeout_started; bool comeback; /* whether the AP has requested association comeback */ bool s1g; bool spp_amsdu; unsigned int assoc_link_id; u8 fils_nonces[2 * FILS_NONCE_LEN]; u8 fils_kek[FILS_MAX_KEK_LEN]; size_t fils_kek_len; size_t ie_len; u8 *ie_pos; /* used to fill ie[] with link[].elems */ u8 ie[]; }; struct ieee80211_sta_tx_tspec { /* timestamp of the first packet in the time slice */ unsigned long time_slice_start; u32 admitted_time; /* in usecs, unlike over the air */ u8 tsid; s8 up; /* signed to be able to invalidate with -1 during teardown */ /* consumed TX time in microseconds in the time slice */ u32 consumed_tx_time; enum { TX_TSPEC_ACTION_NONE = 0, TX_TSPEC_ACTION_DOWNGRADE, TX_TSPEC_ACTION_STOP_DOWNGRADE, } action; bool downgraded; }; /* Advertised TID-to-link mapping info */ struct ieee80211_adv_ttlm_info { /* time in TUs at which the new mapping is established, or 0 if there is * no planned advertised TID-to-link mapping */ u16 switch_time; u32 duration; /* duration of the planned T2L map in TUs */ u16 map; /* map of usable links for all TIDs */ bool active; /* whether the advertised mapping is active or not */ }; DECLARE_EWMA(beacon_signal, 4, 4) struct ieee80211_if_managed { struct timer_list timer; struct timer_list conn_mon_timer; struct timer_list bcn_mon_timer; struct wiphy_work monitor_work; struct wiphy_work beacon_connection_loss_work; struct wiphy_work csa_connection_drop_work; unsigned long beacon_timeout; unsigned long probe_timeout; int probe_send_count; bool nullfunc_failed; u8 connection_loss:1, driver_disconnect:1, reconnect:1, associated:1; struct ieee80211_mgd_auth_data *auth_data; struct ieee80211_mgd_assoc_data *assoc_data; bool powersave; /* powersave requested for this iface */ bool broken_ap; /* AP is broken -- turn off powersave */ unsigned int flags; u16 mcast_seq_last; bool status_acked; bool status_received; __le16 status_fc; enum { IEEE80211_MFP_DISABLED, IEEE80211_MFP_OPTIONAL, IEEE80211_MFP_REQUIRED } mfp; /* management frame protection */ /* * Bitmask of enabled u-apsd queues, * IEEE80211_WMM_IE_STA_QOSINFO_AC_BE & co. Needs a new association * to take effect. */ unsigned int uapsd_queues; /* * Maximum number of buffered frames AP can deliver during a * service period, IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL or similar. * Needs a new association to take effect. */ unsigned int uapsd_max_sp_len; u8 use_4addr; /* * State variables for keeping track of RSSI of the AP currently * connected to and informing driver when RSSI has gone * below/above a certain threshold. */ int rssi_min_thold, rssi_max_thold; struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */ struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */ struct ieee80211_vht_cap vht_capa; /* configured VHT overrides */ struct ieee80211_vht_cap vht_capa_mask; /* Valid parts of vht_capa */ struct ieee80211_s1g_cap s1g_capa; /* configured S1G overrides */ struct ieee80211_s1g_cap s1g_capa_mask; /* valid s1g_capa bits */ /* TDLS support */ u8 tdls_peer[ETH_ALEN] __aligned(2); struct wiphy_delayed_work tdls_peer_del_work; struct sk_buff *orig_teardown_skb; /* The original teardown skb */ struct sk_buff *teardown_skb; /* A copy to send through the AP */ spinlock_t teardown_lock; /* To lock changing teardown_skb */ bool tdls_wider_bw_prohibited; /* WMM-AC TSPEC support */ struct ieee80211_sta_tx_tspec tx_tspec[IEEE80211_NUM_ACS]; /* Use a separate work struct so that we can do something here * while the sdata->work is flushing the queues, for example. * otherwise, in scenarios where we hardly get any traffic out * on the BE queue, but there's a lot of VO traffic, we might * get stuck in a downgraded situation and flush takes forever. */ struct wiphy_delayed_work tx_tspec_wk; /* Information elements from the last transmitted (Re)Association * Request frame. */ u8 *assoc_req_ies; size_t assoc_req_ies_len; struct wiphy_delayed_work ml_reconf_work; u16 removed_links; /* TID-to-link mapping support */ struct wiphy_delayed_work ttlm_work; struct ieee80211_adv_ttlm_info ttlm_info; struct wiphy_work teardown_ttlm_work; /* dialog token enumerator for neg TTLM request */ u8 dialog_token_alloc; struct wiphy_delayed_work neg_ttlm_timeout_work; /* Locally initiated multi-link reconfiguration */ struct { struct ieee80211_mgd_assoc_data *add_links_data; struct wiphy_delayed_work wk; u16 removed_links; u16 added_links; u8 dialog_token; } reconf; }; struct ieee80211_if_ibss { struct timer_list timer; struct wiphy_work csa_connection_drop_work; unsigned long last_scan_completed; u32 basic_rates; bool fixed_bssid; bool fixed_channel; bool privacy; bool control_port; bool userspace_handles_dfs; u8 bssid[ETH_ALEN] __aligned(2); u8 ssid[IEEE80211_MAX_SSID_LEN]; u8 ssid_len, ie_len; u8 *ie; struct cfg80211_chan_def chandef; unsigned long ibss_join_req; /* probe response/beacon for IBSS */ struct beacon_data __rcu *presp; struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */ struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */ spinlock_t incomplete_lock; struct list_head incomplete_stations; enum { IEEE80211_IBSS_MLME_SEARCH, IEEE80211_IBSS_MLME_JOINED, } state; }; /** * struct ieee80211_if_ocb - OCB mode state * * @housekeeping_timer: timer for periodic invocation of a housekeeping task * @wrkq_flags: OCB deferred task action * @incomplete_lock: delayed STA insertion lock * @incomplete_stations: list of STAs waiting for delayed insertion * @joined: indication if the interface is connected to an OCB network */ struct ieee80211_if_ocb { struct timer_list housekeeping_timer; unsigned long wrkq_flags; spinlock_t incomplete_lock; struct list_head incomplete_stations; bool joined; }; /** * struct ieee80211_mesh_sync_ops - Extensible synchronization framework interface * * these declarations define the interface, which enables * vendor-specific mesh synchronization * * @rx_bcn_presp: beacon/probe response was received * @adjust_tsf: TSF adjustment method */ struct ieee80211_mesh_sync_ops { void (*rx_bcn_presp)(struct ieee80211_sub_if_data *sdata, u16 stype, struct ieee80211_mgmt *mgmt, unsigned int len, const struct ieee80211_meshconf_ie *mesh_cfg, struct ieee80211_rx_status *rx_status); /* should be called with beacon_data under RCU read lock */ void (*adjust_tsf)(struct ieee80211_sub_if_data *sdata, struct beacon_data *beacon); /* add other framework functions here */ }; struct mesh_csa_settings { struct rcu_head rcu_head; struct cfg80211_csa_settings settings; }; /** * struct mesh_table - mesh hash table * * @known_gates: list of known mesh gates and their mpaths by the station. The * gate's mpath may or may not be resolved and active. * @gates_lock: protects updates to known_gates * @rhead: the rhashtable containing struct mesh_paths, keyed by dest addr * @walk_head: linked list containing all mesh_path objects * @walk_lock: lock protecting walk_head * @entries: number of entries in the table */ struct mesh_table { struct hlist_head known_gates; spinlock_t gates_lock; struct rhashtable rhead; struct hlist_head walk_head; spinlock_t walk_lock; atomic_t entries; /* Up to MAX_MESH_NEIGHBOURS */ }; /** * struct mesh_tx_cache - mesh fast xmit header cache * * @rht: hash table containing struct ieee80211_mesh_fast_tx, using skb DA as key * @walk_head: linked list containing all ieee80211_mesh_fast_tx objects * @walk_lock: lock protecting walk_head and rht */ struct mesh_tx_cache { struct rhashtable rht; struct hlist_head walk_head; spinlock_t walk_lock; }; struct ieee80211_if_mesh { struct timer_list housekeeping_timer; struct timer_list mesh_path_timer; struct timer_list mesh_path_root_timer; unsigned long wrkq_flags; unsigned long mbss_changed[64 / BITS_PER_LONG]; bool userspace_handles_dfs; u8 mesh_id[IEEE80211_MAX_MESH_ID_LEN]; size_t mesh_id_len; /* Active Path Selection Protocol Identifier */ u8 mesh_pp_id; /* Active Path Selection Metric Identifier */ u8 mesh_pm_id; /* Congestion Control Mode Identifier */ u8 mesh_cc_id; /* Synchronization Protocol Identifier */ u8 mesh_sp_id; /* Authentication Protocol Identifier */ u8 mesh_auth_id; /* Local mesh Sequence Number */ u32 sn; /* Last used PREQ ID */ u32 preq_id; atomic_t mpaths; /* Timestamp of last SN update */ unsigned long last_sn_update; /* Time when it's ok to send next PERR */ unsigned long next_perr; /* Timestamp of last PREQ sent */ unsigned long last_preq; struct mesh_rmc *rmc; spinlock_t mesh_preq_queue_lock; struct mesh_preq_queue preq_queue; int preq_queue_len; struct mesh_stats mshstats; struct mesh_config mshcfg; atomic_t estab_plinks; atomic_t mesh_seqnum; bool accepting_plinks; int num_gates; struct beacon_data __rcu *beacon; const u8 *ie; u8 ie_len; enum { IEEE80211_MESH_SEC_NONE = 0x0, IEEE80211_MESH_SEC_AUTHED = 0x1, IEEE80211_MESH_SEC_SECURED = 0x2, } security; bool user_mpm; /* Extensible Synchronization Framework */ const struct ieee80211_mesh_sync_ops *sync_ops; s64 sync_offset_clockdrift_max; spinlock_t sync_offset_lock; /* mesh power save */ enum nl80211_mesh_power_mode nonpeer_pm; int ps_peers_light_sleep; int ps_peers_deep_sleep; struct ps_data ps; /* Channel Switching Support */ struct mesh_csa_settings __rcu *csa; enum { IEEE80211_MESH_CSA_ROLE_NONE, IEEE80211_MESH_CSA_ROLE_INIT, IEEE80211_MESH_CSA_ROLE_REPEATER, } csa_role; u8 chsw_ttl; u16 pre_value; /* offset from skb->data while building IE */ int meshconf_offset; struct mesh_table mesh_paths; struct mesh_table mpp_paths; /* Store paths for MPP&MAP */ int mesh_paths_generation; int mpp_paths_generation; struct mesh_tx_cache tx_cache; }; #ifdef CONFIG_MAC80211_MESH #define IEEE80211_IFSTA_MESH_CTR_INC(msh, name) \ do { (msh)->mshstats.name++; } while (0) #else #define IEEE80211_IFSTA_MESH_CTR_INC(msh, name) \ do { } while (0) #endif /** * enum ieee80211_sub_if_data_flags - virtual interface flags * * @IEEE80211_SDATA_ALLMULTI: interface wants all multicast packets * @IEEE80211_SDATA_DONT_BRIDGE_PACKETS: bridge packets between * associated stations and deliver multicast frames both * back to wireless media and to the local net stack. * @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume. * @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver * @IEEE80211_SDATA_DISCONNECT_HW_RESTART: Disconnect after hardware restart * recovery */ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_ALLMULTI = BIT(0), IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3), IEEE80211_SDATA_DISCONNECT_RESUME = BIT(4), IEEE80211_SDATA_IN_DRIVER = BIT(5), IEEE80211_SDATA_DISCONNECT_HW_RESTART = BIT(6), }; /** * enum ieee80211_sdata_state_bits - virtual interface state bits * @SDATA_STATE_RUNNING: virtual interface is up & running; this * mirrors netif_running() but is separate for interface type * change handling while the interface is up * @SDATA_STATE_OFFCHANNEL: This interface is currently in offchannel * mode, so queues are stopped * @SDATA_STATE_OFFCHANNEL_BEACON_STOPPED: Beaconing was stopped due * to offchannel, reset when offchannel returns */ enum ieee80211_sdata_state_bits { SDATA_STATE_RUNNING, SDATA_STATE_OFFCHANNEL, SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, }; /** * enum ieee80211_chanctx_mode - channel context configuration mode * * @IEEE80211_CHANCTX_SHARED: channel context may be used by * multiple interfaces * @IEEE80211_CHANCTX_EXCLUSIVE: channel context can be used * only by a single interface. This can be used for example for * non-fixed channel IBSS. */ enum ieee80211_chanctx_mode { IEEE80211_CHANCTX_SHARED, IEEE80211_CHANCTX_EXCLUSIVE }; /** * enum ieee80211_chanctx_replace_state - channel context replacement state * * This is used for channel context in-place reservations that require channel * context switch/swap. * * @IEEE80211_CHANCTX_REPLACE_NONE: no replacement is taking place * @IEEE80211_CHANCTX_WILL_BE_REPLACED: this channel context will be replaced * by a (not yet registered) channel context pointed by %replace_ctx. * @IEEE80211_CHANCTX_REPLACES_OTHER: this (not yet registered) channel context * replaces an existing channel context pointed to by %replace_ctx. */ enum ieee80211_chanctx_replace_state { IEEE80211_CHANCTX_REPLACE_NONE, IEEE80211_CHANCTX_WILL_BE_REPLACED, IEEE80211_CHANCTX_REPLACES_OTHER, }; struct ieee80211_chanctx { struct list_head list; struct rcu_head rcu_head; struct list_head assigned_links; struct list_head reserved_links; enum ieee80211_chanctx_replace_state replace_state; struct ieee80211_chanctx *replace_ctx; enum ieee80211_chanctx_mode mode; bool driver_present; /* temporary data for search algorithm etc. */ struct ieee80211_chan_req req; bool radar_detected; /* MUST be last - ends in a flexible-array member. */ struct ieee80211_chanctx_conf conf; }; struct mac80211_qos_map { struct cfg80211_qos_map qos_map; struct rcu_head rcu_head; }; enum txq_info_flags { IEEE80211_TXQ_STOP, IEEE80211_TXQ_AMPDU, IEEE80211_TXQ_NO_AMSDU, IEEE80211_TXQ_DIRTY, }; /** * struct txq_info - per tid queue * * @tin: contains packets split into multiple flows * @def_cvars: codel vars for the @tin's default_flow * @cstats: code statistics for this queue * @frags: used to keep fragments created after dequeue * @schedule_order: used with ieee80211_local->active_txqs * @schedule_round: counter to prevent infinite loops on TXQ scheduling * @flags: TXQ flags from &enum txq_info_flags * @txq: the driver visible part */ struct txq_info { struct fq_tin tin; struct codel_vars def_cvars; struct codel_stats cstats; u16 schedule_round; struct list_head schedule_order; struct sk_buff_head frags; unsigned long flags; /* keep last! */ struct ieee80211_txq txq; }; struct ieee80211_if_mntr { u32 flags; u8 mu_follow_addr[ETH_ALEN] __aligned(2); struct list_head list; }; /** * struct ieee80211_if_nan - NAN state * * @conf: current NAN configuration * @func_lock: lock for @func_inst_ids * @function_inst_ids: a bitmap of available instance_id's */ struct ieee80211_if_nan { struct cfg80211_nan_conf conf; /* protects function_inst_ids */ spinlock_t func_lock; struct idr function_inst_ids; }; struct ieee80211_link_data_managed { u8 bssid[ETH_ALEN] __aligned(2); u8 dtim_period; enum ieee80211_smps_mode req_smps, /* requested smps mode */ driver_smps_mode; /* smps mode request */ struct ieee80211_conn_settings conn; s16 p2p_noa_index; bool tdls_chan_switch_prohibited; bool have_beacon; bool tracking_signal_avg; bool disable_wmm_tracking; bool operating_11g_mode; struct { struct wiphy_delayed_work switch_work; struct cfg80211_chan_def ap_chandef; struct ieee80211_parsed_tpe tpe; unsigned long time; bool waiting_bcn; bool ignored_same_chan; bool blocked_tx; } csa; struct wiphy_work request_smps_work; /* used to reconfigure hardware SM PS */ struct wiphy_work recalc_smps; bool beacon_crc_valid; u32 beacon_crc; struct ewma_beacon_signal ave_beacon_signal; int last_ave_beacon_signal; /* * Number of Beacon frames used in ave_beacon_signal. This can be used * to avoid generating less reliable cqm events that would be based * only on couple of received frames. */ unsigned int count_beacon_signal; /* Number of times beacon loss was invoked. */ unsigned int beacon_loss_count; /* * Last Beacon frame signal strength average (ave_beacon_signal / 16) * that triggered a cqm event. 0 indicates that no event has been * generated for the current association. */ int last_cqm_event_signal; int wmm_last_param_set; int mu_edca_last_param_set; }; struct ieee80211_link_data_ap { struct beacon_data __rcu *beacon; struct probe_resp __rcu *probe_resp; struct fils_discovery_data __rcu *fils_discovery; struct unsol_bcast_probe_resp_data __rcu *unsol_bcast_probe_resp; /* to be used after channel switch. */ struct cfg80211_beacon_data *next_beacon; }; struct ieee80211_link_data { struct ieee80211_sub_if_data *sdata; unsigned int link_id; struct list_head assigned_chanctx_list; /* protected by wiphy mutex */ struct list_head reserved_chanctx_list; /* protected by wiphy mutex */ /* multicast keys only */ struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + NUM_DEFAULT_BEACON_KEYS]; struct ieee80211_key __rcu *default_multicast_key; struct ieee80211_key __rcu *default_mgmt_key; struct ieee80211_key __rcu *default_beacon_key; bool operating_11g_mode; struct { struct wiphy_work finalize_work; struct ieee80211_chan_req chanreq; } csa; struct wiphy_work color_change_finalize_work; struct wiphy_delayed_work color_collision_detect_work; u64 color_bitmap; /* context reservation -- protected with wiphy mutex */ struct ieee80211_chanctx *reserved_chanctx; struct ieee80211_chan_req reserved; bool reserved_radar_required; bool reserved_ready; u8 needed_rx_chains; enum ieee80211_smps_mode smps_mode; int user_power_level; /* in dBm */ int ap_power_level; /* in dBm */ bool radar_required; struct wiphy_delayed_work dfs_cac_timer_work; union { struct ieee80211_link_data_managed mgd; struct ieee80211_link_data_ap ap; } u; struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS]; struct ieee80211_bss_conf *conf; #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfs_dir; #endif }; struct ieee80211_sub_if_data { struct list_head list; struct wireless_dev wdev; /* keys */ struct list_head key_list; /* count for keys needing tailroom space allocation */ int crypto_tx_tailroom_needed_cnt; int crypto_tx_tailroom_pending_dec; struct wiphy_delayed_work dec_tailroom_needed_wk; struct net_device *dev; struct ieee80211_local *local; unsigned int flags; unsigned long state; char name[IFNAMSIZ]; struct ieee80211_fragment_cache frags; /* TID bitmap for NoAck policy */ u16 noack_map; /* bit field of ACM bits (BIT(802.1D tag)) */ u8 wmm_acm; struct ieee80211_key __rcu *keys[NUM_DEFAULT_KEYS]; struct ieee80211_key __rcu *default_unicast_key; u16 sequence_number; u16 mld_mcast_seq; __be16 control_port_protocol; bool control_port_no_encrypt; bool control_port_no_preauth; bool control_port_over_nl80211; atomic_t num_tx_queued; struct mac80211_qos_map __rcu *qos_map; struct wiphy_work work; struct sk_buff_head skb_queue; struct sk_buff_head status_queue; /* * AP this belongs to: self in AP mode and * corresponding AP in VLAN mode, NULL for * all others (might be needed later in IBSS) */ struct ieee80211_if_ap *bss; /* bitmap of allowed (non-MCS) rate indexes for rate control */ u32 rc_rateidx_mask[NUM_NL80211_BANDS]; bool rc_has_mcs_mask[NUM_NL80211_BANDS]; u8 rc_rateidx_mcs_mask[NUM_NL80211_BANDS][IEEE80211_HT_MCS_MASK_LEN]; bool rc_has_vht_mcs_mask[NUM_NL80211_BANDS]; u16 rc_rateidx_vht_mcs_mask[NUM_NL80211_BANDS][NL80211_VHT_NSS_MAX]; /* Beacon frame (non-MCS) rate (as a bitmap) */ u32 beacon_rateidx_mask[NUM_NL80211_BANDS]; bool beacon_rate_set; union { struct ieee80211_if_ap ap; struct ieee80211_if_vlan vlan; struct ieee80211_if_managed mgd; struct ieee80211_if_ibss ibss; struct ieee80211_if_mesh mesh; struct ieee80211_if_ocb ocb; struct ieee80211_if_mntr mntr; struct ieee80211_if_nan nan; } u; struct ieee80211_link_data deflink; struct ieee80211_link_data __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; /* for ieee80211_set_active_links_async() */ struct wiphy_work activate_links_work; u16 desired_active_links; u16 restart_active_links; #ifdef CONFIG_MAC80211_DEBUGFS struct { struct dentry *subdir_stations; struct dentry *default_unicast_key; struct dentry *default_multicast_key; struct dentry *default_mgmt_key; struct dentry *default_beacon_key; } debugfs; #endif /* must be last, dynamically sized area in this! */ struct ieee80211_vif vif; }; static inline struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p) { return container_of(p, struct ieee80211_sub_if_data, vif); } #define sdata_dereference(p, sdata) \ wiphy_dereference(sdata->local->hw.wiphy, p) #define for_each_sdata_link(_local, _link) \ /* outer loop just to define the variables ... */ \ for (struct ieee80211_sub_if_data *___sdata = NULL; \ !___sdata; \ ___sdata = (void *)~0 /* always stop */) \ list_for_each_entry(___sdata, &(_local)->interfaces, list) \ if (ieee80211_sdata_running(___sdata)) \ for (int ___link_id = 0; \ ___link_id < ARRAY_SIZE(___sdata->link); \ ___link_id++) \ if ((_link = wiphy_dereference((_local)->hw.wiphy, \ ___sdata->link[___link_id]))) static inline int ieee80211_get_mbssid_beacon_len(struct cfg80211_mbssid_elems *elems, struct cfg80211_rnr_elems *rnr_elems, u8 i) { int len = 0; if (!elems || !elems->cnt || i > elems->cnt) return 0; if (i < elems->cnt) { len = elems->elem[i].len; if (rnr_elems) { len += rnr_elems->elem[i].len; for (i = elems->cnt; i < rnr_elems->cnt; i++) len += rnr_elems->elem[i].len; } return len; } /* i == elems->cnt, calculate total length of all MBSSID elements */ for (i = 0; i < elems->cnt; i++) len += elems->elem[i].len; if (rnr_elems) { for (i = 0; i < rnr_elems->cnt; i++) len += rnr_elems->elem[i].len; } return len; } enum { IEEE80211_RX_MSG = 1, IEEE80211_TX_STATUS_MSG = 2, }; enum queue_stop_reason { IEEE80211_QUEUE_STOP_REASON_DRIVER, IEEE80211_QUEUE_STOP_REASON_PS, IEEE80211_QUEUE_STOP_REASON_CSA, IEEE80211_QUEUE_STOP_REASON_AGGREGATION, IEEE80211_QUEUE_STOP_REASON_SUSPEND, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, IEEE80211_QUEUE_STOP_REASON_FLUSH, IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID, IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE, IEEE80211_QUEUE_STOP_REASONS, }; #ifdef CONFIG_MAC80211_LEDS struct tpt_led_trigger { char name[32]; const struct ieee80211_tpt_blink *blink_table; unsigned int blink_table_len; struct timer_list timer; struct ieee80211_local *local; unsigned long prev_traffic; unsigned long tx_bytes, rx_bytes; unsigned int active, want; bool running; }; #endif /** * enum mac80211_scan_flags - currently active scan mode * * @SCAN_SW_SCANNING: We're currently in the process of scanning but may as * well be on the operating channel * @SCAN_HW_SCANNING: The hardware is scanning for us, we have no way to * determine if we are on the operating channel or not * @SCAN_ONCHANNEL_SCANNING: Do a software scan on only the current operating * channel. This should not interrupt normal traffic. * @SCAN_COMPLETED: Set for our scan work function when the driver reported * that the scan completed. * @SCAN_ABORTED: Set for our scan work function when the driver reported * a scan complete for an aborted scan. * @SCAN_HW_CANCELLED: Set for our scan work function when the scan is being * cancelled. * @SCAN_BEACON_WAIT: Set whenever we're passive scanning because of radar/no-IR * and could send a probe request after receiving a beacon. * @SCAN_BEACON_DONE: Beacon received, we can now send a probe request */ enum mac80211_scan_flags { SCAN_SW_SCANNING, SCAN_HW_SCANNING, SCAN_ONCHANNEL_SCANNING, SCAN_COMPLETED, SCAN_ABORTED, SCAN_HW_CANCELLED, SCAN_BEACON_WAIT, SCAN_BEACON_DONE, }; /** * enum mac80211_scan_state - scan state machine states * * @SCAN_DECISION: Main entry point to the scan state machine, this state * determines if we should keep on scanning or switch back to the * operating channel * @SCAN_SET_CHANNEL: Set the next channel to be scanned * @SCAN_SEND_PROBE: Send probe requests and wait for probe responses * @SCAN_SUSPEND: Suspend the scan and go back to operating channel to * send out data * @SCAN_RESUME: Resume the scan and scan the next channel * @SCAN_ABORT: Abort the scan and go back to operating channel */ enum mac80211_scan_state { SCAN_DECISION, SCAN_SET_CHANNEL, SCAN_SEND_PROBE, SCAN_SUSPEND, SCAN_RESUME, SCAN_ABORT, }; DECLARE_STATIC_KEY_FALSE(aql_disable); struct ieee80211_local { /* embed the driver visible part. * don't cast (use the static inlines below), but we keep * it first anyway so they become a no-op */ struct ieee80211_hw hw; struct fq fq; struct codel_vars *cvars; struct codel_params cparams; /* protects active_txqs and txqi->schedule_order */ spinlock_t active_txq_lock[IEEE80211_NUM_ACS]; struct list_head active_txqs[IEEE80211_NUM_ACS]; u16 schedule_round[IEEE80211_NUM_ACS]; /* serializes ieee80211_handle_wake_tx_queue */ spinlock_t handle_wake_tx_queue_lock; u16 airtime_flags; u32 aql_txq_limit_low[IEEE80211_NUM_ACS]; u32 aql_txq_limit_high[IEEE80211_NUM_ACS]; u32 aql_threshold; atomic_t aql_total_pending_airtime; atomic_t aql_ac_pending_airtime[IEEE80211_NUM_ACS]; const struct ieee80211_ops *ops; /* * private workqueue to mac80211. mac80211 makes this accessible * via ieee80211_queue_work() */ struct workqueue_struct *workqueue; unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES]; int q_stop_reasons[IEEE80211_MAX_QUEUES][IEEE80211_QUEUE_STOP_REASONS]; /* also used to protect ampdu_ac_queue and amdpu_ac_stop_refcnt */ spinlock_t queue_stop_reason_lock; int open_count; int monitors, cooked_mntrs, tx_mntrs; /* number of interfaces with corresponding FIF_ flags */ int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, fif_probe_req; bool probe_req_reg; bool rx_mcast_action_reg; unsigned int filter_flags; /* FIF_* */ bool wiphy_ciphers_allocated; struct cfg80211_chan_def dflt_chandef; bool emulate_chanctx; /* protects the aggregated multicast list and filter calls */ spinlock_t filter_lock; /* used for uploading changed mc list */ struct wiphy_work reconfig_filter; /* aggregated multicast list */ struct netdev_hw_addr_list mc_list; bool tim_in_locked_section; /* see ieee80211_beacon_get() */ /* * suspended is true if we finished all the suspend _and_ we have * not yet come up from resume. This is to be used by mac80211 * to ensure driver sanity during suspend and mac80211's own * sanity. It can eventually be used for WoW as well. */ bool suspended; /* suspending is true during the whole suspend process */ bool suspending; /* * Resuming is true while suspended, but when we're reprogramming the * hardware -- at that time it's allowed to use ieee80211_queue_work() * again even though some other parts of the stack are still suspended * and we still drop received frames to avoid waking the stack. */ bool resuming; /* * quiescing is true during the suspend process _only_ to * ease timer cancelling etc. */ bool quiescing; /* device is started */ bool started; /* device is during a HW reconfig */ bool in_reconfig; /* reconfiguration failed ... suppress some warnings etc. */ bool reconfig_failure; /* wowlan is enabled -- don't reconfig on resume */ bool wowlan; struct wiphy_work radar_detected_work; /* number of RX chains the hardware has */ u8 rx_chains; /* bitmap of which sbands were copied */ u8 sband_allocated; int tx_headroom; /* required headroom for hardware/radiotap */ /* Tasklet and skb queue to process calls from IRQ mode. All frames * added to skb_queue will be processed, but frames in * skb_queue_unreliable may be dropped if the total length of these * queues increases over the limit. */ #define IEEE80211_IRQSAFE_QUEUE_LIMIT 128 struct tasklet_struct tasklet; struct sk_buff_head skb_queue; struct sk_buff_head skb_queue_unreliable; spinlock_t rx_path_lock; /* Station data */ /* * The list, hash table and counter are protected * by the wiphy mutex, reads are done with RCU. */ spinlock_t tim_lock; unsigned long num_sta; struct list_head sta_list; struct rhltable sta_hash; struct rhltable link_sta_hash; struct timer_list sta_cleanup; int sta_generation; struct sk_buff_head pending[IEEE80211_MAX_QUEUES]; struct tasklet_struct tx_pending_tasklet; struct tasklet_struct wake_txqs_tasklet; atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES]; /* number of interfaces with allmulti RX */ atomic_t iff_allmultis; struct rate_control_ref *rate_ctrl; struct arc4_ctx wep_tx_ctx; struct arc4_ctx wep_rx_ctx; u32 wep_iv; /* see iface.c */ struct list_head interfaces; struct list_head mon_list; /* only that are IFF_UP && !cooked */ struct mutex iflist_mtx; /* Scanning and BSS list */ unsigned long scanning; struct cfg80211_ssid scan_ssid; struct cfg80211_scan_request *int_scan_req; struct cfg80211_scan_request __rcu *scan_req; struct ieee80211_scan_request *hw_scan_req; struct cfg80211_chan_def scan_chandef; enum nl80211_band hw_scan_band; int scan_channel_idx; int scan_ies_len; int hw_scan_ies_bufsize; struct cfg80211_scan_info scan_info; struct wiphy_work sched_scan_stopped_work; struct ieee80211_sub_if_data __rcu *sched_scan_sdata; struct cfg80211_sched_scan_request __rcu *sched_scan_req; u8 scan_addr[ETH_ALEN]; unsigned long leave_oper_channel_time; enum mac80211_scan_state next_scan_state; struct wiphy_delayed_work scan_work; struct ieee80211_sub_if_data __rcu *scan_sdata; /* Temporary remain-on-channel for off-channel operations */ struct ieee80211_channel *tmp_channel; /* channel contexts */ struct list_head chanctx_list; #ifdef CONFIG_MAC80211_LEDS struct led_trigger tx_led, rx_led, assoc_led, radio_led; struct led_trigger tpt_led; atomic_t tx_led_active, rx_led_active, assoc_led_active; atomic_t radio_led_active, tpt_led_active; struct tpt_led_trigger *tpt_led_trigger; #endif #ifdef CONFIG_MAC80211_DEBUG_COUNTERS /* SNMP counters */ /* dot11CountersTable */ u32 dot11TransmittedFragmentCount; u32 dot11MulticastTransmittedFrameCount; u32 dot11FailedCount; u32 dot11RetryCount; u32 dot11MultipleRetryCount; u32 dot11FrameDuplicateCount; u32 dot11ReceivedFragmentCount; u32 dot11MulticastReceivedFrameCount; u32 dot11TransmittedFrameCount; /* TX/RX handler statistics */ unsigned int tx_handlers_drop; unsigned int tx_handlers_queued; unsigned int tx_handlers_drop_wep; unsigned int tx_handlers_drop_not_assoc; unsigned int tx_handlers_drop_unauth_port; unsigned int rx_handlers_drop; unsigned int rx_handlers_queued; unsigned int rx_handlers_drop_nullfunc; unsigned int rx_handlers_drop_defrag; unsigned int tx_expand_skb_head; unsigned int tx_expand_skb_head_cloned; unsigned int rx_expand_skb_head_defrag; unsigned int rx_handlers_fragments; unsigned int tx_status_drop; #define I802_DEBUG_INC(c) (c)++ #else /* CONFIG_MAC80211_DEBUG_COUNTERS */ #define I802_DEBUG_INC(c) do { } while (0) #endif /* CONFIG_MAC80211_DEBUG_COUNTERS */ int total_ps_buffered; /* total number of all buffered unicast and * multicast packets for power saving stations */ bool pspolling; /* * PS can only be enabled when we have exactly one managed * interface (and monitors) in PS, this then points there. */ struct ieee80211_sub_if_data *ps_sdata; struct wiphy_work dynamic_ps_enable_work; struct wiphy_work dynamic_ps_disable_work; struct timer_list dynamic_ps_timer; struct notifier_block ifa_notifier; struct notifier_block ifa6_notifier; /* * The dynamic ps timeout configured from user space via WEXT - * this will override whatever chosen by mac80211 internally. */ int dynamic_ps_forced_timeout; int user_power_level; /* in dBm, for all interfaces */ struct work_struct restart_work; #ifdef CONFIG_MAC80211_DEBUGFS struct local_debugfsdentries { struct dentry *rcdir; struct dentry *keys; } debugfs; bool force_tx_status; #endif /* * Remain-on-channel support */ struct wiphy_delayed_work roc_work; struct list_head roc_list; struct wiphy_work hw_roc_start, hw_roc_done; unsigned long hw_roc_start_time; u64 roc_cookie_counter; struct idr ack_status_frames; spinlock_t ack_status_lock; struct ieee80211_sub_if_data __rcu *p2p_sdata; /* virtual monitor interface */ struct ieee80211_sub_if_data __rcu *monitor_sdata; struct ieee80211_chan_req monitor_chanreq; /* extended capabilities provided by mac80211 */ u8 ext_capa[8]; bool wbrf_supported; }; static inline struct ieee80211_sub_if_data * IEEE80211_DEV_TO_SUB_IF(const struct net_device *dev) { return netdev_priv(dev); } static inline struct ieee80211_sub_if_data * IEEE80211_WDEV_TO_SUB_IF(struct wireless_dev *wdev) { return container_of(wdev, struct ieee80211_sub_if_data, wdev); } static inline struct ieee80211_supported_band * ieee80211_get_sband(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; enum nl80211_band band; WARN_ON(ieee80211_vif_is_mld(&sdata->vif)); rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); return NULL; } band = chanctx_conf->def.chan->band; rcu_read_unlock(); return local->hw.wiphy->bands[band]; } static inline struct ieee80211_supported_band * ieee80211_get_link_sband(struct ieee80211_link_data *link) { struct ieee80211_local *local = link->sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; enum nl80211_band band; rcu_read_lock(); chanctx_conf = rcu_dereference(link->conf->chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); return NULL; } band = chanctx_conf->def.chan->band; rcu_read_unlock(); return local->hw.wiphy->bands[band]; } /* this struct holds the value parsing from channel switch IE */ struct ieee80211_csa_ie { struct ieee80211_chan_req chanreq; u8 mode; u8 count; u8 ttl; u16 pre_value; u16 reason_code; u32 max_switch_time; }; enum ieee80211_elems_parse_error { IEEE80211_PARSE_ERR_INVALID_END = BIT(0), IEEE80211_PARSE_ERR_DUP_ELEM = BIT(1), IEEE80211_PARSE_ERR_BAD_ELEM_SIZE = BIT(2), IEEE80211_PARSE_ERR_UNEXPECTED_ELEM = BIT(3), IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC = BIT(4), }; /* Parsed Information Elements */ struct ieee802_11_elems { const u8 *ie_start; size_t total_len; u32 crc; /* pointers to IEs */ const struct ieee80211_tdls_lnkie *lnk_id; const struct ieee80211_ch_switch_timing *ch_sw_timing; const u8 *ext_capab; const u8 *ssid; const u8 *supp_rates; const u8 *ds_params; const struct ieee80211_tim_ie *tim; const u8 *rsn; const u8 *rsnx; const u8 *erp_info; const u8 *ext_supp_rates; const u8 *wmm_info; const u8 *wmm_param; const struct ieee80211_ht_cap *ht_cap_elem; const struct ieee80211_ht_operation *ht_operation; const struct ieee80211_vht_cap *vht_cap_elem; const struct ieee80211_vht_operation *vht_operation; const struct ieee80211_meshconf_ie *mesh_config; const u8 *he_cap; const struct ieee80211_he_operation *he_operation; const struct ieee80211_he_spr *he_spr; const struct ieee80211_mu_edca_param_set *mu_edca_param_set; const struct ieee80211_he_6ghz_capa *he_6ghz_capa; const u8 *uora_element; const u8 *mesh_id; const u8 *peering; const __le16 *awake_window; const u8 *preq; const u8 *prep; const u8 *perr; const struct ieee80211_rann_ie *rann; const struct ieee80211_channel_sw_ie *ch_switch_ie; const struct ieee80211_ext_chansw_ie *ext_chansw_ie; const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; const u8 *max_channel_switch_time; const u8 *country_elem; const u8 *pwr_constr_elem; const u8 *cisco_dtpc_elem; const struct ieee80211_timeout_interval_ie *timeout_int; const u8 *opmode_notif; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; struct ieee80211_mesh_chansw_params_ie *mesh_chansw_params_ie; const struct ieee80211_bss_max_idle_period_ie *max_idle_period_ie; const struct ieee80211_multiple_bssid_configuration *mbssid_config_ie; const struct ieee80211_bssid_index *bssid_index; u8 max_bssid_indicator; u8 dtim_count; u8 dtim_period; const struct ieee80211_addba_ext_ie *addba_ext_ie; const struct ieee80211_s1g_cap *s1g_capab; const struct ieee80211_s1g_oper_ie *s1g_oper; const struct ieee80211_s1g_bcn_compat_ie *s1g_bcn_compat; const struct ieee80211_aid_response_ie *aid_resp; const struct ieee80211_eht_cap_elem *eht_cap; const struct ieee80211_eht_operation *eht_operation; const struct ieee80211_multi_link_elem *ml_basic; const struct ieee80211_multi_link_elem *ml_reconf; const struct ieee80211_multi_link_elem *ml_epcs; const struct ieee80211_bandwidth_indication *bandwidth_indication; const struct ieee80211_ttlm_elem *ttlm[IEEE80211_TTLM_MAX_CNT]; /* not the order in the psd values is per element, not per chandef */ struct ieee80211_parsed_tpe tpe; struct ieee80211_parsed_tpe csa_tpe; /* length of them, respectively */ u8 ext_capab_len; u8 ssid_len; u8 supp_rates_len; u8 tim_len; u8 rsn_len; u8 rsnx_len; u8 ext_supp_rates_len; u8 wmm_info_len; u8 wmm_param_len; u8 he_cap_len; u8 mesh_id_len; u8 peering_len; u8 preq_len; u8 prep_len; u8 perr_len; u8 country_elem_len; u8 bssid_index_len; u8 eht_cap_len; /* mult-link element can be de-fragmented and thus u8 is not sufficient */ size_t ml_basic_len; size_t ml_reconf_len; size_t ml_epcs_len; u8 ttlm_num; /* * store the per station profile pointer and length in case that the * parsing also handled Multi-Link element parsing for a specific link * ID. */ struct ieee80211_mle_per_sta_profile *prof; size_t sta_prof_len; /* whether/which parse error occurred while retrieving these elements */ u8 parse_error; }; static inline struct ieee80211_local *hw_to_local( struct ieee80211_hw *hw) { return container_of(hw, struct ieee80211_local, hw); } static inline struct txq_info *to_txq_info(struct ieee80211_txq *txq) { return container_of(txq, struct txq_info, txq); } static inline bool txq_has_queue(struct ieee80211_txq *txq) { struct txq_info *txqi = to_txq_info(txq); return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets); } static inline bool ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status) { return status->flag & RX_FLAG_MACTIME; } void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_block_queues_csa(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_unblock_queues_csa(struct ieee80211_sub_if_data *sdata); /* This function returns the number of multicast stations connected to this * interface. It returns -1 if that number is not tracked, that is for netdevs * not in AP or AP_VLAN mode or when using 4addr. */ static inline int ieee80211_vif_get_num_mcast_if(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.type == NL80211_IFTYPE_AP) return atomic_read(&sdata->u.ap.num_mcast_sta); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && !sdata->u.vlan.sta) return atomic_read(&sdata->u.vlan.num_mcast_sta); return -1; } u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, struct ieee80211_rx_status *status, unsigned int mpdu_len, unsigned int mpdu_offset); int ieee80211_hw_config(struct ieee80211_local *local, u32 changed); int ieee80211_hw_conf_chan(struct ieee80211_local *local); void ieee80211_hw_conf_init(struct ieee80211_local *local); void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx); void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, u64 changed); void ieee80211_vif_cfg_change_notify(struct ieee80211_sub_if_data *sdata, u64 changed); void ieee80211_link_info_change_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, u64 changed); void ieee80211_configure_filter(struct ieee80211_local *local); u64 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata); void ieee80211_handle_queued_frames(struct ieee80211_local *local); u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local); int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, u64 *cookie, gfp_t gfp); void ieee80211_check_fast_rx(struct sta_info *sta); void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_rx(struct sta_info *sta); bool ieee80211_is_our_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr, int *out_link_id); /* STA code */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata); int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, struct cfg80211_auth_request *req); int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_request *req); int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, struct cfg80211_deauth_request *req); int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_disassoc_request *req); void ieee80211_send_pspoll(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_recalc_ps(struct ieee80211_local *local); void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata); void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata); void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata, __le16 fc, bool acked); void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, u8 reason, bool tx); void ieee80211_mgd_setup_link(struct ieee80211_link_data *link); void ieee80211_mgd_stop_link(struct ieee80211_link_data *link); void ieee80211_mgd_set_link_qos_params(struct ieee80211_link_data *link); /* IBSS code */ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local); void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata); void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const u8 *addr, u32 supp_rates); int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, struct cfg80211_ibss_params *params); int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata); void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata); void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings, u64 *changed); int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed); void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata); /* OCB code */ void ieee80211_ocb_work(struct ieee80211_sub_if_data *sdata); void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const u8 *addr, u32 supp_rates); void ieee80211_ocb_setup_sdata(struct ieee80211_sub_if_data *sdata); int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata, struct ocb_setup *setup); int ieee80211_ocb_leave(struct ieee80211_sub_if_data *sdata); /* mesh code */ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata); void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings, u64 *changed); int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed); /* scan/BSS handling */ void ieee80211_scan_work(struct wiphy *wiphy, struct wiphy_work *work); int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, const u8 *ssid, u8 ssid_len, struct ieee80211_channel **channels, unsigned int n_channels); int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req); void ieee80211_scan_cancel(struct ieee80211_local *local); void ieee80211_run_deferred_scan(struct ieee80211_local *local); void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb); void ieee80211_inform_bss(struct wiphy *wiphy, struct cfg80211_bss *bss, const struct cfg80211_bss_ies *ies, void *data); void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local); struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_channel *channel); void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss); /* scheduled scan handling */ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req); int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req); int ieee80211_request_sched_scan_stop(struct ieee80211_local *local); void ieee80211_sched_scan_end(struct ieee80211_local *local); void ieee80211_sched_scan_stopped_work(struct wiphy *wiphy, struct wiphy_work *work); /* off-channel/mgmt-tx */ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local); void ieee80211_offchannel_return(struct ieee80211_local *local); void ieee80211_roc_setup(struct ieee80211_local *local); void ieee80211_start_next_roc(struct ieee80211_local *local); void ieee80211_reconfig_roc(struct ieee80211_local *local); void ieee80211_roc_purge(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, struct ieee80211_channel *chan, unsigned int duration, u64 *cookie); int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie); int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie); int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie); /* channel switch handling */ void ieee80211_csa_finalize_work(struct wiphy *wiphy, struct wiphy_work *work); int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params); /* color change handling */ void ieee80211_color_change_finalize_work(struct wiphy *wiphy, struct wiphy_work *work); void ieee80211_color_collision_detection_work(struct wiphy *wiphy, struct wiphy_work *work); /* interface handling */ #define MAC80211_SUPPORTED_FEATURES_TX (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \ NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_TC) #define MAC80211_SUPPORTED_FEATURES_RX (NETIF_F_RXCSUM) #define MAC80211_SUPPORTED_FEATURES (MAC80211_SUPPORTED_FEATURES_TX | \ MAC80211_SUPPORTED_FEATURES_RX) int ieee80211_iface_init(void); void ieee80211_iface_exit(void); int ieee80211_if_add(struct ieee80211_local *local, const char *name, unsigned char name_assign_type, struct wireless_dev **new_wdev, enum nl80211_iftype type, struct vif_params *params); int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type); void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata); void ieee80211_remove_interfaces(struct ieee80211_local *local); u32 ieee80211_idle_off(struct ieee80211_local *local); void ieee80211_recalc_idle(struct ieee80211_local *local); void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, const int offset); int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up); void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata); int ieee80211_add_virtual_monitor(struct ieee80211_local *local); void ieee80211_del_virtual_monitor(struct ieee80211_local *local); bool __ieee80211_recalc_txpower(struct ieee80211_link_data *link); void ieee80211_recalc_txpower(struct ieee80211_link_data *link, bool update_bss); void ieee80211_recalc_offload(struct ieee80211_local *local); static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata) { return test_bit(SDATA_STATE_RUNNING, &sdata->state); } /* link handling */ void ieee80211_link_setup(struct ieee80211_link_data *link); void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, int link_id, struct ieee80211_link_data *link, struct ieee80211_bss_conf *link_conf); void ieee80211_link_stop(struct ieee80211_link_data *link); int ieee80211_vif_set_links(struct ieee80211_sub_if_data *sdata, u16 new_links, u16 dormant_links); static inline void ieee80211_vif_clear_links(struct ieee80211_sub_if_data *sdata) { ieee80211_vif_set_links(sdata, 0, 0); } /* tx handling */ void ieee80211_clear_tx_pending(struct ieee80211_local *local); void ieee80211_tx_pending(struct tasklet_struct *t); netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct net_device *dev); void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, u32 ctrl_flags, u64 *cookie); struct sk_buff * ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags); void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, int retry_count, bool send_to_cooked, struct ieee80211_tx_status *status); void ieee80211_check_fast_xmit(struct sta_info *sta); void ieee80211_check_fast_xmit_all(struct ieee80211_local *local); void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_xmit(struct sta_info *sta); int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, __be16 proto, bool unencrypted, int link_id, u64 *cookie); int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); void __ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_fast_tx *fast_tx, struct sk_buff *skb, bool ampdu, const u8 *da, const u8 *sa); void ieee80211_aggr_check(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb); /* HT */ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_ht_cap *ht_cap); bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_ht_cap *ht_cap_ie, struct link_sta_info *link_sta); void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, u16 initiator, u16 reason_code); int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps, const u8 *da, const u8 *bssid, int link_id); void ieee80211_add_addbaext(struct sk_buff *skb, const u8 req_addba_ext_data, u16 buf_size); u8 ieee80211_retrieve_addba_ext_data(struct sta_info *sta, const void *elem_data, ssize_t elem_len, u16 *buf_size); void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, u16 initiator, u16 reason, bool stop); void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, bool auto_seq, const u8 addba_ext_data); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len); void ieee80211_process_addba_resp(struct ieee80211_local *local, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len); void ieee80211_process_addba_request(struct ieee80211_local *local, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len); static inline struct ieee80211_mgmt * ieee80211_mgmt_ba(struct sk_buff *skb, const u8 *da, struct ieee80211_sub_if_data *sdata) { struct ieee80211_mgmt *mgmt = skb_put_zero(skb, 24); ether_addr_copy(mgmt->da, da); ether_addr_copy(mgmt->sa, sdata->vif.addr); if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN || sdata->vif.type == NL80211_IFTYPE_MESH_POINT) ether_addr_copy(mgmt->bssid, sdata->vif.addr); else if (sdata->vif.type == NL80211_IFTYPE_STATION) ether_addr_copy(mgmt->bssid, sdata->vif.cfg.ap_addr); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) ether_addr_copy(mgmt->bssid, sdata->u.ibss.bssid); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); return mgmt; } int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, enum ieee80211_agg_stop_reason reason); void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid, struct tid_ampdu_tx *tid_tx); void ieee80211_stop_tx_ba_cb(struct sta_info *sta, int tid, struct tid_ampdu_tx *tid_tx); void ieee80211_ba_session_work(struct wiphy *wiphy, struct wiphy_work *work); void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid); void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid); u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs); enum nl80211_smps_mode ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps); /* VHT */ void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_vht_cap *vht_cap_ie, const struct ieee80211_vht_cap *vht_cap_ie2, struct link_sta_info *link_sta); enum ieee80211_sta_rx_bandwidth _ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta, struct cfg80211_chan_def *chandef); static inline enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta) { return _ieee80211_sta_cap_rx_bw(link_sta, NULL); } enum ieee80211_sta_rx_bandwidth _ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta, struct cfg80211_chan_def *chandef); static inline enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta) { return _ieee80211_sta_cur_vht_bw(link_sta, NULL); } void ieee80211_sta_init_nss(struct link_sta_info *link_sta); enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct link_sta_info *link_sta); void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ieee80211_mgmt *mgmt); u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct link_sta_info *sta, u8 opmode, enum nl80211_band band); void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct link_sta_info *sta, u8 opmode, enum nl80211_band band); void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap); void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, u16 vht_mask[NL80211_VHT_NSS_MAX]); enum nl80211_chan_width ieee80211_sta_rx_bw_to_chan_width(struct link_sta_info *sta); /* HE */ void ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, const struct ieee80211_he_6ghz_capa *he_6ghz_capa, struct link_sta_info *link_sta); void ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, const struct ieee80211_he_spr *he_spr_ie_elem); void ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, const struct ieee80211_he_operation *he_op_ie_elem); /* S1G */ void ieee80211_s1g_sta_rate_init(struct sta_info *sta); bool ieee80211_s1g_is_twt_setup(struct sk_buff *skb); void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); /** * ieee80211_parse_ch_switch_ie - parses channel switch IEs * @sdata: the sdata of the interface which has received the frame * @elems: parsed 802.11 elements received with the frame * @current_band: indicates the current band * @vht_cap_info: VHT capabilities of the transmitter * @conn: contains information about own capabilities and restrictions * to decide which channel switch announcements can be accepted * @bssid: the currently connected bssid (for reporting) * @unprot_action: whether the frame was an unprotected frame or not, * used for reporting * @csa_ie: parsed 802.11 csa elements on count, mode, chandef and mesh ttl. * All of them will be filled with if success only. * Return: 0 on success, <0 on error and >0 if there is nothing to parse. */ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, u32 vht_cap_info, struct ieee80211_conn_settings *conn, u8 *bssid, bool unprot_action, struct ieee80211_csa_ie *csa_ie); /* Suspend/resume and hw reconfiguration */ int ieee80211_reconfig(struct ieee80211_local *local); void ieee80211_stop_device(struct ieee80211_local *local, bool suspend); int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan); static inline int __ieee80211_resume(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); WARN(test_bit(SCAN_HW_SCANNING, &local->scanning) && !test_bit(SCAN_COMPLETED, &local->scanning), "%s: resume with hardware scan still in progress\n", wiphy_name(hw->wiphy)); return ieee80211_reconfig(hw_to_local(hw)); } /* utility functions/constants */ extern const void *const mac80211_wiphy_privid; /* for wiphy privid */ const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode); enum ieee80211_conn_bw_limit ieee80211_min_bw_limit_from_chandef(struct cfg80211_chan_def *chandef); int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble); void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_queue_params *qparam, int ac); void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe); void ieee80211_set_wmm_default(struct ieee80211_link_data *link, bool bss_notify, bool enable_qos); void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, int link_id, enum nl80211_band band); /* sta_out needs to be checked for ERR_PTR() before using */ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct sta_info **sta_out); static inline void ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, enum nl80211_band band) { rcu_read_lock(); __ieee80211_tx_skb_tid_band(sdata, skb, tid, -1, band); rcu_read_unlock(); } void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, int link_id); static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { /* Send all internal mgmt frames on VO. Accordingly set TID to 7. */ ieee80211_tx_skb_tid(sdata, skb, 7, -1); } /** * struct ieee80211_elems_parse_params - element parsing parameters * @mode: connection mode for parsing * @start: pointer to the elements * @len: length of the elements * @action: %true if the elements came from an action frame * @filter: bitmap of element IDs to filter out while calculating * the element CRC * @crc: CRC starting value * @bss: the BSS to parse this as, for multi-BSSID cases this can * represent a non-transmitting BSS in which case the data * for that non-transmitting BSS is returned * @link_id: the link ID to parse elements for, if a STA profile * is present in the multi-link element, or -1 to ignore; * note that the code currently assumes parsing an association * (or re-association) response frame if this is given * @from_ap: frame is received from an AP (currently used only * for EHT capabilities parsing) */ struct ieee80211_elems_parse_params { enum ieee80211_conn_mode mode; const u8 *start; size_t len; bool action; u64 filter; u32 crc; struct cfg80211_bss *bss; int link_id; bool from_ap; }; struct ieee802_11_elems * ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params); static inline struct ieee802_11_elems * ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, u64 filter, u32 crc, struct cfg80211_bss *bss) { struct ieee80211_elems_parse_params params = { .mode = IEEE80211_CONN_MODE_HIGHEST, .start = start, .len = len, .action = action, .filter = filter, .crc = crc, .bss = bss, .link_id = -1, }; return ieee802_11_parse_elems_full(&params); } static inline struct ieee802_11_elems * ieee802_11_parse_elems(const u8 *start, size_t len, bool action, struct cfg80211_bss *bss) { return ieee802_11_parse_elems_crc(start, len, action, 0, 0, bss); } extern const int ieee802_1d_to_ac[8]; static inline int ieee80211_ac_from_tid(int tid) { return ieee802_1d_to_ac[tid & 7]; } void ieee80211_dynamic_ps_enable_work(struct wiphy *wiphy, struct wiphy_work *work); void ieee80211_dynamic_ps_disable_work(struct wiphy *wiphy, struct wiphy_work *work); void ieee80211_dynamic_ps_timer(struct timer_list *t); void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool powersave); void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr, bool ack, u16 tx_time); unsigned int ieee80211_get_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted); void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted); void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted); void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted); static inline void ieee80211_stop_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum queue_stop_reason reason) { ieee80211_stop_queues_by_reason(&local->hw, ieee80211_get_vif_queues(local, sdata), reason, true); } static inline void ieee80211_wake_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum queue_stop_reason reason) { ieee80211_wake_queues_by_reason(&local->hw, ieee80211_get_vif_queues(local, sdata), reason, true); } static inline void ieee80211_stop_vif_queues_norefcount(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum queue_stop_reason reason) { ieee80211_stop_queues_by_reason(&local->hw, ieee80211_get_vif_queues(local, sdata), reason, false); } static inline void ieee80211_wake_vif_queues_norefcount(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum queue_stop_reason reason) { ieee80211_wake_queues_by_reason(&local->hw, ieee80211_get_vif_queues(local, sdata), reason, false); } void ieee80211_add_pending_skb(struct ieee80211_local *local, struct sk_buff *skb); void ieee80211_add_pending_skbs(struct ieee80211_local *local, struct sk_buff_head *skbs); void ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool drop); void __ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int queues, bool drop); static inline bool ieee80211_can_run_worker(struct ieee80211_local *local) { /* * It's unsafe to try to do any work during reconfigure flow. * When the flow ends the work will be requeued. */ if (local->in_reconfig) return false; /* * If quiescing is set, we are racing with __ieee80211_suspend. * __ieee80211_suspend flushes the workers after setting quiescing, * and we check quiescing / suspended before enqueuing new workers. * We should abort the worker to avoid the races below. */ if (local->quiescing) return false; /* * We might already be suspended if the following scenario occurs: * __ieee80211_suspend Control path * * if (local->quiescing) * return; * local->quiescing = true; * flush_workqueue(); * queue_work(...); * local->suspended = true; * local->quiescing = false; * worker starts running... */ if (local->suspended) return false; return true; } int ieee80211_txq_setup_flows(struct ieee80211_local *local); void ieee80211_txq_set_params(struct ieee80211_local *local); void ieee80211_txq_teardown_flows(struct ieee80211_local *local); void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct txq_info *txq, int tid); void ieee80211_txq_purge(struct ieee80211_local *local, struct txq_info *txqi); void ieee80211_purge_sta_txqs(struct sta_info *sta); void ieee80211_txq_remove_vlan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats, struct txq_info *txqi); void ieee80211_wake_txqs(struct tasklet_struct *t); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *bssid, const u8 *da, const u8 *key, u8 key_len, u8 key_idx, u32 tx_flags); void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *da, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf); enum { IEEE80211_PROBE_FLAG_DIRECTED = BIT(0), IEEE80211_PROBE_FLAG_MIN_CONTENT = BIT(1), IEEE80211_PROBE_FLAG_RANDOM_SN = BIT(2), }; int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, struct cfg80211_chan_def *chandef, u32 flags); struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, u32 ratemask, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 flags); u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates); int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, enum ieee80211_smps_mode smps_mode); void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link); void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata, int link_id); size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset); u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, u16 cap); u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode); void ieee80211_ie_build_wide_bw_cs(u8 *pos, const struct cfg80211_chan_def *chandef); u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap); u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef); u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata); u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef); u8 *ieee80211_ie_build_eht_oper(u8 *pos, const struct cfg80211_chan_def *chandef, const struct ieee80211_sta_eht_cap *eht_cap); int ieee80211_parse_bitrates(enum nl80211_chan_width width, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_s1g_cap *caps, struct sk_buff *skb); void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); /* element building in SKBs */ int ieee80211_put_srates_elem(struct sk_buff *skb, const struct ieee80211_supported_band *sband, u32 basic_rates, u32 rate_flags, u32 masked_rates, u8 element_id); int ieee80211_put_he_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn); int ieee80211_put_he_6ghz_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps_mode); int ieee80211_put_eht_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn); /* channel management */ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef); bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef); void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info, struct cfg80211_chan_def *chandef); bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, struct cfg80211_chan_def *chandef); bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef); void ieee80211_chandef_downgrade(struct cfg80211_chan_def *chandef, struct ieee80211_conn_settings *conn); static inline void ieee80211_chanreq_downgrade(struct ieee80211_chan_req *chanreq, struct ieee80211_conn_settings *conn) { ieee80211_chandef_downgrade(&chanreq->oper, conn); if (WARN_ON(!conn)) return; if (conn->mode < IEEE80211_CONN_MODE_EHT) chanreq->ap.chan = NULL; } bool ieee80211_chanreq_identical(const struct ieee80211_chan_req *a, const struct ieee80211_chan_req *b); int __must_check _ieee80211_link_use_channel(struct ieee80211_link_data *link, const struct ieee80211_chan_req *req, enum ieee80211_chanctx_mode mode, bool assign_on_failure); static inline int __must_check ieee80211_link_use_channel(struct ieee80211_link_data *link, const struct ieee80211_chan_req *req, enum ieee80211_chanctx_mode mode) { return _ieee80211_link_use_channel(link, req, mode, false); } int __must_check ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, const struct ieee80211_chan_req *req, enum ieee80211_chanctx_mode mode, bool radar_required); int __must_check ieee80211_link_use_reserved_context(struct ieee80211_link_data *link); int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link); int __must_check ieee80211_link_change_chanreq(struct ieee80211_link_data *link, const struct ieee80211_chan_req *req, u64 *changed); void __ieee80211_link_release_channel(struct ieee80211_link_data *link, bool skip_idle_recalc); void ieee80211_link_release_channel(struct ieee80211_link_data *link); void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link); void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link, bool clear); int ieee80211_chanctx_refcount(struct ieee80211_local *local, struct ieee80211_chanctx *ctx); void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx); void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_link_data *rsvd_for, bool check_reserved); bool ieee80211_is_radar_required(struct ieee80211_local *local); void ieee80211_dfs_cac_timer_work(struct wiphy *wiphy, struct wiphy_work *work); void ieee80211_dfs_cac_cancel(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx); void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy, struct wiphy_work *work); int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings); void ieee80211_recalc_dtim(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode chanmode, u8 radar_detect, int radio_idx); int ieee80211_max_num_channels(struct ieee80211_local *local, int radio_idx); u32 ieee80211_get_radio_mask(struct wiphy *wiphy, struct net_device *dev); void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, struct ieee80211_chanctx *ctx); /* TDLS */ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, int link_id, u8 action_code, u8 dialog_token, u16 status_code, u32 peer_capability, bool initiator, const u8 *extra_ies, size_t extra_ies_len); int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper); void ieee80211_tdls_peer_del_work(struct wiphy *wiphy, struct wiphy_work *wk); int ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev, const u8 *addr, u8 oper_class, struct cfg80211_chan_def *chandef); void ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy, struct net_device *dev, const u8 *addr); void ieee80211_teardown_tdls_peers(struct ieee80211_link_data *link); void ieee80211_tdls_handle_disconnect(struct ieee80211_sub_if_data *sdata, const u8 *peer, u16 reason); void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); const char *ieee80211_get_reason_code_string(u16 reason_code); u16 ieee80211_encode_usf(int val); u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, enum nl80211_iftype type); extern const struct ethtool_ops ieee80211_ethtool_ops; u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *pubsta, int len, bool ampdu); #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline #else #define debug_noinline #endif void ieee80211_init_frag_cache(struct ieee80211_fragment_cache *cache); void ieee80211_destroy_frag_cache(struct ieee80211_fragment_cache *cache); u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata); void ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, const struct ieee80211_eht_cap_elem *eht_cap_ie_elem, u8 eht_cap_len, struct link_sta_info *link_sta); void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); int ieee80211_req_neg_ttlm(struct ieee80211_sub_if_data *sdata, struct cfg80211_ttlm_params *params); void ieee80211_check_wbrf_support(struct ieee80211_local *local); void ieee80211_add_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef); void ieee80211_remove_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef); int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_link *add_links, u16 rem_links); void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); #if IS_ENABLED(CONFIG_MAC80211_KUNIT_TEST) #define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym) #define VISIBLE_IF_MAC80211_KUNIT ieee80211_rx_result ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx); int ieee80211_calc_chandef_subchan_offset(const struct cfg80211_chan_def *ap, u8 n_partial_subchans); void ieee80211_rearrange_tpe_psd(struct ieee80211_parsed_tpe_psd *psd, const struct cfg80211_chan_def *ap, const struct cfg80211_chan_def *used); #else #define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) #define VISIBLE_IF_MAC80211_KUNIT static #endif #endif /* IEEE80211_I_H */
1035 1036 1035 456 730 1035 1035 1035 1034 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 // SPDX-License-Identifier: GPL-2.0 /* * jump label x86 support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> * */ #include <linux/jump_label.h> #include <linux/memory.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/cpu.h> #include <asm/kprobes.h> #include <asm/alternative.h> #include <asm/text-patching.h> #include <asm/insn.h> int arch_jump_entry_size(struct jump_entry *entry) { struct insn insn = {}; insn_decode_kernel(&insn, (void *)jump_entry_code(entry)); BUG_ON(insn.length != 2 && insn.length != 5); return insn.length; } struct jump_label_patch { const void *code; int size; }; static struct jump_label_patch __jump_label_patch(struct jump_entry *entry, enum jump_label_type type) { const void *expect, *code, *nop; const void *addr, *dest; int size; addr = (void *)jump_entry_code(entry); dest = (void *)jump_entry_target(entry); size = arch_jump_entry_size(entry); switch (size) { case JMP8_INSN_SIZE: code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest); nop = x86_nops[size]; break; case JMP32_INSN_SIZE: code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); nop = x86_nops[size]; break; default: BUG(); } if (type == JUMP_LABEL_JMP) expect = nop; else expect = code; if (memcmp(addr, expect, size)) { /* * The location is not an op that we were expecting. * Something went wrong. Crash the box, as something could be * corrupting the kernel. */ pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) size:%d type:%d\n", addr, addr, addr, expect, size, type); BUG(); } if (type == JUMP_LABEL_NOP) code = nop; return (struct jump_label_patch){.code = code, .size = size}; } static __always_inline void __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { const struct jump_label_patch jlp = __jump_label_patch(entry, type); /* * As long as only a single processor is running and the code is still * not marked as RO, text_poke_early() can be used; Checking that * system_state is SYSTEM_BOOTING guarantees it. It will be set to * SYSTEM_SCHEDULING before other cores are awaken and before the * code is write-protected. * * At the time the change is being done, just ignore whether we * are doing nop -> jump or jump -> nop transition, and assume * always nop being the 'currently valid' instruction */ if (init || system_state == SYSTEM_BOOTING) { text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size); return; } text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); } static void __ref jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { mutex_lock(&text_mutex); __jump_label_transform(entry, type, init); mutex_unlock(&text_mutex); } void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { jump_label_transform(entry, type, 0); } bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) { struct jump_label_patch jlp; if (system_state == SYSTEM_BOOTING) { /* * Fallback to the non-batching mode. */ arch_jump_label_transform(entry, type); return true; } mutex_lock(&text_mutex); jlp = __jump_label_patch(entry, type); text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); mutex_unlock(&text_mutex); return true; } void arch_jump_label_transform_apply(void) { mutex_lock(&text_mutex); text_poke_finish(); mutex_unlock(&text_mutex); }
2 5 3 2 1 1 1 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 /* * net/tipc/net.c: TIPC network routing code * * Copyright (c) 1995-2006, 2014, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "net.h" #include "name_distr.h" #include "subscr.h" #include "socket.h" #include "node.h" #include "bcast.h" #include "link.h" #include "netlink.h" #include "monitor.h" /* * The TIPC locking policy is designed to ensure a very fine locking * granularity, permitting complete parallel access to individual * port and node/link instances. The code consists of four major * locking domains, each protected with their own disjunct set of locks. * * 1: The bearer level. * RTNL lock is used to serialize the process of configuring bearer * on update side, and RCU lock is applied on read side to make * bearer instance valid on both paths of message transmission and * reception. * * 2: The node and link level. * All node instances are saved into two tipc_node_list and node_htable * lists. The two lists are protected by node_list_lock on write side, * and they are guarded with RCU lock on read side. Especially node * instance is destroyed only when TIPC module is removed, and we can * confirm that there has no any user who is accessing the node at the * moment. Therefore, Except for iterating the two lists within RCU * protection, it's no needed to hold RCU that we access node instance * in other places. * * In addition, all members in node structure including link instances * are protected by node spin lock. * * 3: The transport level of the protocol. * This consists of the structures port, (and its user level * representations, such as user_port and tipc_sock), reference and * tipc_user (port.c, reg.c, socket.c). * * This layer has four different locks: * - The tipc_port spin_lock. This is protecting each port instance * from parallel data access and removal. Since we can not place * this lock in the port itself, it has been placed in the * corresponding reference table entry, which has the same life * cycle as the module. This entry is difficult to access from * outside the TIPC core, however, so a pointer to the lock has * been added in the port instance, -to be used for unlocking * only. * - A read/write lock to protect the reference table itself (teg.c). * (Nobody is using read-only access to this, so it can just as * well be changed to a spin_lock) * - A spin lock to protect the registry of kernel/driver users (reg.c) * - A global spin_lock (tipc_port_lock), which only task is to ensure * consistency where more than one port is involved in an operation, * i.e., when a port is part of a linked list of ports. * There are two such lists; 'port_list', which is used for management, * and 'wait_list', which is used to queue ports during congestion. * * 4: The name table (name_table.c, name_distr.c, subscription.c) * - There is one big read/write-lock (tipc_nametbl_lock) protecting the * overall name table structure. Nothing must be added/removed to * this structure without holding write access to it. * - There is one local spin_lock per sub_sequence, which can be seen * as a sub-domain to the tipc_nametbl_lock domain. It is used only * for translation operations, and is needed because a translation * steps the root of the 'publication' linked list between each lookup. * This is always used within the scope of a tipc_nametbl_lock(read). * - A local spin_lock protecting the queue of subscriber events. */ static void tipc_net_finalize(struct net *net, u32 addr); int tipc_net_init(struct net *net, u8 *node_id, u32 addr) { if (tipc_own_id(net)) { pr_info("Cannot configure node identity twice\n"); return -1; } pr_info("Started in network mode\n"); if (node_id) tipc_set_node_id(net, node_id); if (addr) tipc_net_finalize(net, addr); return 0; } static void tipc_net_finalize(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_socket_addr sk = {0, addr}; struct tipc_uaddr ua; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_CLUSTER_SCOPE, TIPC_NODE_STATE, addr, addr); if (cmpxchg(&tn->node_addr, 0, addr)) return; tipc_set_node_addr(net, addr); tipc_named_reinit(net); tipc_sk_reinit(net); tipc_mon_reinit_self(net); tipc_nametbl_publish(net, &ua, &sk, addr); } void tipc_net_finalize_work(struct work_struct *work) { struct tipc_net *tn = container_of(work, struct tipc_net, work); tipc_net_finalize(tipc_link_net(tn->bcl), tn->trial_addr); } void tipc_net_stop(struct net *net) { if (!tipc_own_id(net)) return; rtnl_lock(); tipc_bearer_stop(net); tipc_node_stop(net); rtnl_unlock(); pr_info("Left network mode\n"); } static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg) { struct tipc_net *tn = net_generic(net, tipc_net_id); u64 *w0 = (u64 *)&tn->node_id[0]; u64 *w1 = (u64 *)&tn->node_id[8]; struct nlattr *attrs; void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NET_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NET); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_NET_ID, tn->net_id)) goto attr_msg_full; if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID, *w0, 0)) goto attr_msg_full; if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID_W1, *w1, 0)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int err; int done = cb->args[0]; struct tipc_nl_msg msg; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; err = __tipc_nl_add_net(net, &msg); if (err) goto out; done = 1; out: cb->args[0] = done; return skb->len; } int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); int err; if (!info->attrs[TIPC_NLA_NET]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); if (err) return err; /* Can't change net id once TIPC has joined a network */ if (tipc_own_addr(net)) return -EPERM; if (attrs[TIPC_NLA_NET_ID]) { u32 val; val = nla_get_u32(attrs[TIPC_NLA_NET_ID]); if (val < 1 || val > 9999) return -EINVAL; tn->net_id = val; } if (attrs[TIPC_NLA_NET_ADDR]) { u32 addr; addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); if (!addr) return -EINVAL; tn->legacy_addr_format = true; tipc_net_init(net, NULL, addr); } if (attrs[TIPC_NLA_NET_NODEID]) { u8 node_id[NODE_ID_LEN]; u64 *w0 = (u64 *)&node_id[0]; u64 *w1 = (u64 *)&node_id[8]; if (!attrs[TIPC_NLA_NET_NODEID_W1]) return -EINVAL; *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); tipc_net_init(net, node_id, 0); } return 0; } int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_net_set(skb, info); rtnl_unlock(); return err; } static int __tipc_nl_addr_legacy_get(struct net *net, struct tipc_nl_msg *msg) { struct tipc_net *tn = tipc_net(net); struct nlattr *attrs; void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, 0, TIPC_NL_ADDR_LEGACY_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start(msg->skb, TIPC_NLA_NET); if (!attrs) goto msg_full; if (tn->legacy_addr_format) if (nla_put_flag(msg->skb, TIPC_NLA_NET_ADDR_LEGACY)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_nl_msg msg; struct sk_buff *rep; int err; rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!rep) return -ENOMEM; msg.skb = rep; msg.portid = info->snd_portid; msg.seq = info->snd_seq; err = __tipc_nl_addr_legacy_get(net, &msg); if (err) { nlmsg_free(msg.skb); return err; } return genlmsg_reply(msg.skb, info); }
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <asm/msr.h> #define CREATE_TRACE_POINTS #include <asm/msr-trace.h> struct msr __percpu *msrs_alloc(void) { struct msr __percpu *msrs = NULL; msrs = alloc_percpu(struct msr); if (!msrs) { pr_warn("%s: error allocating msrs\n", __func__); return NULL; } return msrs; } EXPORT_SYMBOL(msrs_alloc); void msrs_free(struct msr __percpu *msrs) { free_percpu(msrs); } EXPORT_SYMBOL(msrs_free); /** * msr_read - Read an MSR with error handling * @msr: MSR to read * @m: value to read into * * It returns read data only on success, otherwise it doesn't change the output * argument @m. * * Return: %0 for success, otherwise an error code */ static int msr_read(u32 msr, struct msr *m) { int err; u64 val; err = rdmsrl_safe(msr, &val); if (!err) m->q = val; return err; } /** * msr_write - Write an MSR with error handling * * @msr: MSR to write * @m: value to write * * Return: %0 for success, otherwise an error code */ static int msr_write(u32 msr, struct msr *m) { return wrmsrl_safe(msr, m->q); } static inline int __flip_bit(u32 msr, u8 bit, bool set) { struct msr m, m1; int err = -EINVAL; if (bit > 63) return err; err = msr_read(msr, &m); if (err) return err; m1 = m; if (set) m1.q |= BIT_64(bit); else m1.q &= ~BIT_64(bit); if (m1.q == m.q) return 0; err = msr_write(msr, &m1); if (err) return err; return 1; } /** * msr_set_bit - Set @bit in a MSR @msr. * @msr: MSR to write * @bit: bit number to set * * Return: * * < 0: An error was encountered. * * = 0: Bit was already set. * * > 0: Hardware accepted the MSR write. */ int msr_set_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, true); } /** * msr_clear_bit - Clear @bit in a MSR @msr. * @msr: MSR to write * @bit: bit number to clear * * Return: * * < 0: An error was encountered. * * = 0: Bit was already cleared. * * > 0: Hardware accepted the MSR write. */ int msr_clear_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, false); } #ifdef CONFIG_TRACEPOINTS void do_trace_write_msr(unsigned int msr, u64 val, int failed) { trace_write_msr(msr, val, failed); } EXPORT_SYMBOL(do_trace_write_msr); EXPORT_TRACEPOINT_SYMBOL(write_msr); void do_trace_read_msr(unsigned int msr, u64 val, int failed) { trace_read_msr(msr, val, failed); } EXPORT_SYMBOL(do_trace_read_msr); EXPORT_TRACEPOINT_SYMBOL(read_msr); void do_trace_rdpmc(unsigned counter, u64 val, int failed) { trace_rdpmc(counter, val, failed); } EXPORT_SYMBOL(do_trace_rdpmc); EXPORT_TRACEPOINT_SYMBOL(rdpmc); #endif
76 76 11 11 11 11 11 65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C)2003,2004 USAGI/WIDE Project * * Authors Mitsuru KANDA <mk@linux-ipv6.org> * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> * * Based on net/ipv4/xfrm4_tunnel.c */ #include <linux/module.h> #include <linux/xfrm.h> #include <linux/slab.h> #include <linux/rculist.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ipv6.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/mutex.h> #include <net/netns/generic.h> #define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 #define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 #define XFRM6_TUNNEL_SPI_MIN 1 #define XFRM6_TUNNEL_SPI_MAX 0xffffffff struct xfrm6_tunnel_net { struct hlist_head spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE]; struct hlist_head spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE]; u32 spi; }; static unsigned int xfrm6_tunnel_net_id __read_mostly; static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net) { return net_generic(net, xfrm6_tunnel_net_id); } /* * xfrm_tunnel_spi things are for allocating unique id ("spi") * per xfrm_address_t. */ struct xfrm6_tunnel_spi { struct hlist_node list_byaddr; struct hlist_node list_byspi; xfrm_address_t addr; u32 spi; refcount_t refcnt; struct rcu_head rcu_head; }; static DEFINE_SPINLOCK(xfrm6_tunnel_spi_lock); static struct kmem_cache *xfrm6_tunnel_spi_kmem __read_mostly; static inline unsigned int xfrm6_tunnel_spi_hash_byaddr(const xfrm_address_t *addr) { unsigned int h; h = ipv6_addr_hash((const struct in6_addr *)addr); h ^= h >> 16; h ^= h >> 8; h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1; return h; } static inline unsigned int xfrm6_tunnel_spi_hash_byspi(u32 spi) { return spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE; } static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; hlist_for_each_entry_rcu(x6spi, &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr, lockdep_is_held(&xfrm6_tunnel_spi_lock)) { if (xfrm6_addr_equal(&x6spi->addr, saddr)) return x6spi; } return NULL; } __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) { struct xfrm6_tunnel_spi *x6spi; u32 spi; rcu_read_lock_bh(); x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); spi = x6spi ? x6spi->spi : 0; rcu_read_unlock_bh(); return htonl(spi); } EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup); static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; int index = xfrm6_tunnel_spi_hash_byspi(spi); hlist_for_each_entry(x6spi, &xfrm6_tn->spi_byspi[index], list_byspi) { if (x6spi->spi == spi) return -1; } return index; } static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); u32 spi; struct xfrm6_tunnel_spi *x6spi; int index; if (xfrm6_tn->spi < XFRM6_TUNNEL_SPI_MIN || xfrm6_tn->spi >= XFRM6_TUNNEL_SPI_MAX) xfrm6_tn->spi = XFRM6_TUNNEL_SPI_MIN; else xfrm6_tn->spi++; for (spi = xfrm6_tn->spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) { index = __xfrm6_tunnel_spi_check(net, spi); if (index >= 0) goto alloc_spi; if (spi == XFRM6_TUNNEL_SPI_MAX) break; } for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) { index = __xfrm6_tunnel_spi_check(net, spi); if (index >= 0) goto alloc_spi; } spi = 0; goto out; alloc_spi: xfrm6_tn->spi = spi; x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, GFP_ATOMIC); if (!x6spi) goto out; memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr)); x6spi->spi = spi; refcount_set(&x6spi->refcnt, 1); hlist_add_head_rcu(&x6spi->list_byspi, &xfrm6_tn->spi_byspi[index]); index = xfrm6_tunnel_spi_hash_byaddr(saddr); hlist_add_head_rcu(&x6spi->list_byaddr, &xfrm6_tn->spi_byaddr[index]); out: return spi; } __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_spi *x6spi; u32 spi; spin_lock_bh(&xfrm6_tunnel_spi_lock); x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); if (x6spi) { refcount_inc(&x6spi->refcnt); spi = x6spi->spi; } else spi = __xfrm6_tunnel_alloc_spi(net, saddr); spin_unlock_bh(&xfrm6_tunnel_spi_lock); return htonl(spi); } EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi); static void x6spi_destroy_rcu(struct rcu_head *head) { kmem_cache_free(xfrm6_tunnel_spi_kmem, container_of(head, struct xfrm6_tunnel_spi, rcu_head)); } static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; struct hlist_node *n; spin_lock_bh(&xfrm6_tunnel_spi_lock); hlist_for_each_entry_safe(x6spi, n, &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr) { if (xfrm6_addr_equal(&x6spi->addr, saddr)) { if (refcount_dec_and_test(&x6spi->refcnt)) { hlist_del_rcu(&x6spi->list_byaddr); hlist_del_rcu(&x6spi->list_byspi); call_rcu(&x6spi->rcu_head, x6spi_destroy_rcu); break; } } } spin_unlock_bh(&xfrm6_tunnel_spi_lock); } static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { skb_push(skb, -skb_network_offset(skb)); return 0; } static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { return skb_network_header(skb)[IP6CB(skb)->nhoff]; } static int xfrm6_tunnel_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct ipv6hdr *iph = ipv6_hdr(skb); __be32 spi; spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { /* xfrm6_tunnel native err handling */ switch (type) { case ICMPV6_DEST_UNREACH: switch (code) { case ICMPV6_NOROUTE: case ICMPV6_ADM_PROHIBITED: case ICMPV6_NOT_NEIGHBOUR: case ICMPV6_ADDR_UNREACH: case ICMPV6_PORT_UNREACH: default: break; } break; case ICMPV6_PKT_TOOBIG: break; case ICMPV6_TIME_EXCEED: switch (code) { case ICMPV6_EXC_HOPLIMIT: break; case ICMPV6_EXC_FRAGTIME: default: break; } break; case ICMPV6_PARAMPROB: switch (code) { case ICMPV6_HDR_FIELD: break; case ICMPV6_UNK_NEXTHDR: break; case ICMPV6_UNK_OPTION: break; } break; default: break; } return 0; } static int xfrm6_tunnel_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->props.mode != XFRM_MODE_TUNNEL) { NL_SET_ERR_MSG(extack, "IPv6 tunnel can only be used with tunnel mode"); return -EINVAL; } if (x->encap) { NL_SET_ERR_MSG(extack, "IPv6 tunnel is not compatible with encapsulation"); return -EINVAL; } x->props.header_len = sizeof(struct ipv6hdr); return 0; } static void xfrm6_tunnel_destroy(struct xfrm_state *x) { struct net *net = xs_net(x); xfrm6_tunnel_free_spi(net, (xfrm_address_t *)&x->props.saddr); } static const struct xfrm_type xfrm6_tunnel_type = { .owner = THIS_MODULE, .proto = IPPROTO_IPV6, .init_state = xfrm6_tunnel_init_state, .destructor = xfrm6_tunnel_destroy, .input = xfrm6_tunnel_input, .output = xfrm6_tunnel_output, }; static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 3, }; static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 3, }; static int __net_init xfrm6_tunnel_net_init(struct net *net) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) INIT_HLIST_HEAD(&xfrm6_tn->spi_byaddr[i]); for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) INIT_HLIST_HEAD(&xfrm6_tn->spi_byspi[i]); xfrm6_tn->spi = 0; return 0; } static void __net_exit xfrm6_tunnel_net_exit(struct net *net) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; xfrm_flush_gc(); xfrm_state_flush(net, 0, false, true); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byspi[i])); } static struct pernet_operations xfrm6_tunnel_net_ops = { .init = xfrm6_tunnel_net_init, .exit = xfrm6_tunnel_net_exit, .id = &xfrm6_tunnel_net_id, .size = sizeof(struct xfrm6_tunnel_net), }; static int __init xfrm6_tunnel_init(void) { int rv; xfrm6_tunnel_spi_kmem = KMEM_CACHE(xfrm6_tunnel_spi, SLAB_HWCACHE_ALIGN); if (!xfrm6_tunnel_spi_kmem) return -ENOMEM; rv = register_pernet_subsys(&xfrm6_tunnel_net_ops); if (rv < 0) goto out_pernet; rv = xfrm_register_type(&xfrm6_tunnel_type, AF_INET6); if (rv < 0) goto out_type; rv = xfrm6_tunnel_register(&xfrm6_tunnel_handler, AF_INET6); if (rv < 0) goto out_xfrm6; rv = xfrm6_tunnel_register(&xfrm46_tunnel_handler, AF_INET); if (rv < 0) goto out_xfrm46; return 0; out_xfrm46: xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); out_xfrm6: xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); out_type: unregister_pernet_subsys(&xfrm6_tunnel_net_ops); out_pernet: kmem_cache_destroy(xfrm6_tunnel_spi_kmem); return rv; } static void __exit xfrm6_tunnel_fini(void) { xfrm6_tunnel_deregister(&xfrm46_tunnel_handler, AF_INET); xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); unregister_pernet_subsys(&xfrm6_tunnel_net_ops); /* Someone maybe has gotten the xfrm6_tunnel_spi. * So need to wait it. */ rcu_barrier(); kmem_cache_destroy(xfrm6_tunnel_spi_kmem); } module_init(xfrm6_tunnel_init); module_exit(xfrm6_tunnel_fini); MODULE_DESCRIPTION("IPv6 XFRM tunnel driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_IPV6);
1057 1057 1057 1057 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine (KVM) Hypervisor * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> */ #include <kvm/iodev.h> #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/percpu.h> #include <linux/mm.h> #include <linux/miscdevice.h> #include <linux/vmalloc.h> #include <linux/reboot.h> #include <linux/debugfs.h> #include <linux/highmem.h> #include <linux/file.h> #include <linux/syscore_ops.h> #include <linux/cpu.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/sched/stat.h> #include <linux/cpumask.h> #include <linux/smp.h> #include <linux/anon_inodes.h> #include <linux/profile.h> #include <linux/kvm_para.h> #include <linux/pagemap.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/compat.h> #include <linux/srcu.h> #include <linux/hugetlb.h> #include <linux/slab.h> #include <linux/sort.h> #include <linux/bsearch.h> #include <linux/io.h> #include <linux/lockdep.h> #include <linux/kthread.h> #include <linux/suspend.h> #include <asm/processor.h> #include <asm/ioctl.h> #include <linux/uaccess.h> #include "coalesced_mmio.h" #include "async_pf.h" #include "kvm_mm.h" #include "vfio.h" #include <trace/events/ipi.h> #define CREATE_TRACE_POINTS #include <trace/events/kvm.h> #include <linux/kvm_dirty_ring.h> /* Worst case buffer size needed for holding an integer. */ #define ITOA_MAX_LEN 12 MODULE_AUTHOR("Qumranet"); MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor"); MODULE_LICENSE("GPL"); /* Architectures should define their poll value according to the halt latency */ unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; module_param(halt_poll_ns, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns); /* Default doubles per-vcpu halt_poll_ns. */ unsigned int halt_poll_ns_grow = 2; module_param(halt_poll_ns_grow, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_grow); /* The start value to grow halt_poll_ns from */ unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ module_param(halt_poll_ns_grow_start, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); /* Default halves per-vcpu halt_poll_ns. */ unsigned int halt_poll_ns_shrink = 2; module_param(halt_poll_ns_shrink, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); /* * Allow direct access (from KVM or the CPU) without MMU notifier protection * to unpinned pages. */ static bool allow_unsafe_mappings; module_param(allow_unsafe_mappings, bool, 0444); /* * Ordering of locks: * * kvm->lock --> kvm->slots_lock --> kvm->irq_lock */ DEFINE_MUTEX(kvm_lock); LIST_HEAD(vm_list); static struct kmem_cache *kvm_vcpu_cache; static __read_mostly struct preempt_ops kvm_preempt_ops; static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu); static struct dentry *kvm_debugfs_dir; static const struct file_operations stat_fops_per_vm; static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); #ifdef CONFIG_KVM_COMPAT static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); #define KVM_COMPAT(c) .compat_ioctl = (c) #else /* * For architectures that don't implement a compat infrastructure, * adopt a double line of defense: * - Prevent a compat task from opening /dev/kvm * - If the open has been done by a 64bit task, and the KVM fd * passed to a compat task, let the ioctls fail. */ static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { return -EINVAL; } static int kvm_no_compat_open(struct inode *inode, struct file *file) { return is_compat_task() ? -ENODEV : 0; } #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ .open = kvm_no_compat_open #endif static int kvm_enable_virtualization(void); static void kvm_disable_virtualization(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); #define KVM_EVENT_CREATE_VM 0 #define KVM_EVENT_DESTROY_VM 1 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); static unsigned long long kvm_createvm_count; static unsigned long long kvm_active_vms; static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask); __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { } /* * Switches to specified vcpu, until a matching vcpu_put() */ void vcpu_load(struct kvm_vcpu *vcpu) { int cpu = get_cpu(); __this_cpu_write(kvm_running_vcpu, vcpu); preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); } EXPORT_SYMBOL_GPL(vcpu_load); void vcpu_put(struct kvm_vcpu *vcpu) { preempt_disable(); kvm_arch_vcpu_put(vcpu); preempt_notifier_unregister(&vcpu->preempt_notifier); __this_cpu_write(kvm_running_vcpu, NULL); preempt_enable(); } EXPORT_SYMBOL_GPL(vcpu_put); /* TODO: merge with kvm_arch_vcpu_should_kick */ static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) { int mode = kvm_vcpu_exiting_guest_mode(vcpu); /* * We need to wait for the VCPU to reenable interrupts and get out of * READING_SHADOW_PAGE_TABLES mode. */ if (req & KVM_REQUEST_WAIT) return mode != OUTSIDE_GUEST_MODE; /* * Need to kick a running VCPU, but otherwise there is nothing to do. */ return mode == IN_GUEST_MODE; } static void ack_kick(void *_completed) { } static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait) { if (cpumask_empty(cpus)) return false; smp_call_function_many(cpus, ack_kick, NULL, wait); return true; } static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req, struct cpumask *tmp, int current_cpu) { int cpu; if (likely(!(req & KVM_REQUEST_NO_ACTION))) __kvm_make_request(req, vcpu); if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) return; /* * Note, the vCPU could get migrated to a different pCPU at any point * after kvm_request_needs_ipi(), which could result in sending an IPI * to the previous pCPU. But, that's OK because the purpose of the IPI * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES * after this point is also OK, as the requirement is only that KVM wait * for vCPUs that were reading SPTEs _before_ any changes were * finalized. See kvm_vcpu_kick() for more details on handling requests. */ if (kvm_request_needs_ipi(vcpu, req)) { cpu = READ_ONCE(vcpu->cpu); if (cpu != -1 && cpu != current_cpu) __cpumask_set_cpu(cpu, tmp); } } bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, unsigned long *vcpu_bitmap) { struct kvm_vcpu *vcpu; struct cpumask *cpus; int i, me; bool called; me = get_cpu(); cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask); cpumask_clear(cpus); for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) { vcpu = kvm_get_vcpu(kvm, i); if (!vcpu) continue; kvm_make_vcpu_request(vcpu, req, cpus, me); } called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); put_cpu(); return called; } bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { struct kvm_vcpu *vcpu; struct cpumask *cpus; unsigned long i; bool called; int me; me = get_cpu(); cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask); cpumask_clear(cpus); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_vcpu_request(vcpu, req, cpus, me); called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); put_cpu(); return called; } EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request); void kvm_flush_remote_tlbs(struct kvm *kvm) { ++kvm->stat.generic.remote_tlb_flush_requests; /* * We want to publish modifications to the page tables before reading * mode. Pairs with a memory barrier in arch-specific code. * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest * and smp_mb in walk_shadow_page_lockless_begin/end. * - powerpc: smp_mb in kvmppc_prepare_to_enter. * * There is already an smp_mb__after_atomic() before * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that * barrier here. */ if (!kvm_arch_flush_remote_tlbs(kvm) || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.generic.remote_tlb_flush; } EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages)) return; /* * Fall back to a flushing entire TLBs if the architecture range-based * TLB invalidation is unsupported or can't be performed for whatever * reason. */ kvm_flush_remote_tlbs(kvm); } void kvm_flush_remote_tlbs_memslot(struct kvm *kvm, const struct kvm_memory_slot *memslot) { /* * All current use cases for flushing the TLBs for a specific memslot * are related to dirty logging, and many do the TLB flush out of * mmu_lock. The interaction between the various operations on memslot * must be serialized by slots_locks to ensure the TLB flush from one * operation is observed by any other operation on the same memslot. */ lockdep_assert_held(&kvm->slots_lock); kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages); } static void kvm_flush_shadow_all(struct kvm *kvm) { kvm_arch_flush_shadow_all(kvm); kvm_arch_guest_memory_reclaimed(kvm); } #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, gfp_t gfp_flags) { void *page; gfp_flags |= mc->gfp_zero; if (mc->kmem_cache) return kmem_cache_alloc(mc->kmem_cache, gfp_flags); page = (void *)__get_free_page(gfp_flags); if (page && mc->init_value) memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64)); return page; } int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min) { gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT; void *obj; if (mc->nobjs >= min) return 0; if (unlikely(!mc->objects)) { if (WARN_ON_ONCE(!capacity)) return -EIO; /* * Custom init values can be used only for page allocations, * and obviously conflict with __GFP_ZERO. */ if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero))) return -EIO; mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp); if (!mc->objects) return -ENOMEM; mc->capacity = capacity; } /* It is illegal to request a different capacity across topups. */ if (WARN_ON_ONCE(mc->capacity != capacity)) return -EIO; while (mc->nobjs < mc->capacity) { obj = mmu_memory_cache_alloc_obj(mc, gfp); if (!obj) return mc->nobjs >= min ? 0 : -ENOMEM; mc->objects[mc->nobjs++] = obj; } return 0; } int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) { return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min); } int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc) { return mc->nobjs; } void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) { while (mc->nobjs) { if (mc->kmem_cache) kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]); else free_page((unsigned long)mc->objects[--mc->nobjs]); } kvfree(mc->objects); mc->objects = NULL; mc->capacity = 0; } void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) { void *p; if (WARN_ON(!mc->nobjs)) p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT); else p = mc->objects[--mc->nobjs]; BUG_ON(!p); return p; } #endif static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { mutex_init(&vcpu->mutex); vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; rwlock_init(&vcpu->pid_lock); #ifndef __KVM_HAVE_ARCH_WQP rcuwait_init(&vcpu->wait); #endif kvm_async_pf_vcpu_init(vcpu); kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); vcpu->last_used_slot = NULL; /* Fill the stats id string for the vcpu */ snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", task_pid_nr(current), id); } static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { kvm_arch_vcpu_destroy(vcpu); kvm_dirty_ring_free(&vcpu->dirty_ring); /* * No need for rcu_read_lock as VCPU_RUN is the only place that changes * the vcpu->pid pointer, and at destruction time all file descriptors * are already gone. */ put_pid(vcpu->pid); free_page((unsigned long)vcpu->run); kmem_cache_free(kvm_vcpu_cache, vcpu); } void kvm_destroy_vcpus(struct kvm *kvm) { unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { kvm_vcpu_destroy(vcpu); xa_erase(&kvm->vcpu_array, i); } atomic_set(&kvm->online_vcpus, 0); } EXPORT_SYMBOL_GPL(kvm_destroy_vcpus); #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) { return container_of(mn, struct kvm, mmu_notifier); } typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); typedef void (*on_lock_fn_t)(struct kvm *kvm); struct kvm_mmu_notifier_range { /* * 64-bit addresses, as KVM notifiers can operate on host virtual * addresses (unsigned long) and guest physical addresses (64-bit). */ u64 start; u64 end; union kvm_mmu_notifier_arg arg; gfn_handler_t handler; on_lock_fn_t on_lock; bool flush_on_ret; bool may_block; }; /* * The inner-most helper returns a tuple containing the return value from the * arch- and action-specific handler, plus a flag indicating whether or not at * least one memslot was found, i.e. if the handler found guest memory. * * Note, most notifiers are averse to booleans, so even though KVM tracks the * return from arch code as a bool, outer helpers will cast it to an int. :-( */ typedef struct kvm_mmu_notifier_return { bool ret; bool found_memslot; } kvm_mn_ret_t; /* * Use a dedicated stub instead of NULL to indicate that there is no callback * function/handler. The compiler technically can't guarantee that a real * function will have a non-zero address, and so it will generate code to * check for !NULL, whereas comparing against a stub will be elided at compile * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9). */ static void kvm_null_fn(void) { } #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn) /* Iterate over each memslot intersecting [start, last] (inclusive) range */ #define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \ for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \ node; \ node = interval_tree_iter_next(node, start, last)) \ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, const struct kvm_mmu_notifier_range *range) { struct kvm_mmu_notifier_return r = { .ret = false, .found_memslot = false, }; struct kvm_gfn_range gfn_range; struct kvm_memory_slot *slot; struct kvm_memslots *slots; int i, idx; if (WARN_ON_ONCE(range->end <= range->start)) return r; /* A null handler is allowed if and only if on_lock() is provided. */ if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) && IS_KVM_NULL_FN(range->handler))) return r; idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { struct interval_tree_node *node; slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_hva_range(node, slots, range->start, range->end - 1) { unsigned long hva_start, hva_end; slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]); hva_start = max_t(unsigned long, range->start, slot->userspace_addr); hva_end = min_t(unsigned long, range->end, slot->userspace_addr + (slot->npages << PAGE_SHIFT)); /* * To optimize for the likely case where the address * range is covered by zero or one memslots, don't * bother making these conditional (to avoid writes on * the second or later invocation of the handler). */ gfn_range.arg = range->arg; gfn_range.may_block = range->may_block; /* * HVA-based notifications aren't relevant to private * mappings as they don't have a userspace mapping. */ gfn_range.attr_filter = KVM_FILTER_SHARED; /* * {gfn(page) | page intersects with [hva_start, hva_end)} = * {gfn_start, gfn_start+1, ..., gfn_end-1}. */ gfn_range.start = hva_to_gfn_memslot(hva_start, slot); gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot); gfn_range.slot = slot; if (!r.found_memslot) { r.found_memslot = true; KVM_MMU_LOCK(kvm); if (!IS_KVM_NULL_FN(range->on_lock)) range->on_lock(kvm); if (IS_KVM_NULL_FN(range->handler)) goto mmu_unlock; } r.ret |= range->handler(kvm, &gfn_range); } } if (range->flush_on_ret && r.ret) kvm_flush_remote_tlbs(kvm); mmu_unlock: if (r.found_memslot) KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return r; } static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, unsigned long start, unsigned long end, gfn_handler_t handler, bool flush_on_ret) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range range = { .start = start, .end = end, .handler = handler, .on_lock = (void *)kvm_null_fn, .flush_on_ret = flush_on_ret, .may_block = false, }; return __kvm_handle_hva_range(kvm, &range).ret; } static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn, unsigned long start, unsigned long end, gfn_handler_t handler) { return kvm_handle_hva_range(mn, start, end, handler, false); } void kvm_mmu_invalidate_begin(struct kvm *kvm) { lockdep_assert_held_write(&kvm->mmu_lock); /* * The count increase must become visible at unlock time as no * spte can be established without taking the mmu_lock and * count is also read inside the mmu_lock critical section. */ kvm->mmu_invalidate_in_progress++; if (likely(kvm->mmu_invalidate_in_progress == 1)) { kvm->mmu_invalidate_range_start = INVALID_GPA; kvm->mmu_invalidate_range_end = INVALID_GPA; } } void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end) { lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress); if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) { kvm->mmu_invalidate_range_start = start; kvm->mmu_invalidate_range_end = end; } else { /* * Fully tracking multiple concurrent ranges has diminishing * returns. Keep things simple and just find the minimal range * which includes the current and new ranges. As there won't be * enough information to subtract a range after its invalidate * completes, any ranges invalidated concurrently will * accumulate and persist until all outstanding invalidates * complete. */ kvm->mmu_invalidate_range_start = min(kvm->mmu_invalidate_range_start, start); kvm->mmu_invalidate_range_end = max(kvm->mmu_invalidate_range_end, end); } } bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { kvm_mmu_invalidate_range_add(kvm, range->start, range->end); return kvm_unmap_gfn_range(kvm, range); } static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range hva_range = { .start = range->start, .end = range->end, .handler = kvm_mmu_unmap_gfn_range, .on_lock = kvm_mmu_invalidate_begin, .flush_on_ret = true, .may_block = mmu_notifier_range_blockable(range), }; trace_kvm_unmap_hva_range(range->start, range->end); /* * Prevent memslot modification between range_start() and range_end() * so that conditionally locking provides the same result in both * functions. Without that guarantee, the mmu_invalidate_in_progress * adjustments will be imbalanced. * * Pairs with the decrement in range_end(). */ spin_lock(&kvm->mn_invalidate_lock); kvm->mn_active_invalidate_count++; spin_unlock(&kvm->mn_invalidate_lock); /* * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e. * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring * each cache's lock. There are relatively few caches in existence at * any given time, and the caches themselves can check for hva overlap, * i.e. don't need to rely on memslot overlap checks for performance. * Because this runs without holding mmu_lock, the pfn caches must use * mn_active_invalidate_count (see above) instead of * mmu_invalidate_in_progress. */ gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end); /* * If one or more memslots were found and thus zapped, notify arch code * that guest memory has been reclaimed. This needs to be done *after* * dropping mmu_lock, as x86's reclaim path is slooooow. */ if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot) kvm_arch_guest_memory_reclaimed(kvm); return 0; } void kvm_mmu_invalidate_end(struct kvm *kvm) { lockdep_assert_held_write(&kvm->mmu_lock); /* * This sequence increase will notify the kvm page fault that * the page that is going to be mapped in the spte could have * been freed. */ kvm->mmu_invalidate_seq++; smp_wmb(); /* * The above sequence increase must be visible before the * below count decrease, which is ensured by the smp_wmb above * in conjunction with the smp_rmb in mmu_invalidate_retry(). */ kvm->mmu_invalidate_in_progress--; KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm); /* * Assert that at least one range was added between start() and end(). * Not adding a range isn't fatal, but it is a KVM bug. */ WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA); } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range hva_range = { .start = range->start, .end = range->end, .handler = (void *)kvm_null_fn, .on_lock = kvm_mmu_invalidate_end, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), }; bool wake; __kvm_handle_hva_range(kvm, &hva_range); /* Pairs with the increment in range_start(). */ spin_lock(&kvm->mn_invalidate_lock); if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count)) --kvm->mn_active_invalidate_count; wake = !kvm->mn_active_invalidate_count; spin_unlock(&kvm->mn_invalidate_lock); /* * There can only be one waiter, since the wait happens under * slots_lock. */ if (wake) rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); } static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { trace_kvm_age_hva(start, end); return kvm_handle_hva_range(mn, start, end, kvm_age_gfn, !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG)); } static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { trace_kvm_age_hva(start, end); /* * Even though we do not flush TLB, this will still adversely * affect performance on pre-Haswell Intel EPT, where there is * no EPT Access Bit to clear so that we have to tear down EPT * tables instead. If we find this unacceptable, we can always * add a parameter to kvm_age_hva so that it effectively doesn't * do anything on clear_young. * * Also note that currently we never issue secondary TLB flushes * from clear_young, leaving this job up to the regular system * cadence. If we find this inaccurate, we might come up with a * more sophisticated heuristic later. */ return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn); } static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address) { trace_kvm_test_age_hva(address); return kvm_handle_hva_range_no_flush(mn, address, address + 1, kvm_test_age_gfn); } static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int idx; idx = srcu_read_lock(&kvm->srcu); kvm_flush_shadow_all(kvm); srcu_read_unlock(&kvm->srcu, idx); } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, .clear_young = kvm_mmu_notifier_clear_young, .test_young = kvm_mmu_notifier_test_young, .release = kvm_mmu_notifier_release, }; static int kvm_init_mmu_notifier(struct kvm *kvm) { kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; return mmu_notifier_register(&kvm->mmu_notifier, current->mm); } #else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */ static int kvm_init_mmu_notifier(struct kvm *kvm) { return 0; } #endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */ #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER static int kvm_pm_notifier_call(struct notifier_block *bl, unsigned long state, void *unused) { struct kvm *kvm = container_of(bl, struct kvm, pm_notifier); return kvm_arch_pm_notifier(kvm, state); } static void kvm_init_pm_notifier(struct kvm *kvm) { kvm->pm_notifier.notifier_call = kvm_pm_notifier_call; /* Suspend KVM before we suspend ftrace, RCU, etc. */ kvm->pm_notifier.priority = INT_MAX; register_pm_notifier(&kvm->pm_notifier); } static void kvm_destroy_pm_notifier(struct kvm *kvm) { unregister_pm_notifier(&kvm->pm_notifier); } #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */ static void kvm_init_pm_notifier(struct kvm *kvm) { } static void kvm_destroy_pm_notifier(struct kvm *kvm) { } #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) { if (!memslot->dirty_bitmap) return; vfree(memslot->dirty_bitmap); memslot->dirty_bitmap = NULL; } /* This does not remove the slot from struct kvm_memslots data structures */ static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { if (slot->flags & KVM_MEM_GUEST_MEMFD) kvm_gmem_unbind(slot); kvm_destroy_dirty_bitmap(slot); kvm_arch_free_memslot(kvm, slot); kfree(slot); } static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) { struct hlist_node *idnode; struct kvm_memory_slot *memslot; int bkt; /* * The same memslot objects live in both active and inactive sets, * arbitrarily free using index '1' so the second invocation of this * function isn't operating over a structure with dangling pointers * (even though this function isn't actually touching them). */ if (!slots->node_idx) return; hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1]) kvm_free_memslot(kvm, memslot); } static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc) { switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) { case KVM_STATS_TYPE_INSTANT: return 0444; case KVM_STATS_TYPE_CUMULATIVE: case KVM_STATS_TYPE_PEAK: default: return 0644; } } static void kvm_destroy_vm_debugfs(struct kvm *kvm) { int i; int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; if (IS_ERR(kvm->debugfs_dentry)) return; debugfs_remove_recursive(kvm->debugfs_dentry); if (kvm->debugfs_stat_data) { for (i = 0; i < kvm_debugfs_num_entries; i++) kfree(kvm->debugfs_stat_data[i]); kfree(kvm->debugfs_stat_data); } } static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname) { static DEFINE_MUTEX(kvm_debugfs_lock); struct dentry *dent; char dir_name[ITOA_MAX_LEN * 2]; struct kvm_stat_data *stat_data; const struct _kvm_stats_desc *pdesc; int i, ret = -ENOMEM; int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; if (!debugfs_initialized()) return 0; snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname); mutex_lock(&kvm_debugfs_lock); dent = debugfs_lookup(dir_name, kvm_debugfs_dir); if (dent) { pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name); dput(dent); mutex_unlock(&kvm_debugfs_lock); return 0; } dent = debugfs_create_dir(dir_name, kvm_debugfs_dir); mutex_unlock(&kvm_debugfs_lock); if (IS_ERR(dent)) return 0; kvm->debugfs_dentry = dent; kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, sizeof(*kvm->debugfs_stat_data), GFP_KERNEL_ACCOUNT); if (!kvm->debugfs_stat_data) goto out_err; for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) { pdesc = &kvm_vm_stats_desc[i]; stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); if (!stat_data) goto out_err; stat_data->kvm = kvm; stat_data->desc = pdesc; stat_data->kind = KVM_STAT_VM; kvm->debugfs_stat_data[i] = stat_data; debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), kvm->debugfs_dentry, stat_data, &stat_fops_per_vm); } for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) { pdesc = &kvm_vcpu_stats_desc[i]; stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); if (!stat_data) goto out_err; stat_data->kvm = kvm; stat_data->desc = pdesc; stat_data->kind = KVM_STAT_VCPU; kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data; debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), kvm->debugfs_dentry, stat_data, &stat_fops_per_vm); } kvm_arch_create_vm_debugfs(kvm); return 0; out_err: kvm_destroy_vm_debugfs(kvm); return ret; } /* * Called after the VM is otherwise initialized, but just before adding it to * the vm_list. */ int __weak kvm_arch_post_init_vm(struct kvm *kvm) { return 0; } /* * Called just after removing the VM from the vm_list, but before doing any * other destruction. */ void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) { } /* * Called after per-vm debugfs created. When called kvm->debugfs_dentry should * be setup already, so we can create arch-specific debugfs entries under it. * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so * a per-arch destroy interface is not needed. */ void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) { } static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) { struct kvm *kvm = kvm_arch_alloc_vm(); struct kvm_memslots *slots; int r, i, j; if (!kvm) return ERR_PTR(-ENOMEM); KVM_MMU_LOCK_INIT(kvm); mmgrab(current->mm); kvm->mm = current->mm; kvm_eventfd_init(kvm); mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); mutex_init(&kvm->slots_arch_lock); spin_lock_init(&kvm->mn_invalidate_lock); rcuwait_init(&kvm->mn_memslots_update_rcuwait); xa_init(&kvm->vcpu_array); #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES xa_init(&kvm->mem_attr_array); #endif INIT_LIST_HEAD(&kvm->gpc_list); spin_lock_init(&kvm->gpc_lock); INIT_LIST_HEAD(&kvm->devices); kvm->max_vcpus = KVM_MAX_VCPUS; BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); /* * Force subsequent debugfs file creations to fail if the VM directory * is not created (by kvm_create_vm_debugfs()). */ kvm->debugfs_dentry = ERR_PTR(-ENOENT); snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d", task_pid_nr(current)); r = -ENOMEM; if (init_srcu_struct(&kvm->srcu)) goto out_err_no_srcu; if (init_srcu_struct(&kvm->irq_srcu)) goto out_err_no_irq_srcu; r = kvm_init_irq_routing(kvm); if (r) goto out_err_no_irq_routing; refcount_set(&kvm->users_count, 1); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { for (j = 0; j < 2; j++) { slots = &kvm->__memslots[i][j]; atomic_long_set(&slots->last_used_slot, (unsigned long)NULL); slots->hva_tree = RB_ROOT_CACHED; slots->gfn_tree = RB_ROOT; hash_init(slots->id_hash); slots->node_idx = j; /* Generations must be different for each address space. */ slots->generation = i; } rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]); } r = -ENOMEM; for (i = 0; i < KVM_NR_BUSES; i++) { rcu_assign_pointer(kvm->buses[i], kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); if (!kvm->buses[i]) goto out_err_no_arch_destroy_vm; } r = kvm_arch_init_vm(kvm, type); if (r) goto out_err_no_arch_destroy_vm; r = kvm_enable_virtualization(); if (r) goto out_err_no_disable; #ifdef CONFIG_HAVE_KVM_IRQCHIP INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); #endif r = kvm_init_mmu_notifier(kvm); if (r) goto out_err_no_mmu_notifier; r = kvm_coalesced_mmio_init(kvm); if (r < 0) goto out_no_coalesced_mmio; r = kvm_create_vm_debugfs(kvm, fdname); if (r) goto out_err_no_debugfs; r = kvm_arch_post_init_vm(kvm); if (r) goto out_err; mutex_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); mutex_unlock(&kvm_lock); preempt_notifier_inc(); kvm_init_pm_notifier(kvm); return kvm; out_err: kvm_destroy_vm_debugfs(kvm); out_err_no_debugfs: kvm_coalesced_mmio_free(kvm); out_no_coalesced_mmio: #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER if (kvm->mmu_notifier.ops) mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); #endif out_err_no_mmu_notifier: kvm_disable_virtualization(); out_err_no_disable: kvm_arch_destroy_vm(kvm); out_err_no_arch_destroy_vm: WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm_get_bus(kvm, i)); kvm_free_irq_routing(kvm); out_err_no_irq_routing: cleanup_srcu_struct(&kvm->irq_srcu); out_err_no_irq_srcu: cleanup_srcu_struct(&kvm->srcu); out_err_no_srcu: kvm_arch_free_vm(kvm); mmdrop(current->mm); return ERR_PTR(r); } static void kvm_destroy_devices(struct kvm *kvm) { struct kvm_device *dev, *tmp; /* * We do not need to take the kvm->lock here, because nobody else * has a reference to the struct kvm at this point and therefore * cannot access the devices list anyhow. * * The device list is generally managed as an rculist, but list_del() * is used intentionally here. If a bug in KVM introduced a reader that * was not backed by a reference on the kvm struct, the hope is that * it'd consume the poisoned forward pointer instead of suffering a * use-after-free, even though this cannot be guaranteed. */ list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { list_del(&dev->vm_node); dev->ops->destroy(dev); } } static void kvm_destroy_vm(struct kvm *kvm) { int i; struct mm_struct *mm = kvm->mm; kvm_destroy_pm_notifier(kvm); kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); mutex_lock(&kvm_lock); list_del(&kvm->vm_list); mutex_unlock(&kvm_lock); kvm_arch_pre_destroy_vm(kvm); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { struct kvm_io_bus *bus = kvm_get_bus(kvm, i); if (bus) kvm_io_bus_destroy(bus); kvm->buses[i] = NULL; } kvm_coalesced_mmio_free(kvm); #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); /* * At this point, pending calls to invalidate_range_start() * have completed but no more MMU notifiers will run, so * mn_active_invalidate_count may remain unbalanced. * No threads can be waiting in kvm_swap_active_memslots() as the * last reference on KVM has been dropped, but freeing * memslots would deadlock without this manual intervention. * * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU * notifier between a start() and end(), then there shouldn't be any * in-progress invalidations. */ WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait)); if (kvm->mn_active_invalidate_count) kvm->mn_active_invalidate_count = 0; else WARN_ON(kvm->mmu_invalidate_in_progress); #else kvm_flush_shadow_all(kvm); #endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { kvm_free_memslots(kvm, &kvm->__memslots[i][0]); kvm_free_memslots(kvm, &kvm->__memslots[i][1]); } cleanup_srcu_struct(&kvm->irq_srcu); cleanup_srcu_struct(&kvm->srcu); #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES xa_destroy(&kvm->mem_attr_array); #endif kvm_arch_free_vm(kvm); preempt_notifier_dec(); kvm_disable_virtualization(); mmdrop(mm); } void kvm_get_kvm(struct kvm *kvm) { refcount_inc(&kvm->users_count); } EXPORT_SYMBOL_GPL(kvm_get_kvm); /* * Make sure the vm is not during destruction, which is a safe version of * kvm_get_kvm(). Return true if kvm referenced successfully, false otherwise. */ bool kvm_get_kvm_safe(struct kvm *kvm) { return refcount_inc_not_zero(&kvm->users_count); } EXPORT_SYMBOL_GPL(kvm_get_kvm_safe); void kvm_put_kvm(struct kvm *kvm) { if (refcount_dec_and_test(&kvm->users_count)) kvm_destroy_vm(kvm); } EXPORT_SYMBOL_GPL(kvm_put_kvm); /* * Used to put a reference that was taken on behalf of an object associated * with a user-visible file descriptor, e.g. a vcpu or device, if installation * of the new file descriptor fails and the reference cannot be transferred to * its final owner. In such cases, the caller is still actively using @kvm and * will fail miserably if the refcount unexpectedly hits zero. */ void kvm_put_kvm_no_destroy(struct kvm *kvm) { WARN_ON(refcount_dec_and_test(&kvm->users_count)); } EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy); static int kvm_vm_release(struct inode *inode, struct file *filp) { struct kvm *kvm = filp->private_data; kvm_irqfd_release(kvm); kvm_put_kvm(kvm); return 0; } /* * Allocation size is twice as large as the actual dirty bitmap size. * See kvm_vm_ioctl_get_dirty_log() why this is needed. */ static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot); memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT); if (!memslot->dirty_bitmap) return -ENOMEM; return 0; } static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id) { struct kvm_memslots *active = __kvm_memslots(kvm, as_id); int node_idx_inactive = active->node_idx ^ 1; return &kvm->__memslots[as_id][node_idx_inactive]; } /* * Helper to get the address space ID when one of memslot pointers may be NULL. * This also serves as a sanity that at least one of the pointers is non-NULL, * and that their address space IDs don't diverge. */ static int kvm_memslots_get_as_id(struct kvm_memory_slot *a, struct kvm_memory_slot *b) { if (WARN_ON_ONCE(!a && !b)) return 0; if (!a) return b->as_id; if (!b) return a->as_id; WARN_ON_ONCE(a->as_id != b->as_id); return a->as_id; } static void kvm_insert_gfn_node(struct kvm_memslots *slots, struct kvm_memory_slot *slot) { struct rb_root *gfn_tree = &slots->gfn_tree; struct rb_node **node, *parent; int idx = slots->node_idx; parent = NULL; for (node = &gfn_tree->rb_node; *node; ) { struct kvm_memory_slot *tmp; tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]); parent = *node; if (slot->base_gfn < tmp->base_gfn) node = &(*node)->rb_left; else if (slot->base_gfn > tmp->base_gfn) node = &(*node)->rb_right; else BUG(); } rb_link_node(&slot->gfn_node[idx], parent, node); rb_insert_color(&slot->gfn_node[idx], gfn_tree); } static void kvm_erase_gfn_node(struct kvm_memslots *slots, struct kvm_memory_slot *slot) { rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree); } static void kvm_replace_gfn_node(struct kvm_memslots *slots, struct kvm_memory_slot *old, struct kvm_memory_slot *new) { int idx = slots->node_idx; WARN_ON_ONCE(old->base_gfn != new->base_gfn); rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx], &slots->gfn_tree); } /* * Replace @old with @new in the inactive memslots. * * With NULL @old this simply adds @new. * With NULL @new this simply removes @old. * * If @new is non-NULL its hva_node[slots_idx] range has to be set * appropriately. */ static void kvm_replace_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new) { int as_id = kvm_memslots_get_as_id(old, new); struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); int idx = slots->node_idx; if (old) { hash_del(&old->id_node[idx]); interval_tree_remove(&old->hva_node[idx], &slots->hva_tree); if ((long)old == atomic_long_read(&slots->last_used_slot)) atomic_long_set(&slots->last_used_slot, (long)new); if (!new) { kvm_erase_gfn_node(slots, old); return; } } /* * Initialize @new's hva range. Do this even when replacing an @old * slot, kvm_copy_memslot() deliberately does not touch node data. */ new->hva_node[idx].start = new->userspace_addr; new->hva_node[idx].last = new->userspace_addr + (new->npages << PAGE_SHIFT) - 1; /* * (Re)Add the new memslot. There is no O(1) interval_tree_replace(), * hva_node needs to be swapped with remove+insert even though hva can't * change when replacing an existing slot. */ hash_add(slots->id_hash, &new->id_node[idx], new->id); interval_tree_insert(&new->hva_node[idx], &slots->hva_tree); /* * If the memslot gfn is unchanged, rb_replace_node() can be used to * switch the node in the gfn tree instead of removing the old and * inserting the new as two separate operations. Replacement is a * single O(1) operation versus two O(log(n)) operations for * remove+insert. */ if (old && old->base_gfn == new->base_gfn) { kvm_replace_gfn_node(slots, old, new); } else { if (old) kvm_erase_gfn_node(slots, old); kvm_insert_gfn_node(slots, new); } } /* * Flags that do not access any of the extra space of struct * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS * only allows these. */ #define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \ (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) static int check_memory_region_flags(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; if (kvm_arch_has_private_mem(kvm)) valid_flags |= KVM_MEM_GUEST_MEMFD; /* Dirty logging private memory is not currently supported. */ if (mem->flags & KVM_MEM_GUEST_MEMFD) valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; /* * GUEST_MEMFD is incompatible with read-only memslots, as writes to * read-only memslots have emulated MMIO, not page fault, semantics, * and KVM doesn't allow emulated MMIO for private memory. */ if (kvm_arch_has_readonly_mem(kvm) && !(mem->flags & KVM_MEM_GUEST_MEMFD)) valid_flags |= KVM_MEM_READONLY; if (mem->flags & ~valid_flags) return -EINVAL; return 0; } static void kvm_swap_active_memslots(struct kvm *kvm, int as_id) { struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); /* Grab the generation from the activate memslots. */ u64 gen = __kvm_memslots(kvm, as_id)->generation; WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; /* * Do not store the new memslots while there are invalidations in * progress, otherwise the locking in invalidate_range_start and * invalidate_range_end will be unbalanced. */ spin_lock(&kvm->mn_invalidate_lock); prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait); while (kvm->mn_active_invalidate_count) { set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&kvm->mn_invalidate_lock); schedule(); spin_lock(&kvm->mn_invalidate_lock); } finish_rcuwait(&kvm->mn_memslots_update_rcuwait); rcu_assign_pointer(kvm->memslots[as_id], slots); spin_unlock(&kvm->mn_invalidate_lock); /* * Acquired in kvm_set_memslot. Must be released before synchronize * SRCU below in order to avoid deadlock with another thread * acquiring the slots_arch_lock in an srcu critical section. */ mutex_unlock(&kvm->slots_arch_lock); synchronize_srcu_expedited(&kvm->srcu); /* * Increment the new memslot generation a second time, dropping the * update in-progress flag and incrementing the generation based on * the number of address spaces. This provides a unique and easily * identifiable generation number while the memslots are in flux. */ gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; /* * Generations must be unique even across address spaces. We do not need * a global counter for that, instead the generation space is evenly split * across address spaces. For example, with two address spaces, address * space 0 will use generations 0, 2, 4, ... while address space 1 will * use generations 1, 3, 5, ... */ gen += kvm_arch_nr_memslot_as_ids(kvm); kvm_arch_memslots_updated(kvm, gen); slots->generation = gen; } static int kvm_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change) { int r; /* * If dirty logging is disabled, nullify the bitmap; the old bitmap * will be freed on "commit". If logging is enabled in both old and * new, reuse the existing bitmap. If logging is enabled only in the * new and KVM isn't using a ring buffer, allocate and initialize a * new bitmap. */ if (change != KVM_MR_DELETE) { if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) new->dirty_bitmap = NULL; else if (old && old->dirty_bitmap) new->dirty_bitmap = old->dirty_bitmap; else if (kvm_use_dirty_bitmap(kvm)) { r = kvm_alloc_dirty_bitmap(new); if (r) return r; if (kvm_dirty_log_manual_protect_and_init_set(kvm)) bitmap_set(new->dirty_bitmap, 0, new->npages); } } r = kvm_arch_prepare_memory_region(kvm, old, new, change); /* Free the bitmap on failure if it was allocated above. */ if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap)) kvm_destroy_dirty_bitmap(new); return r; } static void kvm_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { int old_flags = old ? old->flags : 0; int new_flags = new ? new->flags : 0; /* * Update the total number of memslot pages before calling the arch * hook so that architectures can consume the result directly. */ if (change == KVM_MR_DELETE) kvm->nr_memslot_pages -= old->npages; else if (change == KVM_MR_CREATE) kvm->nr_memslot_pages += new->npages; if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) { int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1; atomic_set(&kvm->nr_memslots_dirty_logging, atomic_read(&kvm->nr_memslots_dirty_logging) + change); } kvm_arch_commit_memory_region(kvm, old, new, change); switch (change) { case KVM_MR_CREATE: /* Nothing more to do. */ break; case KVM_MR_DELETE: /* Free the old memslot and all its metadata. */ kvm_free_memslot(kvm, old); break; case KVM_MR_MOVE: case KVM_MR_FLAGS_ONLY: /* * Free the dirty bitmap as needed; the below check encompasses * both the flags and whether a ring buffer is being used) */ if (old->dirty_bitmap && !new->dirty_bitmap) kvm_destroy_dirty_bitmap(old); /* * The final quirk. Free the detached, old slot, but only its * memory, not any metadata. Metadata, including arch specific * data, may be reused by @new. */ kfree(old); break; default: BUG(); } } /* * Activate @new, which must be installed in the inactive slots by the caller, * by swapping the active slots and then propagating @new to @old once @old is * unreachable and can be safely modified. * * With NULL @old this simply adds @new to @active (while swapping the sets). * With NULL @new this simply removes @old from @active and frees it * (while also swapping the sets). */ static void kvm_activate_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new) { int as_id = kvm_memslots_get_as_id(old, new); kvm_swap_active_memslots(kvm, as_id); /* Propagate the new memslot to the now inactive memslots. */ kvm_replace_memslot(kvm, old, new); } static void kvm_copy_memslot(struct kvm_memory_slot *dest, const struct kvm_memory_slot *src) { dest->base_gfn = src->base_gfn; dest->npages = src->npages; dest->dirty_bitmap = src->dirty_bitmap; dest->arch = src->arch; dest->userspace_addr = src->userspace_addr; dest->flags = src->flags; dest->id = src->id; dest->as_id = src->as_id; } static void kvm_invalidate_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *invalid_slot) { /* * Mark the current slot INVALID. As with all memslot modifications, * this must be done on an unreachable slot to avoid modifying the * current slot in the active tree. */ kvm_copy_memslot(invalid_slot, old); invalid_slot->flags |= KVM_MEMSLOT_INVALID; kvm_replace_memslot(kvm, old, invalid_slot); /* * Activate the slot that is now marked INVALID, but don't propagate * the slot to the now inactive slots. The slot is either going to be * deleted or recreated as a new slot. */ kvm_swap_active_memslots(kvm, old->as_id); /* * From this point no new shadow pages pointing to a deleted, or moved, * memslot will be created. Validation of sp->gfn happens in: * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) * - kvm_is_visible_gfn (mmu_check_root) */ kvm_arch_flush_shadow_memslot(kvm, old); kvm_arch_guest_memory_reclaimed(kvm); /* Was released by kvm_swap_active_memslots(), reacquire. */ mutex_lock(&kvm->slots_arch_lock); /* * Copy the arch-specific field of the newly-installed slot back to the * old slot as the arch data could have changed between releasing * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock * above. Writers are required to retrieve memslots *after* acquiring * slots_arch_lock, thus the active slot's data is guaranteed to be fresh. */ old->arch = invalid_slot->arch; } static void kvm_create_memslot(struct kvm *kvm, struct kvm_memory_slot *new) { /* Add the new memslot to the inactive set and activate. */ kvm_replace_memslot(kvm, NULL, new); kvm_activate_memslot(kvm, NULL, new); } static void kvm_delete_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *invalid_slot) { /* * Remove the old memslot (in the inactive memslots) by passing NULL as * the "new" slot, and for the invalid version in the active slots. */ kvm_replace_memslot(kvm, old, NULL); kvm_activate_memslot(kvm, invalid_slot, NULL); } static void kvm_move_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new, struct kvm_memory_slot *invalid_slot) { /* * Replace the old memslot in the inactive slots, and then swap slots * and replace the current INVALID with the new as well. */ kvm_replace_memslot(kvm, old, new); kvm_activate_memslot(kvm, invalid_slot, new); } static void kvm_update_flags_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new) { /* * Similar to the MOVE case, but the slot doesn't need to be zapped as * an intermediate step. Instead, the old memslot is simply replaced * with a new, updated copy in both memslot sets. */ kvm_replace_memslot(kvm, old, new); kvm_activate_memslot(kvm, old, new); } static int kvm_set_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change) { struct kvm_memory_slot *invalid_slot; int r; /* * Released in kvm_swap_active_memslots(). * * Must be held from before the current memslots are copied until after * the new memslots are installed with rcu_assign_pointer, then * released before the synchronize srcu in kvm_swap_active_memslots(). * * When modifying memslots outside of the slots_lock, must be held * before reading the pointer to the current memslots until after all * changes to those memslots are complete. * * These rules ensure that installing new memslots does not lose * changes made to the previous memslots. */ mutex_lock(&kvm->slots_arch_lock); /* * Invalidate the old slot if it's being deleted or moved. This is * done prior to actually deleting/moving the memslot to allow vCPUs to * continue running by ensuring there are no mappings or shadow pages * for the memslot when it is deleted/moved. Without pre-invalidation * (and without a lock), a window would exist between effecting the * delete/move and committing the changes in arch code where KVM or a * guest could access a non-existent memslot. * * Modifications are done on a temporary, unreachable slot. The old * slot needs to be preserved in case a later step fails and the * invalidation needs to be reverted. */ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT); if (!invalid_slot) { mutex_unlock(&kvm->slots_arch_lock); return -ENOMEM; } kvm_invalidate_memslot(kvm, old, invalid_slot); } r = kvm_prepare_memory_region(kvm, old, new, change); if (r) { /* * For DELETE/MOVE, revert the above INVALID change. No * modifications required since the original slot was preserved * in the inactive slots. Changing the active memslots also * release slots_arch_lock. */ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { kvm_activate_memslot(kvm, invalid_slot, old); kfree(invalid_slot); } else { mutex_unlock(&kvm->slots_arch_lock); } return r; } /* * For DELETE and MOVE, the working slot is now active as the INVALID * version of the old slot. MOVE is particularly special as it reuses * the old slot and returns a copy of the old slot (in working_slot). * For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the * old slot is detached but otherwise preserved. */ if (change == KVM_MR_CREATE) kvm_create_memslot(kvm, new); else if (change == KVM_MR_DELETE) kvm_delete_memslot(kvm, old, invalid_slot); else if (change == KVM_MR_MOVE) kvm_move_memslot(kvm, old, new, invalid_slot); else if (change == KVM_MR_FLAGS_ONLY) kvm_update_flags_memslot(kvm, old, new); else BUG(); /* Free the temporary INVALID slot used for DELETE and MOVE. */ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) kfree(invalid_slot); /* * No need to refresh new->arch, changes after dropping slots_arch_lock * will directly hit the final, active memslot. Architectures are * responsible for knowing that new->arch may be stale. */ kvm_commit_memory_region(kvm, old, new, change); return 0; } static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, gfn_t start, gfn_t end) { struct kvm_memslot_iter iter; kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) { if (iter.slot->id != id) return true; } return false; } static int kvm_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem) { struct kvm_memory_slot *old, *new; struct kvm_memslots *slots; enum kvm_mr_change change; unsigned long npages; gfn_t base_gfn; int as_id, id; int r; lockdep_assert_held(&kvm->slots_lock); r = check_memory_region_flags(kvm, mem); if (r) return r; as_id = mem->slot >> 16; id = (u16)mem->slot; /* General sanity checks */ if ((mem->memory_size & (PAGE_SIZE - 1)) || (mem->memory_size != (unsigned long)mem->memory_size)) return -EINVAL; if (mem->guest_phys_addr & (PAGE_SIZE - 1)) return -EINVAL; /* We can read the guest memory with __xxx_user() later on. */ if ((mem->userspace_addr & (PAGE_SIZE - 1)) || (mem->userspace_addr != untagged_addr(mem->userspace_addr)) || !access_ok((void __user *)(unsigned long)mem->userspace_addr, mem->memory_size)) return -EINVAL; if (mem->flags & KVM_MEM_GUEST_MEMFD && (mem->guest_memfd_offset & (PAGE_SIZE - 1) || mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset)) return -EINVAL; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM) return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) return -EINVAL; if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) return -EINVAL; slots = __kvm_memslots(kvm, as_id); /* * Note, the old memslot (and the pointer itself!) may be invalidated * and/or destroyed by kvm_set_memslot(). */ old = id_to_memslot(slots, id); if (!mem->memory_size) { if (!old || !old->npages) return -EINVAL; if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages)) return -EIO; return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE); } base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT); npages = (mem->memory_size >> PAGE_SHIFT); if (!old || !old->npages) { change = KVM_MR_CREATE; /* * To simplify KVM internals, the total number of pages across * all memslots must fit in an unsigned long. */ if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages) return -EINVAL; } else { /* Modify an existing slot. */ /* Private memslots are immutable, they can only be deleted. */ if (mem->flags & KVM_MEM_GUEST_MEMFD) return -EINVAL; if ((mem->userspace_addr != old->userspace_addr) || (npages != old->npages) || ((mem->flags ^ old->flags) & KVM_MEM_READONLY)) return -EINVAL; if (base_gfn != old->base_gfn) change = KVM_MR_MOVE; else if (mem->flags != old->flags) change = KVM_MR_FLAGS_ONLY; else /* Nothing to change. */ return 0; } if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) && kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages)) return -EEXIST; /* Allocate a slot that will persist in the memslot. */ new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); if (!new) return -ENOMEM; new->as_id = as_id; new->id = id; new->base_gfn = base_gfn; new->npages = npages; new->flags = mem->flags; new->userspace_addr = mem->userspace_addr; if (mem->flags & KVM_MEM_GUEST_MEMFD) { r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset); if (r) goto out; } r = kvm_set_memslot(kvm, old, new, change); if (r) goto out_unbind; return 0; out_unbind: if (mem->flags & KVM_MEM_GUEST_MEMFD) kvm_gmem_unbind(new); out: kfree(new); return r; } int kvm_set_internal_memslot(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem) { if (WARN_ON_ONCE(mem->slot < KVM_USER_MEM_SLOTS)) return -EINVAL; if (WARN_ON_ONCE(mem->flags)) return -EINVAL; return kvm_set_memory_region(kvm, mem); } EXPORT_SYMBOL_GPL(kvm_set_internal_memslot); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region2 *mem) { if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; guard(mutex)(&kvm->slots_lock); return kvm_set_memory_region(kvm, mem); } #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT /** * kvm_get_dirty_log - get a snapshot of dirty pages * @kvm: pointer to kvm instance * @log: slot id and address to which we copy the log * @is_dirty: set to '1' if any dirty pages were found * @memslot: set to the associated memslot, always valid on success */ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, int *is_dirty, struct kvm_memory_slot **memslot) { struct kvm_memslots *slots; int i, as_id, id; unsigned long n; unsigned long any = 0; /* Dirty ring tracking may be exclusive to dirty log tracking */ if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; *memslot = NULL; *is_dirty = 0; as_id = log->slot >> 16; id = (u16)log->slot; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); *memslot = id_to_memslot(slots, id); if (!(*memslot) || !(*memslot)->dirty_bitmap) return -ENOENT; kvm_arch_sync_dirty_log(kvm, *memslot); n = kvm_dirty_bitmap_bytes(*memslot); for (i = 0; !any && i < n/sizeof(long); ++i) any = (*memslot)->dirty_bitmap[i]; if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n)) return -EFAULT; if (any) *is_dirty = 1; return 0; } EXPORT_SYMBOL_GPL(kvm_get_dirty_log); #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ /** * kvm_get_dirty_log_protect - get a snapshot of dirty pages * and reenable dirty page tracking for the corresponding pages. * @kvm: pointer to kvm instance * @log: slot id and address to which we copy the log * * We need to keep it in mind that VCPU threads can write to the bitmap * concurrently. So, to avoid losing track of dirty pages we keep the * following order: * * 1. Take a snapshot of the bit and clear it if needed. * 2. Write protect the corresponding page. * 3. Copy the snapshot to the userspace. * 4. Upon return caller flushes TLB's if needed. * * Between 2 and 4, the guest may write to the page using the remaining TLB * entry. This is not a problem because the page is reported dirty using * the snapshot taken before and step 4 ensures that writes done after * exiting to userspace will be logged for the next call. * */ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i, as_id, id; unsigned long n; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; bool flush; /* Dirty ring tracking may be exclusive to dirty log tracking */ if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; as_id = log->slot >> 16; id = (u16)log->slot; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); memslot = id_to_memslot(slots, id); if (!memslot || !memslot->dirty_bitmap) return -ENOENT; dirty_bitmap = memslot->dirty_bitmap; kvm_arch_sync_dirty_log(kvm, memslot); n = kvm_dirty_bitmap_bytes(memslot); flush = false; if (kvm->manual_dirty_log_protect) { /* * Unlike kvm_get_dirty_log, we always return false in *flush, * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There * is some code duplication between this function and * kvm_get_dirty_log, but hopefully all architecture * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log * can be eliminated. */ dirty_bitmap_buffer = dirty_bitmap; } else { dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); memset(dirty_bitmap_buffer, 0, n); KVM_MMU_LOCK(kvm); for (i = 0; i < n / sizeof(long); i++) { unsigned long mask; gfn_t offset; if (!dirty_bitmap[i]) continue; flush = true; mask = xchg(&dirty_bitmap[i], 0); dirty_bitmap_buffer[i] = mask; offset = i * BITS_PER_LONG; kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } KVM_MMU_UNLOCK(kvm); } if (flush) kvm_flush_remote_tlbs_memslot(kvm, memslot); if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) return -EFAULT; return 0; } /** * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot * @kvm: kvm instance * @log: slot id and address to which we copy the log * * Steps 1-4 below provide general overview of dirty page logging. See * kvm_get_dirty_log_protect() function description for additional details. * * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we * always flush the TLB (step 4) even if previous step failed and the dirty * bitmap may be corrupt. Regardless of previous outcome the KVM logging API * does not preclude user space subsequent dirty log read. Flushing TLB ensures * writes will be marked dirty for next log read. * * 1. Take a snapshot of the bit and clear it if needed. * 2. Write protect the corresponding page. * 3. Copy the snapshot to the userspace. * 4. Flush TLB's if needed. */ static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { int r; mutex_lock(&kvm->slots_lock); r = kvm_get_dirty_log_protect(kvm, log); mutex_unlock(&kvm->slots_lock); return r; } /** * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap * and reenable dirty page tracking for the corresponding pages. * @kvm: pointer to kvm instance * @log: slot id and address from which to fetch the bitmap of dirty pages */ static int kvm_clear_dirty_log_protect(struct kvm *kvm, struct kvm_clear_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int as_id, id; gfn_t offset; unsigned long i, n; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; bool flush; /* Dirty ring tracking may be exclusive to dirty log tracking */ if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; as_id = log->slot >> 16; id = (u16)log->slot; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; if (log->first_page & 63) return -EINVAL; slots = __kvm_memslots(kvm, as_id); memslot = id_to_memslot(slots, id); if (!memslot || !memslot->dirty_bitmap) return -ENOENT; dirty_bitmap = memslot->dirty_bitmap; n = ALIGN(log->num_pages, BITS_PER_LONG) / 8; if (log->first_page > memslot->npages || log->num_pages > memslot->npages - log->first_page || (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) return -EINVAL; kvm_arch_sync_dirty_log(kvm, memslot); flush = false; dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) return -EFAULT; KVM_MMU_LOCK(kvm); for (offset = log->first_page, i = offset / BITS_PER_LONG, n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; i++, offset += BITS_PER_LONG) { unsigned long mask = *dirty_bitmap_buffer++; atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; if (!mask) continue; mask &= atomic_long_fetch_andnot(mask, p); /* * mask contains the bits that really have been cleared. This * never includes any bits beyond the length of the memslot (if * the length is not aligned to 64 pages), therefore it is not * a problem if userspace sets them in log->dirty_bitmap. */ if (mask) { flush = true; kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } } KVM_MMU_UNLOCK(kvm); if (flush) kvm_flush_remote_tlbs_memslot(kvm, memslot); return 0; } static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) { int r; mutex_lock(&kvm->slots_lock); r = kvm_clear_dirty_log_protect(kvm, log); mutex_unlock(&kvm->slots_lock); return r; } #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES static u64 kvm_supported_mem_attributes(struct kvm *kvm) { if (!kvm || kvm_arch_has_private_mem(kvm)) return KVM_MEMORY_ATTRIBUTE_PRIVATE; return 0; } /* * Returns true if _all_ gfns in the range [@start, @end) have attributes * such that the bits in @mask match @attrs. */ bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, unsigned long mask, unsigned long attrs) { XA_STATE(xas, &kvm->mem_attr_array, start); unsigned long index; void *entry; mask &= kvm_supported_mem_attributes(kvm); if (attrs & ~mask) return false; if (end == start + 1) return (kvm_get_memory_attributes(kvm, start) & mask) == attrs; guard(rcu)(); if (!attrs) return !xas_find(&xas, end - 1); for (index = start; index < end; index++) { do { entry = xas_next(&xas); } while (xas_retry(&xas, entry)); if (xas.xa_index != index || (xa_to_value(entry) & mask) != attrs) return false; } return true; } static __always_inline void kvm_handle_gfn_range(struct kvm *kvm, struct kvm_mmu_notifier_range *range) { struct kvm_gfn_range gfn_range; struct kvm_memory_slot *slot; struct kvm_memslots *slots; struct kvm_memslot_iter iter; bool found_memslot = false; bool ret = false; int i; gfn_range.arg = range->arg; gfn_range.may_block = range->may_block; /* * If/when KVM supports more attributes beyond private .vs shared, this * _could_ set KVM_FILTER_{SHARED,PRIVATE} appropriately if the entire target * range already has the desired private vs. shared state (it's unclear * if that is a net win). For now, KVM reaches this point if and only * if the private flag is being toggled, i.e. all mappings are in play. */ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) { slot = iter.slot; gfn_range.slot = slot; gfn_range.start = max(range->start, slot->base_gfn); gfn_range.end = min(range->end, slot->base_gfn + slot->npages); if (gfn_range.start >= gfn_range.end) continue; if (!found_memslot) { found_memslot = true; KVM_MMU_LOCK(kvm); if (!IS_KVM_NULL_FN(range->on_lock)) range->on_lock(kvm); } ret |= range->handler(kvm, &gfn_range); } } if (range->flush_on_ret && ret) kvm_flush_remote_tlbs(kvm); if (found_memslot) KVM_MMU_UNLOCK(kvm); } static bool kvm_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { /* * Unconditionally add the range to the invalidation set, regardless of * whether or not the arch callback actually needs to zap SPTEs. E.g. * if KVM supports RWX attributes in the future and the attributes are * going from R=>RW, zapping isn't strictly necessary. Unconditionally * adding the range allows KVM to require that MMU invalidations add at * least one range between begin() and end(), e.g. allows KVM to detect * bugs where the add() is missed. Relaxing the rule *might* be safe, * but it's not obvious that allowing new mappings while the attributes * are in flux is desirable or worth the complexity. */ kvm_mmu_invalidate_range_add(kvm, range->start, range->end); return kvm_arch_pre_set_memory_attributes(kvm, range); } /* Set @attributes for the gfn range [@start, @end). */ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, unsigned long attributes) { struct kvm_mmu_notifier_range pre_set_range = { .start = start, .end = end, .arg.attributes = attributes, .handler = kvm_pre_set_memory_attributes, .on_lock = kvm_mmu_invalidate_begin, .flush_on_ret = true, .may_block = true, }; struct kvm_mmu_notifier_range post_set_range = { .start = start, .end = end, .arg.attributes = attributes, .handler = kvm_arch_post_set_memory_attributes, .on_lock = kvm_mmu_invalidate_end, .may_block = true, }; unsigned long i; void *entry; int r = 0; entry = attributes ? xa_mk_value(attributes) : NULL; mutex_lock(&kvm->slots_lock); /* Nothing to do if the entire range as the desired attributes. */ if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes)) goto out_unlock; /* * Reserve memory ahead of time to avoid having to deal with failures * partway through setting the new attributes. */ for (i = start; i < end; i++) { r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT); if (r) goto out_unlock; } kvm_handle_gfn_range(kvm, &pre_set_range); for (i = start; i < end; i++) { r = xa_err(xa_store(&kvm->mem_attr_array, i, entry, GFP_KERNEL_ACCOUNT)); KVM_BUG_ON(r, kvm); } kvm_handle_gfn_range(kvm, &post_set_range); out_unlock: mutex_unlock(&kvm->slots_lock); return r; } static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, struct kvm_memory_attributes *attrs) { gfn_t start, end; /* flags is currently not used. */ if (attrs->flags) return -EINVAL; if (attrs->attributes & ~kvm_supported_mem_attributes(kvm)) return -EINVAL; if (attrs->size == 0 || attrs->address + attrs->size < attrs->address) return -EINVAL; if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size)) return -EINVAL; start = attrs->address >> PAGE_SHIFT; end = (attrs->address + attrs->size) >> PAGE_SHIFT; /* * xarray tracks data using "unsigned long", and as a result so does * KVM. For simplicity, supports generic attributes only on 64-bit * architectures. */ BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long)); return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes); } #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { return __gfn_to_memslot(kvm_memslots(kvm), gfn); } EXPORT_SYMBOL_GPL(gfn_to_memslot); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); u64 gen = slots->generation; struct kvm_memory_slot *slot; /* * This also protects against using a memslot from a different address space, * since different address spaces have different generation numbers. */ if (unlikely(gen != vcpu->last_used_slot_gen)) { vcpu->last_used_slot = NULL; vcpu->last_used_slot_gen = gen; } slot = try_get_memslot(vcpu->last_used_slot, gfn); if (slot) return slot; /* * Fall back to searching all memslots. We purposely use * search_memslots() instead of __gfn_to_memslot() to avoid * thrashing the VM-wide last_used_slot in kvm_memslots. */ slot = search_memslots(slots, gfn, false); if (slot) { vcpu->last_used_slot = slot; return slot; } return NULL; } bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); return kvm_is_visible_memslot(memslot); } EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return kvm_is_visible_memslot(memslot); } EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) { struct vm_area_struct *vma; unsigned long addr, size; size = PAGE_SIZE; addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL); if (kvm_is_error_hva(addr)) return PAGE_SIZE; mmap_read_lock(current->mm); vma = find_vma(current->mm, addr); if (!vma) goto out; size = vma_kernel_pagesize(vma); out: mmap_read_unlock(current->mm); return size; } static bool memslot_is_readonly(const struct kvm_memory_slot *slot) { return slot->flags & KVM_MEM_READONLY; } static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages, bool write) { if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return KVM_HVA_ERR_BAD; if (memslot_is_readonly(slot) && write) return KVM_HVA_ERR_RO_BAD; if (nr_pages) *nr_pages = slot->npages - (gfn - slot->base_gfn); return __gfn_to_hva_memslot(slot, gfn); } static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages) { return __gfn_to_hva_many(slot, gfn, nr_pages, true); } unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { return gfn_to_hva_many(slot, gfn, NULL); } EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); } EXPORT_SYMBOL_GPL(gfn_to_hva); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) { return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); } EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); /* * Return the hva of a @gfn and the R/W attribute if possible. * * @slot: the kvm_memory_slot which contains @gfn * @gfn: the gfn to be translated * @writable: used to return the read/write attribute of the @slot if the hva * is valid and @writable is not NULL */ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, bool *writable) { unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); if (!kvm_is_error_hva(hva) && writable) *writable = !memslot_is_readonly(slot); return hva; } unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); return gfn_to_hva_memslot_prot(slot, gfn, writable); } unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return gfn_to_hva_memslot_prot(slot, gfn, writable); } static bool kvm_is_ad_tracked_page(struct page *page) { /* * Per page-flags.h, pages tagged PG_reserved "should in general not be * touched (e.g. set dirty) except by its owner". */ return !PageReserved(page); } static void kvm_set_page_dirty(struct page *page) { if (kvm_is_ad_tracked_page(page)) SetPageDirty(page); } static void kvm_set_page_accessed(struct page *page) { if (kvm_is_ad_tracked_page(page)) mark_page_accessed(page); } void kvm_release_page_clean(struct page *page) { if (!page) return; kvm_set_page_accessed(page); put_page(page); } EXPORT_SYMBOL_GPL(kvm_release_page_clean); void kvm_release_page_dirty(struct page *page) { if (!page) return; kvm_set_page_dirty(page); kvm_release_page_clean(page); } EXPORT_SYMBOL_GPL(kvm_release_page_dirty); static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, struct follow_pfnmap_args *map, bool writable) { kvm_pfn_t pfn; WARN_ON_ONCE(!!page == !!map); if (kfp->map_writable) *kfp->map_writable = writable; if (map) pfn = map->pfn; else pfn = page_to_pfn(page); *kfp->refcounted_page = page; return pfn; } /* * The fast path to get the writable pfn which will be stored in @pfn, * true indicates success, otherwise false is returned. */ static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { struct page *page; bool r; /* * Try the fast-only path when the caller wants to pin/get the page for * writing. If the caller only wants to read the page, KVM must go * down the full, slow path in order to avoid racing an operation that * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing * at the old, read-only page while mm/ points at a new, writable page. */ if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable)) return false; if (kfp->pin) r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1; else r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page); if (r) { *pfn = kvm_resolve_pfn(kfp, page, NULL, true); return true; } return false; } /* * The slow path to get the pfn of the specified host virtual address, * 1 indicates success, -errno is returned if error is detected. */ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { /* * When a VCPU accesses a page that is not mapped into the secondary * MMU, we lookup the page using GUP to map it, so the guest VCPU can * make progress. We always want to honor NUMA hinting faults in that * case, because GUP usage corresponds to memory accesses from the VCPU. * Otherwise, we'd not trigger NUMA hinting faults once a page is * mapped into the secondary MMU and gets accessed by a VCPU. * * Note that get_user_page_fast_only() and FOLL_WRITE for now * implicitly honor NUMA hinting faults and don't need this flag. */ unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT | kfp->flags; struct page *page, *wpage; int npages; if (kfp->pin) npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags); else npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); if (npages != 1) return npages; /* * Pinning is mutually exclusive with opportunistically mapping a read * fault as writable, as KVM should never pin pages when mapping memory * into the guest (pinning is only for direct accesses from KVM). */ if (WARN_ON_ONCE(kfp->map_writable && kfp->pin)) goto out; /* map read fault as writable if possible */ if (!(flags & FOLL_WRITE) && kfp->map_writable && get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { put_page(page); page = wpage; flags |= FOLL_WRITE; } out: *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE); return npages; } static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) { if (unlikely(!(vma->vm_flags & VM_READ))) return false; if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) return false; return true; } static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn) { struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva }; bool write_fault = kfp->flags & FOLL_WRITE; int r; /* * Remapped memory cannot be pinned in any meaningful sense. Bail if * the caller wants to pin the page, i.e. access the page outside of * MMU notifier protection, and unsafe umappings are disallowed. */ if (kfp->pin && !allow_unsafe_mappings) return -EINVAL; r = follow_pfnmap_start(&args); if (r) { /* * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does * not call the fault handler, so do it here. */ bool unlocked = false; r = fixup_user_fault(current->mm, kfp->hva, (write_fault ? FAULT_FLAG_WRITE : 0), &unlocked); if (unlocked) return -EAGAIN; if (r) return r; r = follow_pfnmap_start(&args); if (r) return r; } if (write_fault && !args.writable) { *p_pfn = KVM_PFN_ERR_RO_FAULT; goto out; } *p_pfn = kvm_resolve_pfn(kfp, NULL, &args, args.writable); out: follow_pfnmap_end(&args); return r; } kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp) { struct vm_area_struct *vma; kvm_pfn_t pfn; int npages, r; might_sleep(); if (WARN_ON_ONCE(!kfp->refcounted_page)) return KVM_PFN_ERR_FAULT; if (hva_to_pfn_fast(kfp, &pfn)) return pfn; npages = hva_to_pfn_slow(kfp, &pfn); if (npages == 1) return pfn; if (npages == -EINTR || npages == -EAGAIN) return KVM_PFN_ERR_SIGPENDING; if (npages == -EHWPOISON) return KVM_PFN_ERR_HWPOISON; mmap_read_lock(current->mm); retry: vma = vma_lookup(current->mm, kfp->hva); if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { r = hva_to_pfn_remapped(vma, kfp, &pfn); if (r == -EAGAIN) goto retry; if (r < 0) pfn = KVM_PFN_ERR_FAULT; } else { if ((kfp->flags & FOLL_NOWAIT) && vma_is_valid(vma, kfp->flags & FOLL_WRITE)) pfn = KVM_PFN_ERR_NEEDS_IO; else pfn = KVM_PFN_ERR_FAULT; } mmap_read_unlock(current->mm); return pfn; } static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp) { kfp->hva = __gfn_to_hva_many(kfp->slot, kfp->gfn, NULL, kfp->flags & FOLL_WRITE); if (kfp->hva == KVM_HVA_ERR_RO_BAD) return KVM_PFN_ERR_RO_FAULT; if (kvm_is_error_hva(kfp->hva)) return KVM_PFN_NOSLOT; if (memslot_is_readonly(kfp->slot) && kfp->map_writable) { *kfp->map_writable = false; kfp->map_writable = NULL; } return hva_to_pfn(kfp); } kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, unsigned int foll, bool *writable, struct page **refcounted_page) { struct kvm_follow_pfn kfp = { .slot = slot, .gfn = gfn, .flags = foll, .map_writable = writable, .refcounted_page = refcounted_page, }; if (WARN_ON_ONCE(!writable || !refcounted_page)) return KVM_PFN_ERR_FAULT; *writable = false; *refcounted_page = NULL; return kvm_follow_pfn(&kfp); } EXPORT_SYMBOL_GPL(__kvm_faultin_pfn); int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages) { unsigned long addr; gfn_t entry = 0; addr = gfn_to_hva_many(slot, gfn, &entry); if (kvm_is_error_hva(addr)) return -1; if (entry < nr_pages) return 0; return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } EXPORT_SYMBOL_GPL(kvm_prefetch_pages); /* * Don't use this API unless you are absolutely, positively certain that KVM * needs to get a struct page, e.g. to pin the page for firmware DMA. * * FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate * its refcount. */ struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write) { struct page *refcounted_page = NULL; struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(kvm, gfn), .gfn = gfn, .flags = write ? FOLL_WRITE : 0, .refcounted_page = &refcounted_page, }; (void)kvm_follow_pfn(&kfp); return refcounted_page; } EXPORT_SYMBOL_GPL(__gfn_to_page); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, bool writable) { struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(vcpu->kvm, gfn), .gfn = gfn, .flags = writable ? FOLL_WRITE : 0, .refcounted_page = &map->pinned_page, .pin = true, }; map->pinned_page = NULL; map->page = NULL; map->hva = NULL; map->gfn = gfn; map->writable = writable; map->pfn = kvm_follow_pfn(&kfp); if (is_error_noslot_pfn(map->pfn)) return -EINVAL; if (pfn_valid(map->pfn)) { map->page = pfn_to_page(map->pfn); map->hva = kmap(map->page); #ifdef CONFIG_HAS_IOMEM } else { map->hva = memremap(pfn_to_hpa(map->pfn), PAGE_SIZE, MEMREMAP_WB); #endif } return map->hva ? 0 : -EFAULT; } EXPORT_SYMBOL_GPL(__kvm_vcpu_map); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) { if (!map->hva) return; if (map->page) kunmap(map->page); #ifdef CONFIG_HAS_IOMEM else memunmap(map->hva); #endif if (map->writable) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); if (map->pinned_page) { if (map->writable) kvm_set_page_dirty(map->pinned_page); kvm_set_page_accessed(map->pinned_page); unpin_user_page(map->pinned_page); } map->hva = NULL; map->page = NULL; map->pinned_page = NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); static int next_segment(unsigned long len, int offset) { if (len > PAGE_SIZE - offset) return PAGE_SIZE - offset; else return len; } /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */ static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, int len) { int r; unsigned long addr; if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) return -EFAULT; addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; r = __copy_from_user(data, (void __user *)addr + offset, len); if (r) return -EFAULT; return 0; } int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len) { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); return __kvm_read_guest_page(slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_read_guest_page); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return __kvm_read_guest_page(slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; data += seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_read_guest); int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; data += seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, unsigned long len) { int r; unsigned long addr; if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) return -EFAULT; addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; pagefault_disable(); r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); pagefault_enable(); if (r) return -EFAULT; return 0; } int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); int offset = offset_in_page(gpa); return __kvm_read_guest_atomic(slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); /* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */ static int __kvm_write_guest_page(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn, const void *data, int offset, int len) { int r; unsigned long addr; if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) return -EFAULT; addr = gfn_to_hva_memslot(memslot, gfn); if (kvm_is_error_hva(addr)) return -EFAULT; r = __copy_to_user((void __user *)addr + offset, data, len); if (r) return -EFAULT; mark_page_dirty_in_slot(kvm, memslot, gfn); return 0; } int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len) { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_write_guest_page); int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data, int offset, int len) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; data += seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_write_guest); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; data += seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) { int offset = offset_in_page(gpa); gfn_t start_gfn = gpa >> PAGE_SHIFT; gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; gfn_t nr_pages_needed = end_gfn - start_gfn + 1; gfn_t nr_pages_avail; /* Update ghc->generation before performing any error checks. */ ghc->generation = slots->generation; if (start_gfn > end_gfn) { ghc->hva = KVM_HVA_ERR_BAD; return -EINVAL; } /* * If the requested region crosses two memslots, we still * verify that the entire region is valid here. */ for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) { ghc->memslot = __gfn_to_memslot(slots, start_gfn); ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, &nr_pages_avail); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; } /* Use the slow path for cross page reads and writes. */ if (nr_pages_needed == 1) ghc->hva += offset; else ghc->memslot = NULL; ghc->gpa = gpa; ghc->len = len; return 0; } int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); } EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); int r; gpa_t gpa = ghc->gpa + offset; if (WARN_ON_ONCE(len + offset > ghc->len)) return -EINVAL; if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) return -EFAULT; } if (kvm_is_error_hva(ghc->hva)) return -EFAULT; if (unlikely(!ghc->memslot)) return kvm_write_guest(kvm, gpa, data, len); r = __copy_to_user((void __user *)ghc->hva + offset, data, len); if (r) return -EFAULT; mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT); return 0; } EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len) { return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); } EXPORT_SYMBOL_GPL(kvm_write_guest_cached); int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); int r; gpa_t gpa = ghc->gpa + offset; if (WARN_ON_ONCE(len + offset > ghc->len)) return -EINVAL; if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) return -EFAULT; } if (kvm_is_error_hva(ghc->hva)) return -EFAULT; if (unlikely(!ghc->memslot)) return kvm_read_guest(kvm, gpa, data, len); r = __copy_from_user(data, (void __user *)ghc->hva + offset, len); if (r) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len) { return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); } EXPORT_SYMBOL_GPL(kvm_read_guest_cached); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) { const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_clear_guest); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); #ifdef CONFIG_HAVE_KVM_DIRTY_RING if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm)) return; WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm)); #endif if (memslot && kvm_slot_dirty_track_enabled(memslot)) { unsigned long rel_gfn = gfn - memslot->base_gfn; u32 slot = (memslot->as_id << 16) | memslot->id; if (kvm->dirty_ring_size && vcpu) kvm_dirty_ring_push(vcpu, slot, rel_gfn); else if (memslot->dirty_bitmap) set_bit_le(rel_gfn, memslot->dirty_bitmap); } } EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot); void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot; memslot = gfn_to_memslot(kvm, gfn); mark_page_dirty_in_slot(kvm, memslot, gfn); } EXPORT_SYMBOL_GPL(mark_page_dirty); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_memory_slot *memslot; memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn); } EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); void kvm_sigset_activate(struct kvm_vcpu *vcpu) { if (!vcpu->sigset_active) return; /* * This does a lockless modification of ->real_blocked, which is fine * because, only current can change ->real_blocked and all readers of * ->real_blocked don't care as long ->real_blocked is always a subset * of ->blocked. */ sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked); } void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) { if (!vcpu->sigset_active) return; sigprocmask(SIG_SETMASK, &current->real_blocked, NULL); sigemptyset(&current->real_blocked); } static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) { unsigned int old, val, grow, grow_start; old = val = vcpu->halt_poll_ns; grow_start = READ_ONCE(halt_poll_ns_grow_start); grow = READ_ONCE(halt_poll_ns_grow); if (!grow) goto out; val *= grow; if (val < grow_start) val = grow_start; vcpu->halt_poll_ns = val; out: trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); } static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) { unsigned int old, val, shrink, grow_start; old = val = vcpu->halt_poll_ns; shrink = READ_ONCE(halt_poll_ns_shrink); grow_start = READ_ONCE(halt_poll_ns_grow_start); if (shrink == 0) val = 0; else val /= shrink; if (val < grow_start) val = 0; vcpu->halt_poll_ns = val; trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); } static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) { int ret = -EINTR; int idx = srcu_read_lock(&vcpu->kvm->srcu); if (kvm_arch_vcpu_runnable(vcpu)) goto out; if (kvm_cpu_has_pending_timer(vcpu)) goto out; if (signal_pending(current)) goto out; if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu)) goto out; ret = 0; out: srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } /* * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is * pending. This is mostly used when halting a vCPU, but may also be used * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI. */ bool kvm_vcpu_block(struct kvm_vcpu *vcpu) { struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); bool waited = false; vcpu->stat.generic.blocking = 1; preempt_disable(); kvm_arch_vcpu_blocking(vcpu); prepare_to_rcuwait(wait); preempt_enable(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (kvm_vcpu_check_block(vcpu) < 0) break; waited = true; schedule(); } preempt_disable(); finish_rcuwait(wait); kvm_arch_vcpu_unblocking(vcpu); preempt_enable(); vcpu->stat.generic.blocking = 0; return waited; } static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start, ktime_t end, bool success) { struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic; u64 poll_ns = ktime_to_ns(ktime_sub(end, start)); ++vcpu->stat.generic.halt_attempted_poll; if (success) { ++vcpu->stat.generic.halt_successful_poll; if (!vcpu_valid_wakeup(vcpu)) ++vcpu->stat.generic.halt_poll_invalid; stats->halt_poll_success_ns += poll_ns; KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns); } else { stats->halt_poll_fail_ns += poll_ns; KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns); } } static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; if (kvm->override_halt_poll_ns) { /* * Ensure kvm->max_halt_poll_ns is not read before * kvm->override_halt_poll_ns. * * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL. */ smp_rmb(); return READ_ONCE(kvm->max_halt_poll_ns); } return READ_ONCE(halt_poll_ns); } /* * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt * polling is enabled, busy wait for a short time before blocking to avoid the * expensive block+unblock sequence if a wake event arrives soon after the vCPU * is halted. */ void kvm_vcpu_halt(struct kvm_vcpu *vcpu) { unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu); bool halt_poll_allowed = !kvm_arch_no_poll(vcpu); ktime_t start, cur, poll_end; bool waited = false; bool do_halt_poll; u64 halt_ns; if (vcpu->halt_poll_ns > max_halt_poll_ns) vcpu->halt_poll_ns = max_halt_poll_ns; do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns; start = cur = poll_end = ktime_get(); if (do_halt_poll) { ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns); do { if (kvm_vcpu_check_block(vcpu) < 0) goto out; cpu_relax(); poll_end = cur = ktime_get(); } while (kvm_vcpu_can_poll(cur, stop)); } waited = kvm_vcpu_block(vcpu); cur = ktime_get(); if (waited) { vcpu->stat.generic.halt_wait_ns += ktime_to_ns(cur) - ktime_to_ns(poll_end); KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist, ktime_to_ns(cur) - ktime_to_ns(poll_end)); } out: /* The total time the vCPU was "halted", including polling time. */ halt_ns = ktime_to_ns(cur) - ktime_to_ns(start); /* * Note, halt-polling is considered successful so long as the vCPU was * never actually scheduled out, i.e. even if the wake event arrived * after of the halt-polling loop itself, but before the full wait. */ if (do_halt_poll) update_halt_poll_stats(vcpu, start, poll_end, !waited); if (halt_poll_allowed) { /* Recompute the max halt poll time in case it changed. */ max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu); if (!vcpu_valid_wakeup(vcpu)) { shrink_halt_poll_ns(vcpu); } else if (max_halt_poll_ns) { if (halt_ns <= vcpu->halt_poll_ns) ; /* we had a long block, shrink polling */ else if (vcpu->halt_poll_ns && halt_ns > max_halt_poll_ns) shrink_halt_poll_ns(vcpu); /* we had a short halt and our poll time is too small */ else if (vcpu->halt_poll_ns < max_halt_poll_ns && halt_ns < max_halt_poll_ns) grow_halt_poll_ns(vcpu); } else { vcpu->halt_poll_ns = 0; } } trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu)); } EXPORT_SYMBOL_GPL(kvm_vcpu_halt); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { if (__kvm_vcpu_wake_up(vcpu)) { WRITE_ONCE(vcpu->ready, true); ++vcpu->stat.generic.halt_wakeup; return true; } return false; } EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); #ifndef CONFIG_S390 /* * Kick a sleeping VCPU, or a gues